diff --git a/Makefile b/Makefile index 9130b3bb..77d210d9 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ build-linux: @GOOS=linux GOARCH=amd64 go build --mod readonly $(BUILD_FLAGS) -o ./build/horcrux ./cmd/horcrux test: - @go test -timeout 20m -mod readonly -v ./... + @go test -race -timeout 20m -mod readonly -v ./... test-short: @go test -mod readonly -run TestDownedSigners2of3 -v ./... diff --git a/client/address.go b/client/address.go new file mode 100644 index 00000000..94315824 --- /dev/null +++ b/client/address.go @@ -0,0 +1,30 @@ +package client + +import ( + "fmt" + "net/url" + "strings" +) + +func SanitizeAddress(address string) (string, error) { + u, err := url.Parse(address) + if err != nil { + return "", fmt.Errorf("error parsing peer URL: %w", err) + } + + return u.Host, nil +} + +func MultiAddress(addresses []string) (string, error) { + grpcAddresses := make([]string, len(addresses)) + + for i, addr := range addresses { + peerAddr, err := SanitizeAddress(addr) + if err != nil { + return "", err + } + grpcAddresses[i] = peerAddr + } + + return fmt.Sprintf("multi:///%s", strings.Join(grpcAddresses, ",")), nil +} diff --git a/client/address_test.go b/client/address_test.go new file mode 100644 index 00000000..18b2e20f --- /dev/null +++ b/client/address_test.go @@ -0,0 +1,55 @@ +package client_test + +import ( + "testing" + + "github.com/strangelove-ventures/horcrux/client" + "github.com/stretchr/testify/require" +) + +func TestLeaderElectionMultiAddressDomain(t *testing.T) { + addresses := []string{ + "tcp://signer-1:2222", + "tcp://signer-2:2222", + "tcp://signer-3:2222", + } + + multiAddress, err := client.MultiAddress(addresses) + require.NoError(t, err, "failed to assemble fqdn multi address") + + require.Equal(t, "multi:///signer-1:2222,signer-2:2222,signer-3:2222", multiAddress) +} + +func TestLeaderElectionMultiAddressIPv4(t *testing.T) { + addresses := []string{ + "tcp://10.0.0.1:2222", + "tcp://10.0.0.2:2222", + "tcp://10.0.0.3:2222", + } + + multiAddress, err := client.MultiAddress(addresses) + require.NoError(t, err, "failed to assemble ipv4 multi address") + + require.Equal(t, "multi:///10.0.0.1:2222,10.0.0.2:2222,10.0.0.3:2222", multiAddress) +} + +func TestLeaderElectionMultiAddressIPv6(t *testing.T) { + addresses := []string{ + "tcp://[2001:db8:3333:4444:5555:6666:7777:8888]:2222", + "tcp://[::]:2222", + "tcp://[::1234:5678]:2222", + "tcp://[2001:db8::]:2222", + "tcp://[2001:db8::1234:5678]:2222", + } + + multiAddress, err := client.MultiAddress(addresses) + require.NoError(t, err, "failed to assemble ipv6 multi address") + + const expected = "multi:///" + + "[2001:db8:3333:4444:5555:6666:7777:8888]:2222" + + ",[::]:2222,[::1234:5678]:2222" + + ",[2001:db8::]:2222" + + ",[2001:db8::1234:5678]:2222" + + require.Equal(t, expected, multiAddress) +} diff --git a/cmd/horcrux/cmd/config.go b/cmd/horcrux/cmd/config.go index 6d2fa07e..776d6609 100644 --- a/cmd/horcrux/cmd/config.go +++ b/cmd/horcrux/cmd/config.go @@ -12,6 +12,7 @@ import ( "time" "github.com/spf13/cobra" + "github.com/strangelove-ventures/horcrux/client" "github.com/strangelove-ventures/horcrux/signer" tmlog "github.com/tendermint/tendermint/libs/log" "gopkg.in/yaml.v2" @@ -75,7 +76,9 @@ func initCmd() *cobra.Command { if keyFileFlag != "" { keyFile = &keyFileFlag } + debugAddr, _ := cmdFlags.GetString("debug-addr") if cs { + // Cosigner Config p, _ := cmdFlags.GetString("peers") threshold, _ := cmdFlags.GetUint8("threshold") timeout, _ := cmdFlags.GetString("timeout") @@ -111,11 +114,13 @@ func 
initCmd() *cobra.Command { Timeout: timeout, }, ChainNodes: cn, + DebugAddr: debugAddr, } if err = validateCosignerConfig(cfg); err != nil { return err } } else { + // Single Signer Config if len(cn) == 0 { return fmt.Errorf("must input at least one node") } @@ -123,6 +128,7 @@ func initCmd() *cobra.Command { PrivValKeyFile: keyFile, ChainID: cid, ChainNodes: cn, + DebugAddr: debugAddr, } if err = validateSingleSignerConfig(cfg); err != nil { return err } @@ -163,6 +169,7 @@ func initCmd() *cobra.Command { "(i.e. \"tcp://node-1:2222|2,tcp://node-2:2222|3\")") cmd.Flags().Uint8P("threshold", "t", 0, "indicate number of signatures required for threshold signature") cmd.Flags().StringP("listen", "l", "", "listen address of the signer") + cmd.Flags().StringP("debug-addr", "d", "", "listen address for Debug and Prometheus metrics in the format localhost:8543") cmd.Flags().StringP("keyfile", "k", "", "priv val key file path (full key for single signer, or key share for cosigner)") cmd.Flags().String("timeout", "1500ms", "configure cosigner rpc server timeout value, \n"+ @@ -191,9 +198,14 @@ func validateCosignerConfig(cfg DiskConfig) error { if cfg.CosignerConfig == nil { return fmt.Errorf("cosigner config can't be empty") } - if float32(len(cfg.CosignerConfig.Peers))/float32(2) >= float32(cfg.CosignerConfig.Threshold) { - return fmt.Errorf("the threshold, t = (%d) must be greater than, 'peers/2' = (%.1f)", - cfg.CosignerConfig.Threshold, float32(len(cfg.CosignerConfig.Peers))/2) + + if cfg.CosignerConfig.Threshold <= cfg.CosignerConfig.Shares/2 { + return fmt.Errorf("threshold (%d) must be greater than number of shares (%d) / 2", + cfg.CosignerConfig.Threshold, cfg.CosignerConfig.Shares) + } + if cfg.CosignerConfig.Shares < cfg.CosignerConfig.Threshold { + return fmt.Errorf("number of shares (%d) must be greater than or equal to threshold (%d)", + cfg.CosignerConfig.Shares, cfg.CosignerConfig.Threshold) } _, err := time.ParseDuration(cfg.CosignerConfig.Timeout) @@ -329,6 +341,7 @@ func addPeersCmd() *cobra.Command { return errors.New("no new peer nodes in args") } diff = append(config.Config.CosignerConfig.Peers, diff...) + config.Config.CosignerConfig.Shares = len(diff) + 1 if err := validateCosignerPeers(diff, config.Config.CosignerConfig.Shares); err != nil { return err } @@ -372,6 +385,8 @@ func removePeersCmd() *cobra.Command { if len(diff) == 0 { return errors.New("cannot remove all peer nodes from config, please leave at least one") } + + config.Config.CosignerConfig.Shares = len(diff) + 1 // If none of the peer nodes in the args are listed in the config, just continue // without throwing an error, as the peer nodes in the config remain untouched.
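+ // Shares is recalculated above as the remaining peers plus the local node, so validation runs against the new cluster size.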
if err := validateCosignerPeers(diff, config.Config.CosignerConfig.Shares); err != nil { @@ -487,6 +502,7 @@ type DiskConfig struct { ChainID string `json:"chain-id" yaml:"chain-id"` CosignerConfig *CosignerConfig `json:"cosigner,omitempty" yaml:"cosigner,omitempty"` ChainNodes []ChainNode `json:"chain-nodes,omitempty" yaml:"chain-nodes,omitempty"` + DebugAddr string `json:"debug-addr,omitempty" yaml:"debug-addr,omitempty"` } func (c *DiskConfig) Nodes() []signer.NodeConfig { @@ -544,6 +560,15 @@ type CosignerConfig struct { SignerType string `json:"signer-type" yaml:"signer-type"` } +func (cfg *CosignerConfig) LeaderElectMultiAddress() (string, error) { + addresses := make([]string, 1+len(cfg.Peers)) + addresses[0] = cfg.P2PListen + for i, peer := range cfg.Peers { + addresses[i+1] = peer.P2PAddr + } + return client.MultiAddress(addresses) +} + func (c *DiskConfig) CosignerPeers() (out []signer.CosignerConfig) { for _, p := range c.CosignerConfig.Peers { out = append(out, signer.CosignerConfig{ID: p.ShareID, Address: p.P2PAddr}) } @@ -599,11 +624,12 @@ func validateCosignerPeers(peers []CosignerPeer, shares uint8) error { } } - // Check that no more than {num-shares}-1 peers are in the peer list, assuming + // Check that exactly {num-shares}-1 peers are in the peer list, assuming // the remaining peer ID is the ID the local node is configured with. - if len(peers) == int(shares) { - return fmt.Errorf("too many peers (%v+local node = %v) for the specified number of key shares (%v)", - len(peers), len(peers)+1, shares) + + if len(peers) != int(shares)-1 { + return fmt.Errorf("incorrect number of peers. expected (%d shares - local node = %d peers)", + shares, shares-1) } return nil } diff --git a/cmd/horcrux/cmd/config_test.go b/cmd/horcrux/cmd/config_test.go index 660620f3..f4b62583 100644 --- a/cmd/horcrux/cmd/config_test.go +++ b/cmd/horcrux/cmd/config_test.go @@ -16,7 +16,7 @@ const ( ) func TestConfigInitCmd(t *testing.T) { - tmpHome := "/tmp/TestConfigInitCmd" + tmpHome := t.TempDir() tcs := []struct { name string home string @@ -85,9 +85,8 @@ func TestConfigInitCmd(t *testing.T) { t.Run(tc.name, func(t *testing.T) { tmpConfig := filepath.Join(tc.home, ".horcrux") - err := os.Setenv("HOME", tc.home) - require.NoError(t, err) - err = os.MkdirAll(tc.home, 0777) + t.Setenv("HOME", tc.home) + err := os.MkdirAll(tc.home, 0777) require.NoError(t, err) cmd := initCmd() @@ -120,24 +119,10 @@ func TestConfigInitCmd(t *testing.T) { } }) } - - t.Cleanup(func() { - files, err := filepath.Glob(tmpHome + "*") - require.NoError(t, err) - - for _, file := range files { - os.RemoveAll(file) - } - }) } func TestConfigChainIDSetCmd(t *testing.T) { - tmpHome := "/tmp/TestConfigChainIDSetCmd" - - err := os.Setenv("HOME", tmpHome) - require.NoError(t, err) - err = os.MkdirAll(tmpHome, 0777) - require.NoError(t, err) + t.Setenv("HOME", t.TempDir()) cmd := initCmd() cmd.SetOutput(io.Discard) @@ -150,7 +135,7 @@ func TestConfigChainIDSetCmd(t *testing.T) { "-l", "tcp://10.168.1.1:2222", "--timeout", "1500ms", }) - err = cmd.Execute() + err := cmd.Execute() require.NoError(t, err) tcs := []struct { @@ -185,19 +170,10 @@ func TestConfigChainIDSetCmd(t *testing.T) { } }) } - - t.Cleanup(func() { - os.RemoveAll(tmpHome) - }) } func TestConfigNodesAddAndRemove(t *testing.T) { - tmpHome := "/tmp/TestConfigNodesAddAndRemove" - - err := os.Setenv("HOME", tmpHome) - require.NoError(t, err) - err = os.MkdirAll(tmpHome, 0777) - require.NoError(t, err) + t.Setenv("HOME", t.TempDir()) cmd := initCmd() cmd.SetOutput(io.Discard) @@
-210,7 +186,7 @@ func TestConfigNodesAddAndRemove(t *testing.T) { "-l", "tcp://10.168.1.1:2222", "--timeout", "1500ms", }) - err = cmd.Execute() + err := cmd.Execute() require.NoError(t, err) tcs := []struct { @@ -331,19 +307,10 @@ func TestConfigNodesAddAndRemove(t *testing.T) { require.Equal(t, tc.expectNodes, config.Config.ChainNodes) }) } - - t.Cleanup(func() { - os.RemoveAll(tmpHome) - }) } func TestConfigPeersAddAndRemove(t *testing.T) { - tmpHome := "/tmp/TestConfigPeersAddAndRemove" - - err := os.Setenv("HOME", tmpHome) - require.NoError(t, err) - err = os.MkdirAll(tmpHome, 0777) - require.NoError(t, err) + t.Setenv("HOME", t.TempDir()) cmd := initCmd() cmd.SetOutput(io.Discard) @@ -352,11 +319,11 @@ func TestConfigPeersAddAndRemove(t *testing.T) { "tcp://10.168.0.1:1234", "-c", "-p", "tcp://10.168.1.2:2222|2,tcp://10.168.1.3:2222|3,tcp://10.168.1.4:2222|4", - "-t", "2", + "-t", "3", "-l", "tcp://10.168.1.1:2222", "--timeout", "1500ms", }) - err = cmd.Execute() + err := cmd.Execute() require.NoError(t, err) tcs := []struct { @@ -453,7 +420,7 @@ func TestConfigPeersAddAndRemove(t *testing.T) { { name: "add peer with ID out of range", cmd: addPeersCmd(), - args: []string{"tcp://10.168.1.5:2222|5"}, + args: []string{"tcp://10.168.1.5:2222|6"}, expectPeers: []CosignerPeer{ {ShareID: 2, P2PAddr: "tcp://10.168.1.2:2222"}, {ShareID: 3, P2PAddr: "tcp://10.168.1.3:2222"}, @@ -478,10 +445,6 @@ func TestConfigPeersAddAndRemove(t *testing.T) { require.Equal(t, tc.expectPeers, config.Config.CosignerConfig.Peers) }) } - - t.Cleanup(func() { - os.RemoveAll(tmpHome) - }) } func TestDiffSetChainNode(t *testing.T) { @@ -617,12 +580,7 @@ func TestDiffSetCosignerPeer(t *testing.T) { } func TestSetShares(t *testing.T) { - tmpHome := "/tmp/TestSetShares" - - err := os.Setenv("HOME", tmpHome) - require.NoError(t, err) - err = os.MkdirAll(tmpHome, 0777) - require.NoError(t, err) + t.Setenv("HOME", t.TempDir()) cmd := initCmd() cmd.SetOutput(io.Discard) @@ -635,7 +593,7 @@ func TestSetShares(t *testing.T) { "-l", "tcp://10.168.1.1:2222", "--timeout", "1500ms", }) - err = cmd.Execute() + err := cmd.Execute() require.NoError(t, err) tcs := []struct { @@ -646,20 +604,20 @@ func TestSetShares(t *testing.T) { }{ // Do NOT change the order of the test cases! 
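+ // The cases below run sequentially against the same config; a successful set changes the share count that later cases expect.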
{ name: "valid number of shares", - args: []string{"4"}, - expectShares: 4, + args: []string{"3"}, + expectShares: 3, expectErr: false, }, { name: "too few shares for number of peers", args: []string{"1"}, - expectShares: 4, + expectShares: 3, expectErr: true, }, { name: "invalid number of shares", args: []string{"-1"}, - expectShares: 4, + expectShares: 3, expectErr: true, }, } @@ -680,8 +638,4 @@ func TestSetShares(t *testing.T) { } }) } - - t.Cleanup(func() { - os.RemoveAll(tmpHome) - }) } diff --git a/cmd/horcrux/cmd/cosigner.go b/cmd/horcrux/cmd/cosigner.go index c0a27b05..7c4141cd 100644 --- a/cmd/horcrux/cmd/cosigner.go +++ b/cmd/horcrux/cmd/cosigner.go @@ -224,7 +224,9 @@ func StartCosignerCmd() *cobra.Command { } logger.Info("Signer", "address", pubkey.Address()) - services, err = signer.StartRemoteSigners(services, logger, config.Config.ChainID, pv, nodes) + go EnableDebugAndMetrics(cmd.Context()) + + services, err = signer.StartRemoteSigners(services, logger, cfg.ChainID, pv, cfg.Nodes) if err != nil { panic(err) } diff --git a/cmd/horcrux/cmd/key2shares.go b/cmd/horcrux/cmd/key2shares.go index 75adc88b..25226f10 100644 --- a/cmd/horcrux/cmd/key2shares.go +++ b/cmd/horcrux/cmd/key2shares.go @@ -36,10 +36,17 @@ func CreateCosignerSharesCmd() *cobra.Command { Args: validateCreateCosignerShares, Short: "Create cosigner shares", RunE: func(cmd *cobra.Command, args []string) (err error) { - threshold, _ := strconv.ParseInt(args[1], 10, 64) - numShares, _ := strconv.ParseInt(args[2], 10, 64) + threshold, shares := args[1], args[2] + t, err := strconv.ParseInt(threshold, 10, 64) + if err != nil { + return fmt.Errorf("error parsing threshold (%s): %w", threshold, err) + } + n, err := strconv.ParseInt(shares, 10, 64) + if err != nil { + return fmt.Errorf("error parsing shares (%s): %w", shares, err) + } - csKeys, err := signer.CreateCosignerSharesFromFile(args[0], threshold, numShares) + csKeys, err := signer.CreateCosignerSharesFromFile(args[0], t, n) if err != nil { return err } @@ -66,11 +73,21 @@ func validateCreateCosignerShares(cmd *cobra.Command, args []string) error { if !os.FileExists(args[0]) { return fmt.Errorf("priv_validator.json file(%s) doesn't exist", args[0]) } - if _, err := strconv.ParseInt(args[1], 10, 64); err != nil { - return fmt.Errorf("shards must be an integer got(%s)", args[1]) + threshold, shares := args[1], args[2] + t, err := strconv.ParseInt(threshold, 10, 64) + if err != nil { + return fmt.Errorf("error parsing threshold (%s): %w", threshold, err) + } + n, err := strconv.ParseInt(shares, 10, 64) + if err != nil { + return fmt.Errorf("error parsing shares (%s): %w", shares, err) + } + if t > n { + return fmt.Errorf("threshold cannot be greater than total shares, got [threshold](%d) > [shares](%d)", t, n) } - if _, err := strconv.ParseInt(args[2], 10, 64); err != nil { - return fmt.Errorf("threshold must be an integer got(%s)", args[2]) + if t <= n/2 { + return fmt.Errorf("threshold must be greater than total shares "+ + "divided by 2, got [threshold](%d) <= [shares](%d) / 2", t, n) } return nil } diff --git a/cmd/horcrux/cmd/key2shares_test.go b/cmd/horcrux/cmd/key2shares_test.go new file mode 100644 index 00000000..4853a163 --- /dev/null +++ b/cmd/horcrux/cmd/key2shares_test.go @@ -0,0 +1,71 @@ +package cmd + +import ( + "io" + "path/filepath" + "testing" + + "github.com/stretchr/testify/require" + "github.com/tendermint/tendermint/crypto/ed25519" + "github.com/tendermint/tendermint/privval" +) + +func TestKey2Shares(t *testing.T) { + tmp := t.TempDir() + 
+ privValidatorKeyFile := filepath.Join(tmp, "priv_validator_key.json") + privValidatorStateFile := filepath.Join(tmp, "priv_validator_state.json") + pv := privval.NewFilePV(ed25519.GenPrivKey(), privValidatorKeyFile, privValidatorStateFile) + pv.Save() + + tcs := []struct { + name string + args []string + expectErr bool + }{ + { + name: "valid threshold and shares", + args: []string{privValidatorKeyFile, "2", "3"}, + expectErr: false, + }, + { + name: "valid threshold and shares 2", + args: []string{privValidatorKeyFile, "3", "5"}, + expectErr: false, + }, + { + name: "threshold exactly half of shares", + args: []string{privValidatorKeyFile, "2", "4"}, + expectErr: true, + }, + { + name: "threshold less than half of shares", + args: []string{privValidatorKeyFile, "1", "3"}, + expectErr: true, + }, + { + name: "threshold exceeds shares", + args: []string{privValidatorKeyFile, "4", "3"}, + expectErr: true, + }, + { + name: "non-numeric threshold and shares", + args: []string{privValidatorKeyFile, "two", "three"}, + expectErr: true, + }, + } + + for _, tc := range tcs { + t.Run(tc.name, func(t *testing.T) { + cmd := CreateCosignerSharesCmd() + cmd.SetOutput(io.Discard) + cmd.SetArgs(tc.args) + err := cmd.Execute() + if tc.expectErr { + require.Error(t, err) + } else { + require.NoError(t, err) + } + }) + } +} diff --git a/cmd/horcrux/cmd/leader_election.go b/cmd/horcrux/cmd/leader_election.go index bacdd4e9..68163f6b 100644 --- a/cmd/horcrux/cmd/leader_election.go +++ b/cmd/horcrux/cmd/leader_election.go @@ -4,22 +4,20 @@ import ( "context" "fmt" "log" - "net" - "net/url" - "strings" "time" _ "github.com/Jille/grpc-multi-resolver" - "github.com/strangelove-ventures/horcrux/signer/proto" - grpc_retry "github.com/grpc-ecosystem/go-grpc-middleware/retry" "github.com/spf13/cobra" + "github.com/strangelove-ventures/horcrux/client" + "github.com/strangelove-ventures/horcrux/signer/proto" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" ) func init() { rootCmd.AddCommand(leaderElectionCmd) + rootCmd.AddCommand(getLeaderCmd) } var leaderElectionCmd = &cobra.Command{ @@ -47,32 +45,12 @@ horcrux elect 2 # elect specific leader`, grpc_retry.WithMax(5), } - var grpcAddresses []string - u, err := url.Parse(config.Config.CosignerConfig.P2PListen) + grpcAddress, err := config.Config.CosignerConfig.LeaderElectMultiAddress() if err != nil { - fmt.Printf("Error parsing peer URL: %v", err) - } else { - host, port, err := net.SplitHostPort(u.Host) - if err == nil { - grpcAddresses = append(grpcAddresses, fmt.Sprintf("%s:%s", host, port)) - } - } - - for _, peer := range config.Config.CosignerConfig.Peers { - u, err := url.Parse(peer.P2PAddr) - if err != nil { - fmt.Printf("Error parsing peer URL: %v", err) - } else { - host, port, err := net.SplitHostPort(u.Host) - if err == nil { - grpcAddresses = append(grpcAddresses, fmt.Sprintf("%s:%s", host, port)) - } - } + return err } - grpcAddress := fmt.Sprintf("multi:///%s", strings.Join(grpcAddresses, ",")) - - fmt.Println(grpcAddress) + fmt.Printf("Broadcasting to address: %s\n", grpcAddress) conn, err := grpc.Dial(grpcAddress, grpc.WithDefaultServiceConfig(serviceConfig), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithDefaultCallOptions(grpc.WaitForReady(true)), @@ -88,18 +66,76 @@ horcrux elect 2 # elect specific leader`, leaderID = args[0] } - context, cancelFunc := context.WithTimeout(context.Background(), 30*time.Second) + ctx, cancelFunc := context.WithTimeout(context.Background(), 30*time.Second) defer cancelFunc() 
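// Request the leadership transfer, then query the cluster for the resulting leader to confirm the election.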
grpcClient := proto.NewCosignerGRPCClient(conn) _, err = grpcClient.TransferLeadership( - context, + ctx, &proto.CosignerGRPCTransferLeadershipRequest{LeaderID: leaderID}, ) if err != nil { return err } + res, err := grpcClient.GetLeader(ctx, &proto.CosignerGRPCGetLeaderRequest{}) + if err != nil { + return err + } + + fmt.Printf("Leader election successful. New leader: %s\n", res.Leader) + + return nil + }, +} + +var getLeaderCmd = &cobra.Command{ + Use: "leader", + Short: "Get current raft leader", + Args: cobra.NoArgs, + Example: `horcrux leader`, + SilenceUsage: true, + RunE: func(cmd *cobra.Command, args []string) (err error) { + if config.Config.CosignerConfig == nil { + return fmt.Errorf("cosigner configuration is not present in config file") + } + + if len(config.Config.CosignerConfig.Peers) == 0 { + return fmt.Errorf("cosigner configuration has no peers") + } + + retryOpts := []grpc_retry.CallOption{ + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(100 * time.Millisecond)), + grpc_retry.WithMax(5), + } + + grpcAddress, err := client.SanitizeAddress(config.Config.CosignerConfig.P2PListen) + if err != nil { + return err + } + + fmt.Printf("Request address: %s\n", grpcAddress) + conn, err := grpc.Dial(grpcAddress, + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions(grpc.WaitForReady(true)), + grpc.WithUnaryInterceptor(grpc_retry.UnaryClientInterceptor(retryOpts...))) + if err != nil { + log.Fatalf("dialing failed: %v", err) + } + defer conn.Close() + + ctx, cancelFunc := context.WithTimeout(context.Background(), 30*time.Second) + defer cancelFunc() + + grpcClient := proto.NewCosignerGRPCClient(conn) + + res, err := grpcClient.GetLeader(ctx, &proto.CosignerGRPCGetLeaderRequest{}) + if err != nil { + return err + } + + fmt.Printf("Current leader: %s\n", res.Leader) + return nil }, } diff --git a/cmd/horcrux/cmd/metrics.go b/cmd/horcrux/cmd/metrics.go new file mode 100644 index 00000000..1380d4e0 --- /dev/null +++ b/cmd/horcrux/cmd/metrics.go @@ -0,0 +1,99 @@ +package cmd + +import ( + "context" + "errors" + "fmt" + "net/http" + "net/http/pprof" + "os" + "time" + + "github.com/armon/go-metrics" + gmprometheus "github.com/armon/go-metrics/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + tmlog "github.com/tendermint/tendermint/libs/log" +) + +func AddPrometheusMetrics(mux *http.ServeMux) { + logger := tmlog.NewTMLogger(tmlog.NewSyncWriter(os.Stdout)).With("module", "metrics") + + // Add metrics from raft's implementation of go-metrics + cfg := gmprometheus.DefaultPrometheusOpts + sink, err := gmprometheus.NewPrometheusSinkFrom(cfg) + if err != nil { + logger.Error("Could not configure Raft Metrics") + panic(err) + } + _, err = metrics.NewGlobal(metrics.DefaultConfig("horcrux"), sink) + if err != nil { + logger.Error("Could not add Raft Metrics") + panic(err) + } + + mux.Handle("/metrics", promhttp.Handler()) + logger.Info("Prometheus Metrics Listening", "address", config.Config.DebugAddr, "path", "/metrics") +} + +// EnableDebugAndMetrics - Initialization errors are not fatal, only logged +func EnableDebugAndMetrics(ctx context.Context) { + logger := tmlog.NewTMLogger(tmlog.NewSyncWriter(os.Stdout)).With("module", "debugserver") + + // Configure Shared Debug HTTP Server for pprof and prometheus + if len(config.Config.DebugAddr) == 0 { + logger.Info("debug-addr not defined; debug server disabled") + return + } + logger.Info("Debug Server Listening", "address", config.Config.DebugAddr) + + // Set up new mux identical to 
the default mux configuration in net/http/pprof. + mux := http.NewServeMux() + mux.HandleFunc("/debug/pprof/", pprof.Index) + mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline) + mux.HandleFunc("/debug/pprof/profile", pprof.Profile) + mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol) + mux.HandleFunc("/debug/pprof/trace", pprof.Trace) + + // And redirect the browser to the /debug/pprof root, + // so operators don't see a mysterious 404 page. + mux.Handle("/", http.RedirectHandler("/debug/pprof", http.StatusSeeOther)) + + // Add prometheus metrics + AddPrometheusMetrics(mux) + + // Configure Debug Server Network Parameters + srv := &http.Server{ + Handler: mux, + Addr: config.Config.DebugAddr, + ReadTimeout: 1 * time.Second, + WriteTimeout: 30 * time.Second, + IdleTimeout: 30 * time.Second, + ReadHeaderTimeout: 2 * time.Second, + } + + // Start Debug Server. + go func() { + if err := srv.ListenAndServe(); err != nil { + if errors.Is(err, http.ErrServerClosed) { + logger.Info("Debug Server Shutdown Complete") + return + } + logger.Error(fmt.Sprintf("Debug Endpoint failed to start: %+v", err)) + panic(err) + } + }() + + // Shutdown Debug Server on ctx request + go func() { + <-ctx.Done() + logger.Info("Gracefully Stopping Debug Server") + if err := srv.Shutdown(context.Background()); err != nil { + logger.Error("Error in Stopping Debug Server", "error", err) + logger.Info("Force Stopping Debug Server") + if err = srv.Close(); err != nil { + logger.Error("Error in Force Stopping Debug Server", "error", err) + } + } + }() + +} diff --git a/cmd/horcrux/cmd/state.go b/cmd/horcrux/cmd/state.go index 7f144bbd..b7b4c37c 100644 --- a/cmd/horcrux/cmd/state.go +++ b/cmd/horcrux/cmd/state.go @@ -102,6 +102,8 @@ func setStateCmd() *cobra.Command { return err } + fmt.Fprintf(cmd.OutOrStdout(), "Setting height %d\n", height) + pv.EphemeralPublic, share.EphemeralPublic = nil, nil signState := signer.SignStateConsensus{ Height: height, diff --git a/cmd/horcrux/cmd/state_test.go b/cmd/horcrux/cmd/state_test.go index 9f68622f..01512f20 100644 --- a/cmd/horcrux/cmd/state_test.go +++ b/cmd/horcrux/cmd/state_test.go @@ -2,7 +2,6 @@ package cmd import ( "io" - "os" "path/filepath" "strconv" "testing" @@ -13,14 +12,11 @@ import ( ) func TestStateSetCmd(t *testing.T) { - tmpHome := "/tmp/TestStateSetCmd" + tmpHome := t.TempDir() tmpConfig := filepath.Join(tmpHome, ".horcrux") chainid := "horcrux-1" - err := os.Setenv("HOME", tmpHome) - require.NoError(t, err) - err = os.MkdirAll(tmpHome, 0777) - require.NoError(t, err) + t.Setenv("HOME", tmpHome) cmd := initCmd() cmd.SetOutput(io.Discard) @@ -28,12 +24,13 @@ func TestStateSetCmd(t *testing.T) { chainid, "tcp://10.168.0.1:1234", "-c", + "-t", "2", "-p", "tcp://10.168.1.2:2222|2,tcp://10.168.1.3:2222|3", "-l", "tcp://10.168.1.1:2222", "-t", "2", "--timeout", "1500ms", }) - err = cmd.Execute() + err := cmd.Execute() require.NoError(t, err) tcs := []struct { @@ -90,8 +87,4 @@ func TestStateSetCmd(t *testing.T) { } }) } - - t.Cleanup(func() { - os.RemoveAll(tmpHome) - }) } diff --git a/docs/comparison.md b/docs/comparison.md new file mode 100644 index 00000000..8afe3664 --- /dev/null +++ b/docs/comparison.md @@ -0,0 +1,7 @@ +| | Num. Rounds | Robust | Num.
Signers | Parallel Secure | +|--------------------|:--------------------:|:------:|:------------:|:---------------:| +| **Stinson Strobl** | 4 | Yes | t | Yes | +| **Gennaro et al.** | 1 with preprocessing | No | n | No | +| **FROST** | 1 with preprocessing | No | t | Yes | + +**Stinson Strobl** is the only threshold scheme implemented in Horcrux. However, it is important to note that the key generation in Horcrux is not the same as proposed in the paper. Instead it is "classic" Shamir secret sharing with a fully trusted dealer. diff --git a/docs/metrics.md b/docs/metrics.md new file mode 100644 index 00000000..a7090fc3 --- /dev/null +++ b/docs/metrics.md @@ -0,0 +1,124 @@ +# Prometheus Metrics + +## Enabling Prometheus +Specify the port for incoming Prometheus connections during 'config init' by using the -d flag. +``` +horcrux ..options.. -d 0.0.0.0:6001 +``` + +For earlier adopters, add the following key to your config.yaml + +debug-addr: 0.0.0.0:6001 + +Resulting in a configuration like the following: + +``` +chain-id: testnet-1 +cosigner: + threshold: 2 + shares: 3 + p2p-listen: tcp://localhost:5001 + peers: + - share-id: 2 + p2p-addr: tcp://localhost:5002 + - share-id: 3 + p2p-addr: tcp://localhost:5003 + rpc-timeout: 1500ms +chain-nodes: +- priv-val-addr: tcp://localhost:2300 +debug-addr: 0.0.0.0:6001 +``` + +## Prometheus Cautions + +Prometheus scrapes data every minute by default, which is not fast enough to capture metrics that change on a faster interval. + +Set the scrape_interval between 1 and 3 seconds in prometheus.yml if you wish to log/monitor these metrics. Note that this will take more disk space. + +``` +global: + scrape_interval: 3s +``` + + +## Watching Single Signers + +Single node signers don't execute any cosigner code, so the basic metrics are: + * signer_seconds_since_last_precommit + * signer_seconds_since_last_prevote + * signer_last_precommit_height + * signer_last_prevote_height + +If the 'seconds_since' metrics exceed the normal block time, this may indicate a sentry failure or a network stall/halt. + +If there are skips in the block heights requested to be signed, the following counters will increase AFTER the sentry is able to report the latest block height. Until then, from the perspective of horcrux, it looks no different than a network stall. + * signer_total_missed_precommits + * signer_total_missed_prevotes + +## Watching Sentry Failure + +Watch 'signer_sentry_connect_tries' for any increase, which indicates retry attempts to reach your sentry. + +If 'signer_total_sentry_connect_tries' is significant, it can indicate network or server issues. + +## Watching Cosigner With Grafana + +A sample Grafana configuration is available. See [`horcrux.json`](https://github.com/chillyvee/horcrux-info/blob/master/grafana/horcrux.json) + + +## Watching For Cosigner Trouble +Metrics may vary between Cosigner processes since there is only one leader. + +Watch 'signer_missed_ephemeral_shares', which indicates when the leader is not able to get a signature from the peer. If 'signer_total_missed_ephemeral_shares' increases to a high number, this may indicate a larger issue. + +Each block, Ephemeral Secrets are shared between Cosigners. Monitoring 'signer_seconds_since_last_local_ephemeral_share_time' and ensuring it does not exceed the block time will allow you to know when a Cosigner was not contacted for a block. + +## Metrics that don't always correspond to block time +There is no guarantee that a Cosigner will sign a block if the threshold is reached early.
You may watch 'signer_seconds_since_last_local_sign_start_time', but there is no guarantee that 'signer_seconds_since_last_local_sign_finish_time' will be reached, since multiple sanity checks may cause an early exit in some (rather rare) circumstances. + +## Metrics on the raft leader may be different +You may watch these metrics on the leader, but they will continue to rise on Cosigners that are not the raft leader (since followers rarely handle the original signing request): + * signer_seconds_since_last_precommit + * signer_seconds_since_last_prevote + +As a result, followers also do not update these metrics: +* signer_last_precommit_height +* signer_last_prevote_height + + +## Checking Signing Performance +We currently only have metrics between the leader and followers (not full p2p metrics). However, they are still useful for determining when a particular peer lags significantly. + +Your cluster should reach the threshold for availability in a short time. Monitor the following: + +``` +signer_sign_block_threshold_lag_seconds{quantile="0.5"} 0.019399953 +signer_sign_block_threshold_lag_seconds{quantile="0.9"} 0.028546635 +signer_sign_block_threshold_lag_seconds{quantile="0.99"} 0.029730841 +``` + +After reaching the threshold, all cosigners should sign quickly: +``` +signer_sign_block_cosigner_lag_seconds{quantile="0.5"} 0.031424561 +signer_sign_block_cosigner_lag_seconds{quantile="0.9"} 0.0407505 +signer_sign_block_cosigner_lag_seconds{quantile="0.99"} 0.045173791 +``` + +If 'signer_sign_block_cosigner_lag_seconds' takes a significant amount of time, you can check the performance of each cosigner as it is seen by the raft leader. High numbers may indicate a high-latency link or a resource constraint. This metric is only available on the Leader and will report 'NaN' on followers. +``` +signer_cosigner_sign_lag_seconds{peerid="tcp://localhost:5001",quantile="0.5"} 0.010391636 +signer_cosigner_sign_lag_seconds{peerid="tcp://localhost:5001",quantile="0.9"} 0.013242445 +signer_cosigner_sign_lag_seconds{peerid="tcp://localhost:5001",quantile="0.99"} 0.017128885 +signer_cosigner_sign_lag_seconds_sum{peerid="tcp://localhost:5001"} 1.1935657130000004 +signer_cosigner_sign_lag_seconds_count{peerid="tcp://localhost:5001"} 120 +signer_cosigner_sign_lag_seconds{peerid="tcp://localhost:5002",quantile="0.5"} 0.010473575 +signer_cosigner_sign_lag_seconds{peerid="tcp://localhost:5002",quantile="0.9"} 0.013052952 +signer_cosigner_sign_lag_seconds{peerid="tcp://localhost:5002",quantile="0.99"} 0.01732663 +signer_cosigner_sign_lag_seconds_sum{peerid="tcp://localhost:5002"} 1.014658521 +signer_cosigner_sign_lag_seconds_count{peerid="tcp://localhost:5002"} 103 +signer_cosigner_sign_lag_seconds{peerid="tcp://localhost:5003",quantile="0.5"} 0.010760536 +signer_cosigner_sign_lag_seconds{peerid="tcp://localhost:5003",quantile="0.9"} 0.012623563 +signer_cosigner_sign_lag_seconds{peerid="tcp://localhost:5003",quantile="0.99"} 0.016456836 +``` + + diff --git a/docs/migrating.md b/docs/migrating.md index 631e26d7..c57b0018 100644 --- a/docs/migrating.md +++ b/docs/migrating.md @@ -4,7 +4,7 @@ Before starting, \***\*please make sure to have a clear understanding of node and validator operational requirements\*\***. This guide is medium to high difficulty. Operation of `horcrux` assumes significant prior knowledge of these systems.
Debugging problems that may arise will entail a significant amount of financial risk (double sign) if you are running on mainnet, so a clear understanding of the systems you are working with is important. Please attempt this operation on a testnet before you do so on a mainnet validator. -> **CAUTION:** This operation will require you to take your validator down for some time. If you work quickly and follow the guide, this downtime shouldn't be more than 5-10 minutes. But reguardless, be aware of the downtime slashing on your chain and be careful not to exceed that limit. +> **CAUTION:** This operation will require you to take your validator down for some time. If you work quickly and follow the guide, this downtime shouldn't be more than 5-10 minutes. But regardless, be aware of the downtime slashing on your chain and be careful not to exceed that limit. ## Validator System Migration @@ -14,7 +14,7 @@ This document will describe a migration from a "starting system" to a 2-of-3 mul - VM: 4 CPU, 16 GB RAM, 500GB SSD storage running fully synced chain daemon also acting as a validator -### Example Migration Infrastrcuture +### Example Migration Infrastructure - Sentries: 3x VM w/ 4 CPU, 16GB RAM, 500GB SSD storage running fully synced chain daemon - These chain daemons should only expose the `:26656` (p2p) port to the open internet @@ -45,7 +45,7 @@ signer-2: 10.168.1.2 signer-3: 10.168.1.3 ``` -When installing `horcrux` we recommend using the prebuilt binaries from the [releases page](https://github.com/strangelove-ventures/horcrux/releases). Pick the release cooresponding to the `tendermint` dependancy for the `go.mod` of your chain binary. You should be able to get this with `{binary} version --long`. Install like so: +When installing `horcrux` we recommend using the prebuilt binaries from the [releases page](https://github.com/strangelove-ventures/horcrux/releases). Pick the release corresponding to the `tendermint` dependency for the `go.mod` of your chain binary. You should be able to get this with `{binary} version --long`. Install like so: ```bash # On each signer VM @@ -89,11 +89,11 @@ $ horcrux config init {my_chain_id} "tcp://10.168.0.3:1234" -c -p "tcp://10.168. > **NOTE:** The `-k` or `--keyfile` flag lets you set the file path for the private key share file if you would like to use a different path than `~/.horcrux/share.json`. -> **NOTE:** The `--timeout` value defaults to `1000ms`. If you are running in disconnected data centers (i.e. accross amazon AZs or gcp zones) increasing the timeout slightly helps to avoid missed blocks especially around proposals. +> **NOTE:** The `--timeout` value defaults to `1000ms`. If you are running in disconnected data centers (i.e. across amazon AZs or gcp zones) increasing the timeout slightly helps to avoid missed blocks especially around proposals. ### 3. Split `priv_validator_key.json` and distribute key material -> **CAUTION:** **The security of any key material is outside the scope of this guide. The suggested proceedure here is not necessarily the one you will use. We aim to make this guide easy to understand, not necessarily the most secure. The tooling here is all written in go and can be compiled and used in an airgapped setup if needed. Please open issues if you have questions around how to fit `horcrux` into your infra.** +> **CAUTION:** **The security of any key material is outside the scope of this guide. The suggested procedure here is not necessarily the one you will use.
We aim to make this guide easy to understand, not necessarily the most secure. The tooling here is all written in Go and can be compiled and used in an airgapped setup if needed. Please open issues if you have questions about how to fit `horcrux` into your infra.** On some computer that contains your `priv_validator_key.json`, create a folder to split the key through the following command. This may take a moment to complete: diff --git a/go.mod b/go.mod index 38c10e1f..2c87f935 100644 --- a/go.mod +++ b/go.mod @@ -5,14 +5,14 @@ go 1.19 require ( github.com/Jille/grpc-multi-resolver v1.1.0 github.com/Jille/raft-grpc-leader-rpc v1.1.0 - github.com/Jille/raft-grpc-transport v1.2.0 + github.com/Jille/raft-grpc-transport v1.2.1-0.20220914172309-2f253856eefc github.com/Jille/raftadmin v1.2.0 github.com/avast/retry-go v3.0.0+incompatible github.com/cosmos/cosmos-sdk v0.44.5 github.com/gogo/protobuf v1.3.3 github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 - github.com/hashicorp/raft v1.3.3 - github.com/hashicorp/raft-boltdb v0.0.0-20211202195631-7d34b9fb3f42 + github.com/hashicorp/raft v1.3.10 + github.com/hashicorp/raft-boltdb/v2 v2.2.2 github.com/mitchellh/go-homedir v1.1.0 github.com/ory/dockertest v3.3.5+incompatible github.com/spf13/cobra v1.2.1 @@ -77,9 +77,11 @@ require ( github.com/gsterjov/go-libsecret v0.0.0-20161001094733-a6f4afe4910c // indirect github.com/gtank/merlin v0.1.1 // indirect github.com/gtank/ristretto255 v0.1.2 // indirect + github.com/hashicorp/errwrap v1.0.0 // indirect github.com/hashicorp/go-hclog v0.16.2 // indirect github.com/hashicorp/go-immutable-radix v1.3.1 // indirect github.com/hashicorp/go-msgpack v1.1.5 // indirect + github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/hashicorp/golang-lru v0.5.4 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/hdevalence/ed25519consensus v0.0.0-20210204194344-59a8610d2b87 // indirect diff --git a/go.sum b/go.sum index e0bedb76..db3822a8 100644 --- a/go.sum +++ b/go.sum @@ -75,6 +75,8 @@ github.com/Jille/raft-grpc-leader-rpc v1.1.0 h1:u36rmA4tjp+4FSdZ17jg/1sfSCYNQIe5 github.com/Jille/raft-grpc-leader-rpc v1.1.0/go.mod h1:l+pK+uPuqpFDFcPmyUPSng4257UXrST0Vc3Lo4XwVB0= github.com/Jille/raft-grpc-transport v1.2.0 h1:W/YSPz8IsirEyomjKmDog5Xk71o9+l4KhyMEX2TsgSs= github.com/Jille/raft-grpc-transport v1.2.0/go.mod h1:GQGUXJfjlzwA390Ox1AyVYpjCLhtGd6yqY9Sb5hpQfc= +github.com/Jille/raft-grpc-transport v1.2.1-0.20220914172309-2f253856eefc h1:xF58NlLrijxTgZ/sfwUEVFJj/y0v2SxdIPoyHlLEjxI= +github.com/Jille/raft-grpc-transport v1.2.1-0.20220914172309-2f253856eefc/go.mod h1:77bQXfQSgLTAn1Iwi9MJDNE7KwPmdeW42Pd4HUHdl9E= github.com/Jille/raftadmin v1.2.0 h1:hMLFUK7iKpeXP+CoIhNMWj+F53XOLSjMDSia0C60cps= github.com/Jille/raftadmin v1.2.0/go.mod h1:vtVEpToPGTUPVwwunypWDpi69JpdnHMhWRUlc/65U+Y= github.com/Knetic/govaluate v3.0.1-0.20171022003610-9aa49832a739+incompatible/go.mod h1:r7JcOSlj0wfOMncg0iLm8Leh48TZaKVeNIfJntJ2wa0= @@ -111,7 +113,6 @@ github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hC github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY= github.com/armon/go-metrics v0.0.0-20190430140413-ec5e00d3c878/go.mod h1:3AMJUQhVx52RsWOnlkpikZr01T/yAVN2gn0861vByNg= -github.com/armon/go-metrics v0.3.8/go.mod h1:4O98XIr/9W0sxpJ8UaYkvjk10Iff7SnFrb4QAOwNTFc= github.com/armon/go-metrics v0.3.9
h1:O2sNqxBdvq8Eq5xmzljcYzAORli6RWCvEym4cJf9m18= github.com/armon/go-metrics v0.3.9/go.mod h1:4O98XIr/9W0sxpJ8UaYkvjk10Iff7SnFrb4QAOwNTFc= github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= @@ -448,6 +449,7 @@ github.com/hashicorp/consul/api v1.1.0/go.mod h1:VmuI/Lkw1nC05EYQWNKwWGbkg+FbDBt github.com/hashicorp/consul/api v1.3.0/go.mod h1:MmDNSzIMUjNpY/mQ398R4bk2FnqQLoPndWW5VkKPlCE= github.com/hashicorp/consul/sdk v0.1.1/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8= github.com/hashicorp/consul/sdk v0.3.0/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8= +github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-cleanhttp v0.5.0/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= github.com/hashicorp/go-cleanhttp v0.5.1/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= @@ -462,6 +464,8 @@ github.com/hashicorp/go-msgpack v0.5.5/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iP github.com/hashicorp/go-msgpack v1.1.5 h1:9byZdVjKTe5mce63pRVNP1L7UAmdHOTEMGehn6KvJWs= github.com/hashicorp/go-msgpack v1.1.5/go.mod h1:gWVc3sv/wbDmR3rQsj1CAktEZzoz1YNK9NfGLXJ69/4= github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk= +github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= +github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= github.com/hashicorp/go-retryablehttp v0.5.3/go.mod h1:9B5zBasrRhHXnJnui7y6sL7es7NDiJgTc6Er0maI1Xs= github.com/hashicorp/go-rootcerts v1.0.0/go.mod h1:K6zTfqpRlCUIjkwsN4Z+hiSfzSTQa6eBIzfwKfwNnHU= github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerXegt+ozgdvDeDU= @@ -485,9 +489,13 @@ github.com/hashicorp/raft v1.1.2/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7 github.com/hashicorp/raft v1.3.1/go.mod h1:4Ak7FSPnuvmb0GV6vgIAJ4vYT4bek9bb6Q+7HVbyzqM= github.com/hashicorp/raft v1.3.3 h1:Xr6DSHC5cIM8kzxu+IgoT/+MeNeUNeWin3ie6nlSrMg= github.com/hashicorp/raft v1.3.3/go.mod h1:4Ak7FSPnuvmb0GV6vgIAJ4vYT4bek9bb6Q+7HVbyzqM= +github.com/hashicorp/raft v1.3.10 h1:LR5QZX1VQd0DFWZfeCwWawyeKfpS/Tm1yjnJIY5X4Tw= +github.com/hashicorp/raft v1.3.10/go.mod h1:J8naEwc6XaaCfts7+28whSeRvCqTd6e20BlCU3LtEO4= github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk= -github.com/hashicorp/raft-boltdb v0.0.0-20211202195631-7d34b9fb3f42 h1:Ye8SofeDHJzu9xvvaMmpMkqHELWW7rTcXwdUR0CWW48= -github.com/hashicorp/raft-boltdb v0.0.0-20211202195631-7d34b9fb3f42/go.mod h1:wcXL8otVu5cpJVLjcmq7pmfdRCdaP+xnvu7WQcKJAhs= +github.com/hashicorp/raft-boltdb v0.0.0-20210409134258-03c10cc3d4ea h1:RxcPJuutPRM8PUOyiweMmkuNO+RJyfy2jds2gfvgNmU= +github.com/hashicorp/raft-boltdb v0.0.0-20210409134258-03c10cc3d4ea/go.mod h1:qRd6nFJYYS6Iqnc/8HcUmko2/2Gw8qTFEmxDLii6W5I= +github.com/hashicorp/raft-boltdb/v2 v2.2.2 h1:rlkPtOllgIcKLxVT4nutqlTH2NRFn+tO1wwZk/4Dxqw= +github.com/hashicorp/raft-boltdb/v2 v2.2.2/go.mod h1:N8YgaZgNJLpZC+h+by7vDu5rzsRgONThTEeUS3zWbfY= github.com/hashicorp/serf v0.8.2/go.mod h1:6hOLApaqBFA1NXqRQAsxw9QxuDEvNxSQRwA/JwenrHc= github.com/hdevalence/ed25519consensus v0.0.0-20210204194344-59a8610d2b87 h1:uUjLpLt6bVvZ72SQc/B4dXcPBw4Vgd7soowdRl52qEM= github.com/hdevalence/ed25519consensus v0.0.0-20210204194344-59a8610d2b87/go.mod h1:XGsKKeXxeRr95aEOgipvluMPlgjr7dGlk9ZTWOjcUcg= @@ -902,6 +910,7 @@ 
go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.5.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.uber.org/goleak v1.1.12/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/multierr v1.3.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= diff --git a/signer/grpc_server.go b/signer/grpc_server.go index e524a847..1ab9b3bb 100644 --- a/signer/grpc_server.go +++ b/signer/grpc_server.go @@ -44,8 +44,14 @@ func (rpc *GRPCServer) SetEphemeralSecretPartsAndSign( SignBytes: req.GetSignBytes(), }) if err != nil { + rpc.raftStore.logger.Error("Failed to sign with share", "error", err) return nil, err } + rpc.raftStore.logger.Info("Signed with share", + "height", req.Hrst.Height, + "round", req.Hrst.Round, + "step", req.Hrst.Step, + ) return &proto.CosignerGRPCSetEphemeralSecretPartsAndSignResponse{ EphemeralPublic: res.EphemeralPublic, Timestamp: res.Timestamp.UnixNano(), @@ -86,3 +92,11 @@ func (rpc *GRPCServer) TransferLeadership( rpc.raftStore.raft.LeadershipTransfer() return &proto.CosignerGRPCTransferLeadershipResponse{}, nil } + +func (rpc *GRPCServer) GetLeader( + ctx context.Context, + req *proto.CosignerGRPCGetLeaderRequest, +) (*proto.CosignerGRPCGetLeaderResponse, error) { + leader := rpc.raftStore.GetLeader() + return &proto.CosignerGRPCGetLeaderResponse{Leader: string(leader)}, nil +} diff --git a/signer/local_cosigner.go b/signer/local_cosigner.go index 5ebd371a..42d39a2a 100644 --- a/signer/local_cosigner.go +++ b/signer/local_cosigner.go @@ -4,8 +4,52 @@ import ( "crypto/rsa" "sync" "time" + + tmcryptoed25519 "github.com/tendermint/tendermint/crypto/ed25519" + tmjson "github.com/tendermint/tendermint/libs/json" + "gitlab.com/unit410/edwards25519" + tsed25519 "gitlab.com/unit410/threshold-ed25519/pkg" ) +type LastSignStateWrapper struct { + // Signing is thread safe - lastSignStateMutex guards access so only one goroutine can read or write the last sign state at a time + lastSignStateMutex sync.Mutex + + // lastSignState stores the last sign state for a share we have fully signed + // incremented whenever we are asked to sign a share + LastSignState *SignState +} + +// Less reports whether this key is ordered before the other key, comparing height, then round, then step +func (hrst *HRSTKey) Less(other HRSTKey) bool { + if hrst.Height < other.Height { + return true + } + + if hrst.Height > other.Height { + return false + } + + // height is equal, check round + + if hrst.Round < other.Round { + return true + } + + if hrst.Round > other.Round { + return false + } + + // round is equal, check step + + if hrst.Step < other.Step { + return true + } + + // HRS is greater or equal + return false +} + type CosignerPeer struct { ID int PublicKey rsa.PublicKey @@ -19,13 +63,15 @@ type CosignerGetEphemeralSecretPartRequest struct { Timestamp time.Time } -type LastSignStateStruct struct { - // Signing is thread safe - lastSignStateMutex is used for putting locks so only one goroutine can r/w to the function - LastSignStateMutex sync.Mutex - - // lastSignState stores the last sign state for a share we have fully signed - // incremented whenever we are asked to sign a share - LastSignState *SignState +type LocalCosignerConfig struct { + CosignerKey
CosignerKey + SignState *SignState + RsaKey rsa.PrivateKey + Peers []CosignerPeer + Address string + RaftAddress string + Total uint8 + Threshold uint8 } // LocalCosigner responds to sign requests using their share key @@ -59,8 +105,13 @@ func NewLocalCosigner( Peers: make(map[int]CosignerPeer), } - for _, peer := range peers { - cosigner.Peers[peer.ID] = peer + // cache the public key bytes for signing operations + switch ed25519Key := cosigner.key.PubKey.(type) { + case tmcryptoed25519.PubKey: + cosigner.pubKeyBytes = make([]byte, len(ed25519Key)) + copy(cosigner.pubKeyBytes, ed25519Key[:]) + default: + panic("Not an ed25519 public key") } return cosigner @@ -84,10 +135,151 @@ func (cosigner *LocalCosigner) GetAddress() string { return cosigner.address } +// sign signs the sign request using the cosigner's share key +// and returns the signed bytes or an error +func (cosigner *LocalCosigner) sign(req CosignerSignRequest) (CosignerSignResponse, error) { + // This function has multiple exit points. Only start time can be guaranteed + metricsTimeKeeper.SetPreviousLocalSignStart(time.Now()) + + cosigner.lastSignStateMutex.Lock() + defer cosigner.lastSignStateMutex.Unlock() + + res := CosignerSignResponse{} + lss := cosigner.lastSignState + + hrst, err := UnpackHRST(req.SignBytes) + if err != nil { + return res, err + } + + sameHRS, err := lss.CheckHRS(hrst) + if err != nil { + return res, err + } + + // If the HRS is the same the sign bytes may still differ by timestamp + // It is ok to re-sign a different timestamp if that is the only difference in the sign bytes + if sameHRS { + if bytes.Equal(req.SignBytes, lss.SignBytes) { + res.EphemeralPublic = lss.EphemeralPublic + res.Signature = lss.Signature + return res, nil + } else if err := lss.OnlyDifferByTimestamp(req.SignBytes); err != nil { + return res, err + } + + // same HRS, and only differ by timestamp - ok to sign again + } + + meta, ok := cosigner.hrsMeta[hrst] + if !ok { + return res, errors.New("no metadata at HRS") + } + + shareParts := make([]tsed25519.Scalar, 0) + publicKeys := make([]tsed25519.Element, 0) + + // calculate secret and public keys + for _, peer := range meta.Peers { + if len(peer.Share) == 0 { + continue + } + shareParts = append(shareParts, peer.Share) + publicKeys = append(publicKeys, peer.EphemeralSecretPublicKey) + } + + ephemeralShare := tsed25519.AddScalars(shareParts) + ephemeralPublic := tsed25519.AddElements(publicKeys) + + // check bounds for ephemeral share to avoid passing out of bounds values to SignWithShare + { + if len(ephemeralShare) != 32 { + return res, errors.New("ephemeral share is out of bounds") + } + + var scalarBytes [32]byte + copy(scalarBytes[:], ephemeralShare) + if !edwards25519.ScMinimal(&scalarBytes) { + return res, errors.New("ephemeral share is out of bounds") + } + } + + sig := tsed25519.SignWithShare( + req.SignBytes, cosigner.key.ShareKey, ephemeralShare, cosigner.pubKeyBytes, ephemeralPublic) + + cosigner.lastSignState.EphemeralPublic = ephemeralPublic + err = cosigner.lastSignState.Save(SignStateConsensus{ + Height: hrst.Height, + Round: hrst.Round, + Step: hrst.Step, + Signature: sig, + SignBytes: req.SignBytes, + }, nil, true) + + if err != nil { + if _, isSameHRSError := err.(*SameHRSError); !isSameHRSError { + return res, err + } + } + + for existingKey := range cosigner.hrsMeta { + // delete any HRS lower than our signed level + // we will not be providing parts for any lower HRS + if existingKey.Less(hrst) { + delete(cosigner.hrsMeta, existingKey) + }
+ } + + res.EphemeralPublic = ephemeralPublic + res.Signature = sig + + // Note - Function may return before this line so elapsed time for Finish may be multiple block times + metricsTimeKeeper.SetPreviousLocalSignFinish(time.Now()) + + return res, nil +} + +func (cosigner *LocalCosigner) dealShares(req CosignerGetEphemeralSecretPartRequest) (HrsMetadata, error) { + hrsKey := HRSTKey{ + Height: req.Height, + Round: req.Round, + Step: req.Step, + Timestamp: req.Timestamp.UnixNano(), + } + + meta, ok := cosigner.hrsMeta[hrsKey] + + if ok { + return meta, nil + } + + secret := make([]byte, 32) + if _, err := rand.Read(secret); err != nil { + return HrsMetadata{}, err + } + + meta = HrsMetadata{ + Secret: secret, + Peers: make([]PeerMetadata, cosigner.total), + } + + // split this secret with Shamir's secret sharing + // !! dealt shares need to be saved because dealing produces different shares each time! + meta.DealtShares = tsed25519.DealShares(meta.Secret, cosigner.threshold, cosigner.total) + + cosigner.hrsMeta[hrsKey] = meta + + return meta, nil + +} + // GetEphemeralSecretParts // // Implements the Cosigner interface from Cosigner.go func (cosigner *LocalCosigner) GetEphemeralSecretParts( hrst HRSTKey) (*CosignerEphemeralSecretPartsResponse, error) { + metricsTimeKeeper.SetPreviousLocalEphemeralShare(time.Now()) + res := &CosignerEphemeralSecretPartsResponse{ EncryptedSecrets: make([]CosignerEphemeralSecretPart, 0, len(cosigner.Peers)-1), } @@ -113,6 +305,160 @@ func (cosigner *LocalCosigner) GetEphemeralSecretParts( return res, nil } +// Get the ephemeral secret part for an ephemeral share +// The ephemeral secret part is encrypted for the receiver +func (cosigner *LocalCosigner) getEphemeralSecretPart( + req CosignerGetEphemeralSecretPartRequest) (CosignerEphemeralSecretPart, error) { + res := CosignerEphemeralSecretPart{} + + // protects the meta map + cosigner.lastSignStateMutex.Lock() + defer cosigner.lastSignStateMutex.Unlock() + + hrst := HRSTKey{ + Height: req.Height, + Round: req.Round, + Step: req.Step, + Timestamp: req.Timestamp.UnixNano(), + } + + meta, ok := cosigner.hrsMeta[hrst] + // generate metadata placeholder + if !ok { + newMeta, err := cosigner.dealShares(CosignerGetEphemeralSecretPartRequest{ + Height: req.Height, + Round: req.Round, + Step: req.Step, + Timestamp: req.Timestamp, + }) + + if err != nil { + return res, err + } + + meta = newMeta + cosigner.hrsMeta[hrst] = meta + } + + ourEphPublicKey := tsed25519.ScalarMultiplyBase(meta.Secret) + + // set our values + meta.Peers[cosigner.key.ID-1].Share = meta.DealtShares[cosigner.key.ID-1] + meta.Peers[cosigner.key.ID-1].EphemeralSecretPublicKey = ourEphPublicKey + + // grab the peer info for the ID being requested + peer, ok := cosigner.peers[req.ID] + if !ok { + return res, errors.New("unknown peer ID") + } + + sharePart := meta.DealtShares[req.ID-1] + + // use RSA public to encrypt user's share part + encrypted, err := rsa.EncryptOAEP(sha256.New(), rand.Reader, &peer.PublicKey, sharePart, nil) + if err != nil { + return res, err + } + + res.SourceID = cosigner.key.ID + res.SourceEphemeralSecretPublicKey = ourEphPublicKey + res.EncryptedSharePart = encrypted + + // sign the response payload with our private key + // cosigners can verify the signature to confirm sender validity + { + jsonBytes, err := tmjson.Marshal(res) + + if err != nil { + return res, err + } + + digest := sha256.Sum256(jsonBytes) + signature, err := rsa.SignPSS(rand.Reader, &cosigner.rsaKey, crypto.SHA256, digest[:], nil) + if err != nil { + return res, err + }
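+ // attach the signature so receiving cosigners can verify the sender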
+ + res.SourceSig = signature + } + + res.DestinationID = req.ID + + return res, nil +} + +// Store an ephemeral secret share part provided by another cosigner +func (cosigner *LocalCosigner) setEphemeralSecretPart(req CosignerSetEphemeralSecretPartRequest) error { + // Verify the source signature + { + if req.SourceSig == nil { + return errors.New("SourceSig field is required") + } + + digestMsg := CosignerEphemeralSecretPart{} + digestMsg.SourceID = req.SourceID + digestMsg.SourceEphemeralSecretPublicKey = req.SourceEphemeralSecretPublicKey + digestMsg.EncryptedSharePart = req.EncryptedSharePart + + digestBytes, err := tmjson.Marshal(digestMsg) + if err != nil { + return err + } + + digest := sha256.Sum256(digestBytes) + peer, ok := cosigner.peers[req.SourceID] + + if !ok { + return fmt.Errorf("unknown cosigner: %d", req.SourceID) + } + + peerPub := peer.PublicKey + err = rsa.VerifyPSS(&peerPub, crypto.SHA256, digest[:], req.SourceSig, nil) + if err != nil { + return err + } + } + + // protects the meta map + cosigner.lastSignStateMutex.Lock() + defer cosigner.lastSignStateMutex.Unlock() + + hrst := HRSTKey{ + Height: req.Height, + Round: req.Round, + Step: req.Step, + Timestamp: req.Timestamp.UnixNano(), + } + + meta, ok := cosigner.hrsMeta[hrst] + // generate metadata placeholder + if !ok { + newMeta, err := cosigner.dealShares(CosignerGetEphemeralSecretPartRequest{ + Height: req.Height, + Round: req.Round, + Step: req.Step, + }) + + if err != nil { + return err + } + + meta = newMeta + cosigner.hrsMeta[hrst] = meta + } + + // decrypt share + sharePart, err := rsa.DecryptOAEP(sha256.New(), rand.Reader, &cosigner.rsaKey, req.EncryptedSharePart, nil) + if err != nil { + return err + } + + // set slot + meta.Peers[req.SourceID-1].Share = sharePart + meta.Peers[req.SourceID-1].EphemeralSecretPublicKey = req.SourceEphemeralSecretPublicKey + return nil +} + // SetEphemeralSecretPartsAndSign // Implements the Cosigner interface from Cosigner.go func (cosigner *LocalCosigner) SetEphemeralSecretPartsAndSign( diff --git a/signer/metrics.go b/signer/metrics.go new file mode 100644 index 00000000..5709a06a --- /dev/null +++ b/signer/metrics.go @@ -0,0 +1,262 @@ +package signer + +import ( + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +type metricsTimer struct { + mu sync.Mutex + previousPrecommit, previousPrevote time.Time + previousLocalSignStart, previousLocalSignFinish time.Time + previousLocalEphemeralShare time.Time +} + +func newMetricsTimer() *metricsTimer { + now := time.Now() + return &metricsTimer{ + mu: sync.Mutex{}, + previousPrecommit: now, previousPrevote: now, + previousLocalSignStart: now, previousLocalSignFinish: now, + previousLocalEphemeralShare: now, + } +} + +func (mt *metricsTimer) SetPreviousPrecommit(t time.Time) { + mt.mu.Lock() + defer mt.mu.Unlock() + mt.previousPrecommit = t +} + +func (mt *metricsTimer) SetPreviousPrevote(t time.Time) { + mt.mu.Lock() + defer mt.mu.Unlock() + mt.previousPrevote = t +} + +func (mt *metricsTimer) SetPreviousLocalSignStart(t time.Time) { + mt.mu.Lock() + defer mt.mu.Unlock() + mt.previousLocalSignStart = t +} + +func (mt *metricsTimer) SetPreviousLocalSignFinish(t time.Time) { + mt.mu.Lock() + defer mt.mu.Unlock() + mt.previousLocalSignFinish = t +} + +func (mt *metricsTimer) SetPreviousLocalEphemeralShare(t time.Time) { + mt.mu.Lock() + defer mt.mu.Unlock() + mt.previousLocalEphemeralShare = t +} + +func (mt *metricsTimer) UpdatePrometheusMetrics(t 
time.Time) { + mt.mu.Lock() + defer mt.mu.Unlock() + + // Update Prometheus Gauges + secondsSinceLastPrecommit.Set(time.Since(mt.previousPrecommit).Seconds()) + secondsSinceLastPrevote.Set(time.Since(mt.previousPrevote).Seconds()) + secondsSinceLastLocalSignStart.Set(time.Since(mt.previousLocalSignStart).Seconds()) + secondsSinceLastLocalSignFinish.Set(time.Since(mt.previousLocalSignFinish).Seconds()) + secondsSinceLastLocalEphemeralShareTime.Set(time.Since(mt.previousLocalEphemeralShare).Seconds()) +} + +var ( + // Variables to calculate Prometheus Metrics + previousPrecommitHeight = int64(0) + previousPrevoteHeight = int64(0) + metricsTimeKeeper = newMetricsTimer() + + // Prometheus Metrics + totalPubKeyRequests = promauto.NewCounter(prometheus.CounterOpts{ + Name: "signer_total_pubkey_requests", + Help: "Total times public key requested (High count may indicate validator restarts)", + }) + lastPrecommitHeight = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "signer_last_precommit_height", + Help: "Last Height Precommit Signed", + }) + lastPrevoteHeight = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "signer_last_prevote_height", + Help: "Last Height Prevote Signed", + }) + + lastProposalHeight = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "signer_last_proposal_height", + Help: "Last Height Proposal Signed", + }) + lastPrecommitRound = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "signer_last_precommit_round", + Help: "Last Round Precommit Signed", + }) + lastPrevoteRound = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "signer_last_prevote_round", + Help: "Last Round Prevote Signed", + }) + lastProposalRound = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "signer_last_proposal_round", + Help: "Last Round Proposal Signed", + }) + + totalPrecommitsSigned = promauto.NewCounter(prometheus.CounterOpts{ + Name: "signer_total_precommits_signed", + Help: "Total Precommit Signed", + }) + totalPrevotesSigned = promauto.NewCounter(prometheus.CounterOpts{ + Name: "signer_total_prevotes_signed", + Help: "Total Prevote Signed", + }) + totalProposalsSigned = promauto.NewCounter(prometheus.CounterOpts{ + Name: "signer_total_proposals_signed", + Help: "Total Proposal Signed", + }) + + secondsSinceLastPrecommit = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "signer_seconds_since_last_precommit", + Help: "Seconds Since Last Precommit (Useful for Signing Co-Signer Node, Single Signer)", + }) + secondsSinceLastPrevote = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "signer_seconds_since_last_prevote", + Help: "Seconds Since Last Prevote (Useful for Signing Co-Signer Node, Single Signer)", + }) + secondsSinceLastLocalSignStart = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "signer_seconds_since_last_local_sign_start_time", + Help: "Seconds Since Last Local Start Sign (May increase beyond block time, Rarely important) ", + }) + secondsSinceLastLocalSignFinish = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "signer_seconds_since_last_local_sign_finish_time", + Help: "Seconds Since Last Local Finish Sign (Should stay below 2 * Block Time)", + }) + + secondsSinceLastLocalEphemeralShareTime = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "signer_seconds_since_last_local_ephemeral_share_time", + Help: "Seconds Since Last Local Ephemeral Share Sign " + + "(Should not increase beyond block time; If high, may indicate raft joining issue for CoSigner) ", + }) + + missedPrecommits = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "signer_missed_precommits", + Help: "Consecutive Precommit 
Missed", + }) + missedPrevotes = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "signer_missed_prevotes", + Help: "Consecutive Prevote Missed", + }) + totalMissedPrecommits = promauto.NewCounter(prometheus.CounterOpts{ + Name: "signer_total_missed_precommits", + Help: "Total Precommit Missed", + }) + totalMissedPrevotes = promauto.NewCounter(prometheus.CounterOpts{ + Name: "signer_total_missed_prevotes", + Help: "Total Prevote Missed", + }) + + missedEphemeralShares = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "signer_missed_ephemeral_shares", + Help: "Consecutive Threshold Signature Parts Missed", + }, + []string{"peerid"}, + ) + totalMissedEphemeralShares = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "signer_total_missed_ephemeral_shares", + Help: "Total Threshold Signature Parts Missed", + }, + []string{"peerid"}, + ) + + sentryConnectTries = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "signer_sentry_connect_tries", + Help: "Consecutive Number of times sentry TCP connect has been tried (High count may indicate validator restarts)", + }) + totalSentryConnectTries = promauto.NewCounter(prometheus.CounterOpts{ + Name: "signer_total_sentry_connect_tries", + Help: "Total Number of times sentry TCP connect has been tried (High count may indicate validator restarts)", + }) + + beyondBlockErrors = promauto.NewCounter(prometheus.CounterOpts{ + Name: "signer_total_beyond_block_errors", + Help: "Total Times Signing Started but duplicate height/round request arrives", + }) + failedSignVote = promauto.NewCounter(prometheus.CounterOpts{ + Name: "signer_total_failed_sign_vote", + Help: "Total Times Signer Failed to sign block - Unstarted and Unexepcted Height", + }) + + totalRaftLeader = promauto.NewCounter(prometheus.CounterOpts{ + Name: "signer_total_raft_leader", + Help: "Total Times Signer is Raft Leader", + }) + totalNotRaftLeader = promauto.NewCounter(prometheus.CounterOpts{ + Name: "signer_total_raft_not_leader", + Help: "Total Times Signer is NOT Raft Leader (Proxy signing to Raft Leader)", + }) + totalRaftLeaderElectiontimeout = promauto.NewCounter(prometheus.CounterOpts{ + Name: "signer_total_raft_leader_election_timeout", + Help: "Total Times Raft Leader Failed Election (Lacking Peers)", + }) + + totalInvalidSignature = promauto.NewCounter(prometheus.CounterOpts{ + Name: "signer_error_total_invalid_signatures", + Help: "Total Times Combined Signature is Invalid", + }) + + totalInsufficientCosigners = promauto.NewCounter(prometheus.CounterOpts{ + Name: "signer_error_total_insufficient_cosigners", + Help: "Total Times Cosigners doesn't reach threshold", + }) + + timedSignBlockThresholdLag = promauto.NewSummary(prometheus.SummaryOpts{ + Name: "signer_sign_block_threshold_lag_seconds", + Help: "Seconds taken to get threshold of cosigners available", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, + }) + + timedSignBlockCosignerLag = promauto.NewSummary(prometheus.SummaryOpts{ + Name: "signer_sign_block_cosigner_lag_seconds", + Help: "Seconds taken to get all cosigner signatures", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, + }) + + timedSignBlockLag = promauto.NewSummary(prometheus.SummaryOpts{ + Name: "signer_sign_block_lag_seconds", + Help: "Seconds taken to sign block", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, + }) + + timedCosignerEphemeralShareLag = promauto.NewSummaryVec( + prometheus.SummaryOpts{ + Name: "signer_cosigner_ephemeral_share_lag_seconds", + Help: "Time taken to get 
cosigner ephemeral share", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, + }, + []string{"peerid"}, + ) + timedCosignerSignLag = promauto.NewSummaryVec( + prometheus.SummaryOpts{ + Name: "signer_cosigner_sign_lag_seconds", + Help: "Time taken to get cosigner signature", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, + }, + []string{"peerid"}, + ) +) + +func StartMetrics() { + // Update elapsed times on an interval basis + for { + metricsTimeKeeper.UpdatePrometheusMetrics(time.Now()) + + // Prometheus often only polls every 1 to every few seconds + // Frequent updates minimize reporting error. + // Accuracy of 100ms is probably sufficient + <-time.After(100 * time.Millisecond) + } +} diff --git a/signer/proto/cosigner_grpc_server.pb.go b/signer/proto/cosigner_grpc_server.pb.go index a49aaada..f8836e28 100644 --- a/signer/proto/cosigner_grpc_server.pb.go +++ b/signer/proto/cosigner_grpc_server.pb.go @@ -1,7 +1,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: -// protoc-gen-go v1.27.1 -// protoc v3.18.1 +// protoc-gen-go v1.28.1 +// protoc v3.21.6 // source: signer/proto/cosigner_grpc_server.proto package proto @@ -673,6 +673,91 @@ func (x *CosignerGRPCTransferLeadershipResponse) GetLeaderAddress() string { return "" } +type CosignerGRPCGetLeaderRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields +} + +func (x *CosignerGRPCGetLeaderRequest) Reset() { + *x = CosignerGRPCGetLeaderRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_signer_proto_cosigner_grpc_server_proto_msgTypes[11] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CosignerGRPCGetLeaderRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CosignerGRPCGetLeaderRequest) ProtoMessage() {} + +func (x *CosignerGRPCGetLeaderRequest) ProtoReflect() protoreflect.Message { + mi := &file_signer_proto_cosigner_grpc_server_proto_msgTypes[11] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CosignerGRPCGetLeaderRequest.ProtoReflect.Descriptor instead. 
+func (*CosignerGRPCGetLeaderRequest) Descriptor() ([]byte, []int) { + return file_signer_proto_cosigner_grpc_server_proto_rawDescGZIP(), []int{11} +} + +type CosignerGRPCGetLeaderResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Leader string `protobuf:"bytes,1,opt,name=leader,proto3" json:"leader,omitempty"` +} + +func (x *CosignerGRPCGetLeaderResponse) Reset() { + *x = CosignerGRPCGetLeaderResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_signer_proto_cosigner_grpc_server_proto_msgTypes[12] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CosignerGRPCGetLeaderResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CosignerGRPCGetLeaderResponse) ProtoMessage() {} + +func (x *CosignerGRPCGetLeaderResponse) ProtoReflect() protoreflect.Message { + mi := &file_signer_proto_cosigner_grpc_server_proto_msgTypes[12] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CosignerGRPCGetLeaderResponse.ProtoReflect.Descriptor instead. +func (*CosignerGRPCGetLeaderResponse) Descriptor() ([]byte, []int) { + return file_signer_proto_cosigner_grpc_server_proto_rawDescGZIP(), []int{12} +} + +func (x *CosignerGRPCGetLeaderResponse) GetLeader() string { + if x != nil { + return x.Leader + } + return "" +} + var File_signer_proto_cosigner_grpc_server_proto protoreflect.FileDescriptor var file_signer_proto_cosigner_grpc_server_proto_rawDesc = []byte{ @@ -763,43 +848,54 @@ var file_signer_proto_cosigner_grpc_server_proto_rawDesc = []byte{ 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x49, 0x44, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x49, 0x44, 0x12, 0x24, 0x0a, 0x0d, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x41, 0x64, 0x64, 0x72, 0x65, 0x73, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, - 0x0d, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x41, 0x64, 0x64, 0x72, 0x65, 0x73, 0x73, 0x32, 0xfc, - 0x03, 0x0a, 0x0c, 0x43, 0x6f, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, 0x43, 0x12, - 0x58, 0x0a, 0x09, 0x53, 0x69, 0x67, 0x6e, 0x42, 0x6c, 0x6f, 0x63, 0x6b, 0x12, 0x23, 0x2e, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x43, 0x6f, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, - 0x43, 0x53, 0x69, 0x67, 0x6e, 0x42, 0x6c, 0x6f, 0x63, 0x6b, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, - 0x74, 0x1a, 0x24, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x43, 0x6f, 0x73, 0x69, 0x67, 0x6e, - 0x65, 0x72, 0x47, 0x52, 0x50, 0x43, 0x53, 0x69, 0x67, 0x6e, 0x42, 0x6c, 0x6f, 0x63, 0x6b, 0x52, - 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x97, 0x01, 0x0a, 0x1e, 0x53, 0x65, - 0x74, 0x45, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x72, 0x61, 0x6c, 0x53, 0x65, 0x63, 0x72, 0x65, 0x74, - 0x50, 0x61, 0x72, 0x74, 0x73, 0x41, 0x6e, 0x64, 0x53, 0x69, 0x67, 0x6e, 0x12, 0x38, 0x2e, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x43, 0x6f, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, - 0x43, 0x53, 0x65, 0x74, 0x45, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x72, 0x61, 0x6c, 0x53, 0x65, 0x63, - 0x72, 0x65, 0x74, 0x50, 0x61, 0x72, 0x74, 0x73, 0x41, 0x6e, 0x64, 0x53, 0x69, 0x67, 0x6e, 0x52, - 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x39, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x43, - 0x6f, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, 0x43, 0x53, 0x65, 0x74, 0x45, 0x70, - 0x68, 0x65, 
0x6d, 0x65, 0x72, 0x61, 0x6c, 0x53, 0x65, 0x63, 0x72, 0x65, 0x74, 0x50, 0x61, 0x72, - 0x74, 0x73, 0x41, 0x6e, 0x64, 0x53, 0x69, 0x67, 0x6e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, - 0x65, 0x22, 0x00, 0x12, 0x82, 0x01, 0x0a, 0x17, 0x47, 0x65, 0x74, 0x45, 0x70, 0x68, 0x65, 0x6d, - 0x65, 0x72, 0x61, 0x6c, 0x53, 0x65, 0x63, 0x72, 0x65, 0x74, 0x50, 0x61, 0x72, 0x74, 0x73, 0x12, - 0x31, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x43, 0x6f, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, - 0x47, 0x52, 0x50, 0x43, 0x47, 0x65, 0x74, 0x45, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x72, 0x61, 0x6c, - 0x53, 0x65, 0x63, 0x72, 0x65, 0x74, 0x50, 0x61, 0x72, 0x74, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, - 0x73, 0x74, 0x1a, 0x32, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x43, 0x6f, 0x73, 0x69, 0x67, - 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, 0x43, 0x47, 0x65, 0x74, 0x45, 0x70, 0x68, 0x65, 0x6d, 0x65, - 0x72, 0x61, 0x6c, 0x53, 0x65, 0x63, 0x72, 0x65, 0x74, 0x50, 0x61, 0x72, 0x74, 0x73, 0x52, 0x65, - 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x73, 0x0a, 0x12, 0x54, 0x72, 0x61, 0x6e, - 0x73, 0x66, 0x65, 0x72, 0x4c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x73, 0x68, 0x69, 0x70, 0x12, 0x2c, - 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x43, 0x6f, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x47, - 0x52, 0x50, 0x43, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x66, 0x65, 0x72, 0x4c, 0x65, 0x61, 0x64, 0x65, - 0x72, 0x73, 0x68, 0x69, 0x70, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2d, 0x2e, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x43, 0x6f, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, - 0x43, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x66, 0x65, 0x72, 0x4c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x73, - 0x68, 0x69, 0x70, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x42, 0x36, 0x5a, - 0x34, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x73, 0x74, 0x72, 0x61, - 0x6e, 0x67, 0x65, 0x6c, 0x6f, 0x76, 0x65, 0x2d, 0x76, 0x65, 0x6e, 0x74, 0x75, 0x72, 0x65, 0x73, - 0x2f, 0x68, 0x6f, 0x72, 0x63, 0x72, 0x75, 0x78, 0x2f, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x2f, - 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x0d, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x41, 0x64, 0x64, 0x72, 0x65, 0x73, 0x73, 0x22, 0x1e, + 0x0a, 0x1c, 0x43, 0x6f, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, 0x43, 0x47, 0x65, + 0x74, 0x4c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x22, 0x37, + 0x0a, 0x1d, 0x43, 0x6f, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, 0x43, 0x47, 0x65, + 0x74, 0x4c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, + 0x16, 0x0a, 0x06, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, + 0x06, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x32, 0xd6, 0x04, 0x0a, 0x0c, 0x43, 0x6f, 0x73, 0x69, + 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, 0x43, 0x12, 0x58, 0x0a, 0x09, 0x53, 0x69, 0x67, 0x6e, + 0x42, 0x6c, 0x6f, 0x63, 0x6b, 0x12, 0x23, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x43, 0x6f, + 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, 0x43, 0x53, 0x69, 0x67, 0x6e, 0x42, 0x6c, + 0x6f, 0x63, 0x6b, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x24, 0x2e, 0x70, 0x72, 0x6f, + 0x74, 0x6f, 0x2e, 0x43, 0x6f, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, 0x43, 0x53, + 0x69, 0x67, 0x6e, 0x42, 0x6c, 0x6f, 0x63, 0x6b, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, + 0x22, 0x00, 0x12, 0x97, 0x01, 0x0a, 0x1e, 0x53, 0x65, 0x74, 0x45, 0x70, 0x68, 0x65, 0x6d, 0x65, + 0x72, 0x61, 0x6c, 0x53, 0x65, 0x63, 0x72, 0x65, 0x74, 
0x50, 0x61, 0x72, 0x74, 0x73, 0x41, 0x6e, + 0x64, 0x53, 0x69, 0x67, 0x6e, 0x12, 0x38, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x43, 0x6f, + 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, 0x43, 0x53, 0x65, 0x74, 0x45, 0x70, 0x68, + 0x65, 0x6d, 0x65, 0x72, 0x61, 0x6c, 0x53, 0x65, 0x63, 0x72, 0x65, 0x74, 0x50, 0x61, 0x72, 0x74, + 0x73, 0x41, 0x6e, 0x64, 0x53, 0x69, 0x67, 0x6e, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, + 0x39, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x43, 0x6f, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, + 0x47, 0x52, 0x50, 0x43, 0x53, 0x65, 0x74, 0x45, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x72, 0x61, 0x6c, + 0x53, 0x65, 0x63, 0x72, 0x65, 0x74, 0x50, 0x61, 0x72, 0x74, 0x73, 0x41, 0x6e, 0x64, 0x53, 0x69, + 0x67, 0x6e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x82, 0x01, 0x0a, + 0x17, 0x47, 0x65, 0x74, 0x45, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x72, 0x61, 0x6c, 0x53, 0x65, 0x63, + 0x72, 0x65, 0x74, 0x50, 0x61, 0x72, 0x74, 0x73, 0x12, 0x31, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, + 0x2e, 0x43, 0x6f, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, 0x43, 0x47, 0x65, 0x74, + 0x45, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x72, 0x61, 0x6c, 0x53, 0x65, 0x63, 0x72, 0x65, 0x74, 0x50, + 0x61, 0x72, 0x74, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x32, 0x2e, 0x70, 0x72, + 0x6f, 0x74, 0x6f, 0x2e, 0x43, 0x6f, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, 0x43, + 0x47, 0x65, 0x74, 0x45, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x72, 0x61, 0x6c, 0x53, 0x65, 0x63, 0x72, + 0x65, 0x74, 0x50, 0x61, 0x72, 0x74, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, + 0x00, 0x12, 0x73, 0x0a, 0x12, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x66, 0x65, 0x72, 0x4c, 0x65, 0x61, + 0x64, 0x65, 0x72, 0x73, 0x68, 0x69, 0x70, 0x12, 0x2c, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, + 0x43, 0x6f, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, 0x43, 0x54, 0x72, 0x61, 0x6e, + 0x73, 0x66, 0x65, 0x72, 0x4c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x73, 0x68, 0x69, 0x70, 0x52, 0x65, + 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2d, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x43, 0x6f, + 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, 0x43, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x66, + 0x65, 0x72, 0x4c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x73, 0x68, 0x69, 0x70, 0x52, 0x65, 0x73, 0x70, + 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x58, 0x0a, 0x09, 0x47, 0x65, 0x74, 0x4c, 0x65, 0x61, + 0x64, 0x65, 0x72, 0x12, 0x23, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x43, 0x6f, 0x73, 0x69, + 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, 0x43, 0x47, 0x65, 0x74, 0x4c, 0x65, 0x61, 0x64, 0x65, + 0x72, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x24, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, + 0x2e, 0x43, 0x6f, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x72, 0x47, 0x52, 0x50, 0x43, 0x47, 0x65, 0x74, + 0x4c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, + 0x42, 0x36, 0x5a, 0x34, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x73, + 0x74, 0x72, 0x61, 0x6e, 0x67, 0x65, 0x6c, 0x6f, 0x76, 0x65, 0x2d, 0x76, 0x65, 0x6e, 0x74, 0x75, + 0x72, 0x65, 0x73, 0x2f, 0x68, 0x6f, 0x72, 0x63, 0x72, 0x75, 0x78, 0x2f, 0x73, 0x69, 0x67, 0x6e, + 0x65, 0x72, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( @@ -814,7 +910,7 @@ func file_signer_proto_cosigner_grpc_server_proto_rawDescGZIP() []byte { return file_signer_proto_cosigner_grpc_server_proto_rawDescData } -var file_signer_proto_cosigner_grpc_server_proto_msgTypes = make([]protoimpl.MessageInfo, 11) +var 
file_signer_proto_cosigner_grpc_server_proto_msgTypes = make([]protoimpl.MessageInfo, 13) var file_signer_proto_cosigner_grpc_server_proto_goTypes = []interface{}{ (*Block)(nil), // 0: proto.Block (*CosignerGRPCSignBlockRequest)(nil), // 1: proto.CosignerGRPCSignBlockRequest @@ -827,6 +923,8 @@ var file_signer_proto_cosigner_grpc_server_proto_goTypes = []interface{}{ (*CosignerGRPCGetEphemeralSecretPartsResponse)(nil), // 8: proto.CosignerGRPCGetEphemeralSecretPartsResponse (*CosignerGRPCTransferLeadershipRequest)(nil), // 9: proto.CosignerGRPCTransferLeadershipRequest (*CosignerGRPCTransferLeadershipResponse)(nil), // 10: proto.CosignerGRPCTransferLeadershipResponse + (*CosignerGRPCGetLeaderRequest)(nil), // 11: proto.CosignerGRPCGetLeaderRequest + (*CosignerGRPCGetLeaderResponse)(nil), // 12: proto.CosignerGRPCGetLeaderResponse } var file_signer_proto_cosigner_grpc_server_proto_depIdxs = []int32{ 0, // 0: proto.CosignerGRPCSignBlockRequest.block:type_name -> proto.Block @@ -838,12 +936,14 @@ var file_signer_proto_cosigner_grpc_server_proto_depIdxs = []int32{ 5, // 6: proto.CosignerGRPC.SetEphemeralSecretPartsAndSign:input_type -> proto.CosignerGRPCSetEphemeralSecretPartsAndSignRequest 7, // 7: proto.CosignerGRPC.GetEphemeralSecretParts:input_type -> proto.CosignerGRPCGetEphemeralSecretPartsRequest 9, // 8: proto.CosignerGRPC.TransferLeadership:input_type -> proto.CosignerGRPCTransferLeadershipRequest - 2, // 9: proto.CosignerGRPC.SignBlock:output_type -> proto.CosignerGRPCSignBlockResponse - 6, // 10: proto.CosignerGRPC.SetEphemeralSecretPartsAndSign:output_type -> proto.CosignerGRPCSetEphemeralSecretPartsAndSignResponse - 8, // 11: proto.CosignerGRPC.GetEphemeralSecretParts:output_type -> proto.CosignerGRPCGetEphemeralSecretPartsResponse - 10, // 12: proto.CosignerGRPC.TransferLeadership:output_type -> proto.CosignerGRPCTransferLeadershipResponse - 9, // [9:13] is the sub-list for method output_type - 5, // [5:9] is the sub-list for method input_type + 11, // 9: proto.CosignerGRPC.GetLeader:input_type -> proto.CosignerGRPCGetLeaderRequest + 2, // 10: proto.CosignerGRPC.SignBlock:output_type -> proto.CosignerGRPCSignBlockResponse + 6, // 11: proto.CosignerGRPC.SetEphemeralSecretPartsAndSign:output_type -> proto.CosignerGRPCSetEphemeralSecretPartsAndSignResponse + 8, // 12: proto.CosignerGRPC.GetEphemeralSecretParts:output_type -> proto.CosignerGRPCGetEphemeralSecretPartsResponse + 10, // 13: proto.CosignerGRPC.TransferLeadership:output_type -> proto.CosignerGRPCTransferLeadershipResponse + 12, // 14: proto.CosignerGRPC.GetLeader:output_type -> proto.CosignerGRPCGetLeaderResponse + 10, // [10:15] is the sub-list for method output_type + 5, // [5:10] is the sub-list for method input_type 5, // [5:5] is the sub-list for extension type_name 5, // [5:5] is the sub-list for extension extendee 0, // [0:5] is the sub-list for field type_name @@ -987,6 +1087,30 @@ func file_signer_proto_cosigner_grpc_server_proto_init() { return nil } } + file_signer_proto_cosigner_grpc_server_proto_msgTypes[11].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CosignerGRPCGetLeaderRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_signer_proto_cosigner_grpc_server_proto_msgTypes[12].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CosignerGRPCGetLeaderResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + 
return nil + } + } } type x struct{} out := protoimpl.TypeBuilder{ @@ -994,7 +1118,7 @@ func file_signer_proto_cosigner_grpc_server_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_signer_proto_cosigner_grpc_server_proto_rawDesc, NumEnums: 0, - NumMessages: 11, + NumMessages: 13, NumExtensions: 0, NumServices: 1, }, diff --git a/signer/proto/cosigner_grpc_server.proto b/signer/proto/cosigner_grpc_server.proto index d7c39df2..c2fe1af3 100644 --- a/signer/proto/cosigner_grpc_server.proto +++ b/signer/proto/cosigner_grpc_server.proto @@ -9,6 +9,7 @@ service CosignerGRPC { rpc SetEphemeralSecretPartsAndSign (CosignerGRPCSetEphemeralSecretPartsAndSignRequest) returns (CosignerGRPCSetEphemeralSecretPartsAndSignResponse) {} rpc GetEphemeralSecretParts (CosignerGRPCGetEphemeralSecretPartsRequest) returns (CosignerGRPCGetEphemeralSecretPartsResponse) {} rpc TransferLeadership (CosignerGRPCTransferLeadershipRequest) returns (CosignerGRPCTransferLeadershipResponse) {} + rpc GetLeader (CosignerGRPCGetLeaderRequest) returns (CosignerGRPCGetLeaderResponse) {} } message Block { @@ -70,4 +71,10 @@ message CosignerGRPCTransferLeadershipRequest { message CosignerGRPCTransferLeadershipResponse { string leaderID = 1; string leaderAddress = 2; -} \ No newline at end of file +} + +message CosignerGRPCGetLeaderRequest {} + +message CosignerGRPCGetLeaderResponse { + string leader = 1; +} diff --git a/signer/proto/cosigner_grpc_server_grpc.pb.go b/signer/proto/cosigner_grpc_server_grpc.pb.go index b79d09b0..e8889725 100644 --- a/signer/proto/cosigner_grpc_server_grpc.pb.go +++ b/signer/proto/cosigner_grpc_server_grpc.pb.go @@ -1,7 +1,7 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: -// - protoc-gen-go-grpc v1.1.0 -// - protoc v3.18.1 +// - protoc-gen-go-grpc v1.2.0 +// - protoc v3.21.6 // source: signer/proto/cosigner_grpc_server.proto package proto @@ -26,6 +26,7 @@ type CosignerGRPCClient interface { SetEphemeralSecretPartsAndSign(ctx context.Context, in *CosignerGRPCSetEphemeralSecretPartsAndSignRequest, opts ...grpc.CallOption) (*CosignerGRPCSetEphemeralSecretPartsAndSignResponse, error) GetEphemeralSecretParts(ctx context.Context, in *CosignerGRPCGetEphemeralSecretPartsRequest, opts ...grpc.CallOption) (*CosignerGRPCGetEphemeralSecretPartsResponse, error) TransferLeadership(ctx context.Context, in *CosignerGRPCTransferLeadershipRequest, opts ...grpc.CallOption) (*CosignerGRPCTransferLeadershipResponse, error) + GetLeader(ctx context.Context, in *CosignerGRPCGetLeaderRequest, opts ...grpc.CallOption) (*CosignerGRPCGetLeaderResponse, error) } type cosignerGRPCClient struct { @@ -72,6 +73,15 @@ func (c *cosignerGRPCClient) TransferLeadership(ctx context.Context, in *Cosigne return out, nil } +func (c *cosignerGRPCClient) GetLeader(ctx context.Context, in *CosignerGRPCGetLeaderRequest, opts ...grpc.CallOption) (*CosignerGRPCGetLeaderResponse, error) { + out := new(CosignerGRPCGetLeaderResponse) + err := c.cc.Invoke(ctx, "/proto.CosignerGRPC/GetLeader", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + // CosignerGRPCServer is the server API for CosignerGRPC service. 
// All implementations must embed UnimplementedCosignerGRPCServer // for forward compatibility @@ -80,6 +90,7 @@ type CosignerGRPCServer interface { SetEphemeralSecretPartsAndSign(context.Context, *CosignerGRPCSetEphemeralSecretPartsAndSignRequest) (*CosignerGRPCSetEphemeralSecretPartsAndSignResponse, error) GetEphemeralSecretParts(context.Context, *CosignerGRPCGetEphemeralSecretPartsRequest) (*CosignerGRPCGetEphemeralSecretPartsResponse, error) TransferLeadership(context.Context, *CosignerGRPCTransferLeadershipRequest) (*CosignerGRPCTransferLeadershipResponse, error) + GetLeader(context.Context, *CosignerGRPCGetLeaderRequest) (*CosignerGRPCGetLeaderResponse, error) mustEmbedUnimplementedCosignerGRPCServer() } @@ -99,6 +110,9 @@ func (UnimplementedCosignerGRPCServer) GetEphemeralSecretParts(context.Context, func (UnimplementedCosignerGRPCServer) TransferLeadership(context.Context, *CosignerGRPCTransferLeadershipRequest) (*CosignerGRPCTransferLeadershipResponse, error) { return nil, status.Errorf(codes.Unimplemented, "method TransferLeadership not implemented") } +func (UnimplementedCosignerGRPCServer) GetLeader(context.Context, *CosignerGRPCGetLeaderRequest) (*CosignerGRPCGetLeaderResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetLeader not implemented") +} func (UnimplementedCosignerGRPCServer) mustEmbedUnimplementedCosignerGRPCServer() {} // UnsafeCosignerGRPCServer may be embedded to opt out of forward compatibility for this service. @@ -184,6 +198,24 @@ func _CosignerGRPC_TransferLeadership_Handler(srv interface{}, ctx context.Conte return interceptor(ctx, in, info, handler) } +func _CosignerGRPC_GetLeader_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(CosignerGRPCGetLeaderRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(CosignerGRPCServer).GetLeader(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/proto.CosignerGRPC/GetLeader", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(CosignerGRPCServer).GetLeader(ctx, req.(*CosignerGRPCGetLeaderRequest)) + } + return interceptor(ctx, in, info, handler) +} + // CosignerGRPC_ServiceDesc is the grpc.ServiceDesc for CosignerGRPC service. 
// It's only intended for direct use with grpc.RegisterService, // and not to be introspected or modified (even as a copy) @@ -207,6 +239,10 @@ var CosignerGRPC_ServiceDesc = grpc.ServiceDesc{ MethodName: "TransferLeadership", Handler: _CosignerGRPC_TransferLeadership_Handler, }, + { + MethodName: "GetLeader", + Handler: _CosignerGRPC_GetLeader_Handler, + }, }, Streams: []grpc.StreamDesc{}, Metadata: "signer/proto/cosigner_grpc_server.proto", diff --git a/signer/raft_events.go b/signer/raft_events.go index a4e9aa75..f438a5fb 100644 --- a/signer/raft_events.go +++ b/signer/raft_events.go @@ -46,6 +46,7 @@ func (s *RaftStore) getLeaderGRPCClient() (proto.CosignerGRPCClient, *grpc.Clien time.Sleep(100 * time.Millisecond) } if leader == "" { + totalRaftLeaderElectiontimeout.Inc() return nil, nil, errors.New("timed out waiting for leader election to complete") } conn, err := grpc.Dial(leader, grpc.WithTransportCredentials(insecure.NewCredentials())) diff --git a/signer/raft_store.go b/signer/raft_store.go index 727cb28c..d71e559d 100644 --- a/signer/raft_store.go +++ b/signer/raft_store.go @@ -22,7 +22,7 @@ import ( gRPCTransport "github.com/Jille/raft-grpc-transport" "github.com/Jille/raftadmin" "github.com/hashicorp/raft" - boltdb "github.com/hashicorp/raft-boltdb" + boltdb "github.com/hashicorp/raft-boltdb/v2" proto "github.com/strangelove-ventures/horcrux/signer/proto" "github.com/tendermint/tendermint/libs/log" "github.com/tendermint/tendermint/libs/service" @@ -90,6 +90,7 @@ func (s *RaftStore) init() error { if err != nil { return fmt.Errorf("failed to parse local address: %s, %v", host, err) } + s.logger.Info("Local Raft Listening", "port", port) sock, err := net.Listen("tcp", fmt.Sprintf(":%s", port)) if err != nil { return err diff --git a/signer/remote_cosigner.go b/signer/remote_cosigner.go index e011ee4a..148eb8c7 100644 --- a/signer/remote_cosigner.go +++ b/signer/remote_cosigner.go @@ -10,7 +10,7 @@ import ( "google.golang.org/grpc/credentials/insecure" ) -// RemoteCosigner uses tendermint rpc to request signing from a remote cosigner +// RemoteCosigner uses CosignerGRPC to request signing from a remote cosigner type RemoteCosigner struct { id int address string diff --git a/signer/remote_signer.go b/signer/remote_signer.go index 46555abe..edc99455 100644 --- a/signer/remote_signer.go +++ b/signer/remote_signer.go @@ -77,13 +77,16 @@ func (rs *ReconnRemoteSigner) loop() { proto, address := tmNet.ProtocolAndAddress(rs.address) netConn, err := rs.dialer.Dial(proto, address) if err != nil { + sentryConnectTries.Add(float64(1)) + totalSentryConnectTries.Inc() rs.Logger.Error("Dialing", "err", err) rs.Logger.Info("Retrying", "sleep (s)", 3, "address", rs.address) time.Sleep(time.Second * 3) continue } + sentryConnectTries.Set(0) - rs.Logger.Info("Connected", "address", rs.address) + rs.Logger.Info("Connected to Sentry", "address", rs.address) conn, err = tmP2pConn.MakeSecretConnection(netConn, rs.privKey) if err != nil { conn = nil @@ -147,14 +150,58 @@ func (rs *ReconnRemoteSigner) handleSignVoteRequest(vote *tmProto.Vote) tmProtoP switch typedErr := err.(type) { case *BeyondBlockError: rs.Logger.Debug("Rejecting sign vote request", "reason", typedErr.msg) + beyondBlockErrors.Inc() default: rs.Logger.Error("Failed to sign vote", "address", rs.address, "error", err, "vote_type", vote.Type, "height", vote.Height, "round", vote.Round, "validator", fmt.Sprintf("%X", vote.ValidatorAddress)) + failedSignVote.Inc() } msgSum.SignedVoteResponse.Error = getRemoteSignerError(err) return 
tmProtoPrivval.Message{Sum: msgSum}
	}

-	rs.Logger.Info("Signed vote", "node", rs.address, "height", vote.Height, "round", vote.Round, "type", vote.Type)
+	// Log a short signature prefix and the timestamp so it is visible that each node was provided the same signature and timestamp
+	sigLen := 6
+	if len(vote.Signature) < sigLen {
+		sigLen = len(vote.Signature)
+	}
+	rs.Logger.Info("Signed vote", "height", vote.Height, "round", vote.Round, "type", vote.Type,
+		"sig", vote.Signature[:sigLen], "ts", vote.Timestamp.Unix(), "node", rs.address)
+
+	if vote.Type == tmProto.PrecommitType {
+		stepSize := vote.Height - previousPrecommitHeight
+		if previousPrecommitHeight != 0 && stepSize > 1 {
+			missedPrecommits.Add(float64(stepSize))
+			totalMissedPrecommits.Add(float64(stepSize))
+		} else {
+			missedPrecommits.Set(0)
+		}
+		previousPrecommitHeight = vote.Height // remember last PrecommitHeight
+
+		metricsTimeKeeper.SetPreviousPrecommit(time.Now())
+
+		lastPrecommitHeight.Set(float64(vote.Height))
+		lastPrecommitRound.Set(float64(vote.Round))
+		totalPrecommitsSigned.Inc()
+	}
+	if vote.Type == tmProto.PrevoteType {
+		// Determine the number of heights since the last Prevote
+		stepSize := vote.Height - previousPrevoteHeight
+		if previousPrevoteHeight != 0 && stepSize > 1 {
+			missedPrevotes.Add(float64(stepSize))
+			totalMissedPrevotes.Add(float64(stepSize))
+		} else {
+			missedPrevotes.Set(0)
+		}
+
+		previousPrevoteHeight = vote.Height // remember last PrevoteHeight
+
+		metricsTimeKeeper.SetPreviousPrevote(time.Now())
+
+		lastPrevoteHeight.Set(float64(vote.Height))
+		lastPrevoteRound.Set(float64(vote.Round))
+		totalPrevotesSigned.Inc()
+	}
+
	msgSum.SignedVoteResponse.Vote = *vote
	return tmProtoPrivval.Message{Sum: msgSum}
}
@@ -169,6 +216,7 @@
		case *BeyondBlockError:
			rs.Logger.Debug("Rejecting proposal sign request", "reason", typedErr.msg)
+			beyondBlockErrors.Inc()
		default:
			rs.Logger.Error("Failed to sign proposal", "address", rs.address, "error", err, "proposal", proposal)
		}
@@ -177,11 +225,15 @@
	}

	rs.Logger.Info("Signed proposal", "node", rs.address, "height", proposal.Height, "round", proposal.Round, "type", proposal.Type)
+	lastProposalHeight.Set(float64(proposal.Height))
+	lastProposalRound.Set(float64(proposal.Round))
+	totalProposalsSigned.Inc()
	msgSum.SignedProposalResponse.Proposal = *proposal
	return tmProtoPrivval.Message{Sum: msgSum}
}

func (rs *ReconnRemoteSigner) handlePubKeyRequest() tmProtoPrivval.Message {
+	totalPubKeyRequests.Inc()
	msgSum := &tmProtoPrivval.Message_PubKeyResponse{PubKeyResponse: &tmProtoPrivval.PubKeyResponse{
		PubKey: tmProtoCrypto.PublicKey{},
		Error:  nil,
@@ -219,8 +271,12 @@ func getRemoteSignerError(err error) *tmProtoPrivval.RemoteSignerError {
func StartRemoteSigners(services []tmService.Service, logger tmLog.Logger, chainID string,
	privVal tm.PrivValidator, nodes []NodeConfig) ([]tmService.Service, error) {
	var err error
+	go StartMetrics()
	for _, node := range nodes {
-		dialer := net.Dialer{Timeout: 30 * time.Second}
+		// Tendermint requires a connection within 3 seconds of start or it crashes
+		// A long timeout such as 30 seconds would cause the sentry to fail in loops
+		// Use a short timeout and dial often to connect within the 3-second window
+		dialer := net.Dialer{Timeout: 2 * time.Second}
		s := NewReconnRemoteSigner(node.Address, logger, chainID, privVal, dialer)
		err = s.Start()
diff --git a/signer/services_test.go 
b/signer/services_test.go index 0695856e..02115fb4 100644 --- a/signer/services_test.go +++ b/signer/services_test.go @@ -113,9 +113,12 @@ func TestConcurrentStart(t *testing.T) { wg.Add(concurrentAttempts) doneCount := 0 panicCount := 0 + var countMu sync.Mutex recoverFromPanic := func() { _ = recover() + countMu.Lock() + defer countMu.Unlock() panicCount++ if panicCount == concurrentAttempts-1 { for doneCount < concurrentAttempts { diff --git a/signer/threshold_signer.go b/signer/threshold_signer.go index 45029a52..b83ab6b4 100644 --- a/signer/threshold_signer.go +++ b/signer/threshold_signer.go @@ -15,13 +15,13 @@ type ThresholdSigner interface { DealShares(req CosignerGetEphemeralSecretPartRequest) (HrsMetadata, error) - GetEphemeralSecretPart(req CosignerGetEphemeralSecretPartRequest, m *LastSignStateStruct, + GetEphemeralSecretPart(req CosignerGetEphemeralSecretPartRequest, m *LastSignStateWrapper, peers map[int]CosignerPeer) (CosignerEphemeralSecretPart, error) - SetEphemeralSecretPart(req CosignerSetEphemeralSecretPartRequest, m *LastSignStateStruct, + SetEphemeralSecretPart(req CosignerSetEphemeralSecretPartRequest, m *LastSignStateWrapper, peers map[int]CosignerPeer) error - Sign(req CosignerSignRequest, m *LastSignStateStruct) (CosignerSignResponse, error) + Sign(req CosignerSignRequest, m *LastSignStateWrapper) (CosignerSignResponse, error) GetID() (int, error) } diff --git a/signer/threshold_signer_soft.go b/signer/threshold_signer_soft.go index ee774e52..666ab11f 100644 --- a/signer/threshold_signer_soft.go +++ b/signer/threshold_signer_soft.go @@ -18,49 +18,49 @@ import ( // ThresholdSignerSoft implements the interface and signs the message for each local signer. // ThresholdSignerSoft is the implementation of a soft sign signer at the local level. type ThresholdSignerSoft struct { - PubKeyBytes []byte - Key CosignerKey - // Total signers - Total uint8 - Threshold uint8 + pubKeyBytes []byte + key CosignerKey + // total signers + total uint8 + threshold uint8 // Height, Round, Step, Timestamp --> metadata - HrsMeta map[HRSTKey]HrsMetadata + hrsMeta map[HRSTKey]HrsMetadata } // NewThresholdSignerSoft constructs a ThresholdSigner // that signs using the local key share file. func NewThresholdSignerSoft(key CosignerKey, threshold, total uint8) ThresholdSigner { softSigner := &ThresholdSignerSoft{ - Key: key, - HrsMeta: make(map[HRSTKey]HrsMetadata), - Total: total, - Threshold: threshold, + key: key, + hrsMeta: make(map[HRSTKey]HrsMetadata), + total: total, + threshold: threshold, } // cache the public key bytes for signing operations. // Ensures casting else it will naturally panic. 
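An aside on the comment above: the direct assertion softSigner.key.PubKey.(tmcryptoed25519.PubKey) panics at construction time if the configured key is not ed25519. For illustration only, a checked variant could look like the hypothetical helper below (pubKeyBytesOf is an assumption of this sketch, not part of the change):

package signer

import (
	"fmt"

	tmcrypto "github.com/tendermint/tendermint/crypto"
	tmcryptoed25519 "github.com/tendermint/tendermint/crypto/ed25519"
)

// pubKeyBytesOf is a hypothetical helper showing the comma-ok form of the
// assertion: it returns a descriptive error instead of panicking when the
// configured key is not ed25519.
func pubKeyBytesOf(pub tmcrypto.PubKey) ([]byte, error) {
	ed, ok := pub.(tmcryptoed25519.PubKey)
	if !ok {
		return nil, fmt.Errorf("expected ed25519 public key, got %T", pub)
	}
	out := make([]byte, len(ed))
	copy(out, ed[:]) // copy so the cached bytes cannot alias the original key
	return out, nil
}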
-	ed25519Key := softSigner.Key.PubKey.(tmcryptoed25519.PubKey)
-	softSigner.PubKeyBytes = make([]byte, len(ed25519Key))
-	copy(softSigner.PubKeyBytes, ed25519Key[:])
+	ed25519Key := softSigner.key.PubKey.(tmcryptoed25519.PubKey)
+	softSigner.pubKeyBytes = make([]byte, len(ed25519Key))
+	copy(softSigner.pubKeyBytes, ed25519Key[:])

	return softSigner
}

// Implements ThresholdSigner
func (softSigner *ThresholdSignerSoft) Type() string {
-	return "soft"
+	return SignerTypeSoftSign
}

// Implements ThresholdSigner
func (softSigner *ThresholdSignerSoft) GetID() (int, error) {
-	return softSigner.Key.ID, nil
+	return softSigner.key.ID, nil
}

// Implements ThresholdSigner
func (softSigner *ThresholdSignerSoft) Sign(
-	req CosignerSignRequest, m *LastSignStateStruct) (CosignerSignResponse, error) {
-	m.LastSignStateMutex.Lock()
-	defer m.LastSignStateMutex.Unlock()
+	req CosignerSignRequest, m *LastSignStateWrapper) (CosignerSignResponse, error) {
+	m.lastSignStateMutex.Lock()
+	defer m.lastSignStateMutex.Unlock()

	res := CosignerSignResponse{}
	lss := m.LastSignState
@@ -74,22 +74,21 @@ func (softSigner *ThresholdSignerSoft) Sign(
	if err != nil {
		return res, err
	}
-
+	// If the HRS is the same the sign bytes may still differ by timestamp
	// It is ok to re-sign a different timestamp if that is the only difference in the sign bytes
+	// Same HRS differing only by timestamp is safe to sign again
	if sameHRS {
		if bytes.Equal(req.SignBytes, lss.SignBytes) {
			res.EphemeralPublic = lss.EphemeralPublic
			res.Signature = lss.Signature
			return res, nil
		} else if err := lss.OnlyDifferByTimestamp(req.SignBytes); err != nil {
-			return res, err
+			return res, err // sign bytes differ by more than the timestamp; refuse to sign
		}
-
-		// same HRS, and only differ by timestamp - ok to sign again
	}

-	meta, ok := softSigner.HrsMeta[hrst]
+	meta, ok := softSigner.hrsMeta[hrst]
	if !ok {
		return res, errors.New("no metadata at HRS")
	}
@@ -109,8 +108,7 @@ func (softSigner *ThresholdSignerSoft) Sign(
	ephemeralShare := tsed25519.AddScalars(shareParts)
	ephemeralPublic := tsed25519.AddElements(publicKeys)
-// check bounds for ephemeral share to avoid passing out of bounds valids to SignWithShare
-
+	// check bounds for ephemeral share to avoid passing out of bounds values to SignWithShare
	if len(ephemeralShare) != 32 {
		return res, errors.New("ephemeral share is out of bounds")
	}
@@ -122,7 +120,7 @@ func (softSigner *ThresholdSignerSoft) Sign(
	}

	sig := tsed25519.SignWithShare(
-		req.SignBytes, softSigner.Key.ShareKey, ephemeralShare, softSigner.PubKeyBytes, ephemeralPublic)
+		req.SignBytes, softSigner.key.ShareKey, ephemeralShare, softSigner.pubKeyBytes, ephemeralPublic)

	m.LastSignState.EphemeralPublic = ephemeralPublic
	err = m.LastSignState.Save(SignStateConsensus{
		Height:    hrst.Height,
		Round:     hrst.Round,
		Step:      hrst.Step,
		Signature: sig,
		SignBytes: req.SignBytes,
-	}, nil, true) // TODO double check true here is correct for async?
- + }, nil, true) if err != nil { - if _, isSameHRSError := err.(*SameHRSError); !isSameHRSError { + var isSameHRSError *SameHRSError + if !errors.As(err, &isSameHRSError) { return res, err } } - - for existingKey := range softSigner.HrsMeta { + + for existingKey := range softSigner.hrsMeta { // delete any HRS lower than our signed level // we will not be providing parts for any lower HRS if existingKey.Less(hrst) { - delete(softSigner.HrsMeta, existingKey) + delete(softSigner.hrsMeta, existingKey) } } @@ -155,7 +153,6 @@ func (softSigner *ThresholdSignerSoft) Sign( // Implements ThresholdSigner func (softSigner *ThresholdSignerSoft) DealShares( req CosignerGetEphemeralSecretPartRequest) (HrsMetadata, error) { - hrsKey := HRSTKey{ Height: req.Height, Round: req.Round, @@ -163,8 +160,8 @@ func (softSigner *ThresholdSignerSoft) DealShares( Timestamp: req.Timestamp.UnixNano(), } - meta, ok := softSigner.HrsMeta[hrsKey] + meta, ok := softSigner.hrsMeta[hrsKey] if ok { return meta, nil } @@ -176,14 +173,15 @@ func (softSigner *ThresholdSignerSoft) DealShares( meta = HrsMetadata{ Secret: secret, - Peers: make([]PeerMetadata, softSigner.Total), + Peers: make([]PeerMetadata, softSigner.total), } // split this secret with shamirs // !! dealt shares need to be saved because dealing produces different shares each time! - meta.DealtShares = tsed25519.DealShares(meta.Secret, softSigner.Threshold, softSigner.Total) - softSigner.HrsMeta[hrsKey] = meta + meta.DealtShares = tsed25519.DealShares(meta.Secret, softSigner.threshold, softSigner.total) + + softSigner.hrsMeta[hrsKey] = meta return meta, nil } @@ -192,14 +190,15 @@ func (softSigner *ThresholdSignerSoft) DealShares( // The ephemeral secret part is encrypted for the receiver // Implements ThresholdSigner func (softSigner *ThresholdSignerSoft) GetEphemeralSecretPart( - req CosignerGetEphemeralSecretPartRequest, m *LastSignStateStruct, peers map[int]CosignerPeer) ( + req CosignerGetEphemeralSecretPartRequest, m *LastSignStateWrapper, peers map[int]CosignerPeer) ( + CosignerEphemeralSecretPart, error) { res := CosignerEphemeralSecretPart{} // protects the meta map - m.LastSignStateMutex.Lock() - defer m.LastSignStateMutex.Unlock() + m.lastSignStateMutex.Lock() + defer m.lastSignStateMutex.Unlock() hrst := HRSTKey{ Height: req.Height, @@ -208,7 +207,8 @@ func (softSigner *ThresholdSignerSoft) GetEphemeralSecretPart( Timestamp: req.Timestamp.UnixNano(), } - meta, ok := softSigner.HrsMeta[hrst] + meta, ok := softSigner.hrsMeta[hrst] + // generate metadata placeholder if !ok { newMeta, err := softSigner.DealShares(CosignerGetEphemeralSecretPartRequest{ @@ -223,14 +223,14 @@ func (softSigner *ThresholdSignerSoft) GetEphemeralSecretPart( } meta = newMeta - softSigner.HrsMeta[hrst] = meta + softSigner.hrsMeta[hrst] = meta } ourEphPublicKey := tsed25519.ScalarMultiplyBase(meta.Secret) // set our values - meta.Peers[softSigner.Key.ID-1].Share = meta.DealtShares[softSigner.Key.ID-1] - meta.Peers[softSigner.Key.ID-1].EphemeralSecretPublicKey = ourEphPublicKey + meta.Peers[softSigner.key.ID-1].Share = meta.DealtShares[softSigner.key.ID-1] + meta.Peers[softSigner.key.ID-1].EphemeralSecretPublicKey = ourEphPublicKey // grab the peer info for the ID being requested peer, ok := peers[req.ID] @@ -246,13 +246,13 @@ func (softSigner *ThresholdSignerSoft) GetEphemeralSecretPart( return res, err } - res.SourceID = softSigner.Key.ID + res.SourceID = softSigner.key.ID res.SourceEphemeralSecretPublicKey = ourEphPublicKey res.EncryptedSharePart = encrypted -// sign the 
response payload with our private key
-// cosigners can verify the signature to confirm sender validity
-
+	// sign the response payload with our private key
+	// cosigners can verify the signature to confirm sender validity
+
	jsonBytes, err := tmjson.Marshal(res)
	if err != nil {
@@ -260,7 +260,7 @@ func (softSigner *ThresholdSignerSoft) GetEphemeralSecretPart(
	}

	digest := sha256.Sum256(jsonBytes)
-	signature, err := rsa.SignPSS(rand.Reader, &softSigner.Key.RSAKey, crypto.SHA256, digest[:], nil)
+	signature, err := rsa.SignPSS(rand.Reader, &softSigner.key.RSAKey, crypto.SHA256, digest[:], nil)
	if err != nil {
		return res, err
	}
@@ -275,17 +275,20 @@
// Store an ephemeral secret share part provided by another cosigner (signer)
// Implements ThresholdSigner
func (softSigner *ThresholdSignerSoft) SetEphemeralSecretPart(
-	req CosignerSetEphemeralSecretPartRequest, m *LastSignStateStruct, peers map[int]CosignerPeer) error {
+	req CosignerSetEphemeralSecretPartRequest, m *LastSignStateWrapper, peers map[int]CosignerPeer) error {
	// Verify the source signature
	if req.SourceSig == nil {
		return errors.New("SourceSig field is required")
	}

-	digestMsg := CosignerEphemeralSecretPart{}
-	digestMsg.SourceID = req.SourceID
-	digestMsg.SourceEphemeralSecretPublicKey = req.SourceEphemeralSecretPublicKey
-	digestMsg.EncryptedSharePart = req.EncryptedSharePart
+	digestMsg := CosignerEphemeralSecretPart{
+		SourceID: req.SourceID,
+		// DestinationID: 0,
+		SourceEphemeralSecretPublicKey: req.SourceEphemeralSecretPublicKey,
+		EncryptedSharePart:             req.EncryptedSharePart,
+		// SourceSig: []byte{},
+	}

	digestBytes, err := tmjson.Marshal(digestMsg)
	if err != nil {
@@ -306,8 +309,8 @@
	}

	// protects the meta map
-	m.LastSignStateMutex.Lock()
-	defer m.LastSignStateMutex.Unlock()
+	m.lastSignStateMutex.Lock()
+	defer m.lastSignStateMutex.Unlock()

	hrst := HRSTKey{
		Height: req.Height,
@@ -316,8 +319,7 @@
		Timestamp: req.Timestamp.UnixNano(),
	}

-	meta, ok := softSigner.HrsMeta[hrst]
-	// generate metadata placeholder
+	meta, ok := softSigner.hrsMeta[hrst] // meta is a copy; map entries such as softSigner.hrsMeta[hrst] are not addressable
	if !ok {
		newMeta, err := softSigner.DealShares(CosignerGetEphemeralSecretPartRequest{
			Height: req.Height,
@@ -328,19 +330,19 @@
		if err != nil {
			return err
		}
-
		meta = newMeta
-		softSigner.HrsMeta[hrst] = meta
+		softSigner.hrsMeta[hrst] = meta // updates the metadata placeholder
	}

	// decrypt share
-	sharePart, err := rsa.DecryptOAEP(sha256.New(), rand.Reader, &softSigner.Key.RSAKey, req.EncryptedSharePart, nil)
+	sharePart, err := rsa.DecryptOAEP(sha256.New(), rand.Reader, &softSigner.key.RSAKey, req.EncryptedSharePart, nil)
	if err != nil {
		return err
	}

-	// set slot
+	// Share and EphemeralSecretPublicKey are slices, so these writes mutate the shared metadata held by softSigner
meta.Peers[req.SourceID-1].Share = sharePart
	meta.Peers[req.SourceID-1].EphemeralSecretPublicKey = req.SourceEphemeralSecretPublicKey
+
	return nil
}
diff --git a/signer/threshold_validator.go b/signer/threshold_validator.go
index bda91608..722a1a8c 100644
--- a/signer/threshold_validator.go
+++ b/signer/threshold_validator.go
@@ -176,11 +176,20 @@ func (pv *ThresholdValidator) waitForPeerEphemeralShares(
	encryptedEphemeralSharesThresholdMap *map[Cosigner][]CosignerEphemeralSecretPart,
	thresholdPeersMutex *sync.Mutex,
) {
+	peerStartTime := time.Now()
	ephemeralSecretParts, err := peer.GetEphemeralSecretParts(hrst)
	if err != nil {
+
+		// Significant missing shares may lead to signature failure
+		missedEphemeralShares.WithLabelValues(peer.GetAddress()).Add(float64(1))
+		totalMissedEphemeralShares.WithLabelValues(peer.GetAddress()).Inc()
		pv.logger.Error("Error getting secret parts", "peer", peer.GetID(), "err", err)
		return
	}
+	// Peer responded; reset its consecutive missed-share gauge
+	missedEphemeralShares.WithLabelValues(peer.GetAddress()).Set(0)
+	timedCosignerEphemeralShareLag.WithLabelValues(peer.GetAddress()).Observe(time.Since(peerStartTime).Seconds())
+
	// Check so that getEphemeralWaitGroup.Done is not called more than (threshold - 1) times which causes hardlock
	thresholdPeersMutex.Lock()
	if len(*encryptedEphemeralSharesThresholdMap) < pv.threshold-1 {
@@ -201,6 +210,7 @@ func (pv *ThresholdValidator) waitForPeerSetEphemeralSharesAndSign(
	ephemeralPublic *[]byte,
	wg *sync.WaitGroup,
) {
+	peerStartTime := time.Now()
	defer wg.Done()
	peerEphemeralSecretParts := make([]CosignerEphemeralSecretPart, 0, pv.threshold-1)
	for _, EncryptedSecrets := range *encryptedEphemeralSharesThresholdMap {
@@ -233,6 +243,7 @@
		return
	}
+	timedCosignerSignLag.WithLabelValues(peer.GetAddress()).Observe(time.Since(peerStartTime).Seconds())
	pv.logger.Debug(fmt.Sprintf("Received signature from %d", peerID))

	shareSignaturesMutex.Lock()
@@ -290,6 +301,8 @@ func (pv *ThresholdValidator) getExistingBlockSignature(block *Block) ([]byte, t
func (pv *ThresholdValidator) SignBlock(chainID string, block *Block) ([]byte, time.Time, error) {
	height, round, step, stamp, signBytes := block.Height, block.Round, block.Step, block.Timestamp, block.SignBytes
+	timeStartSignBlock := time.Now()
+
	// Only the leader can execute this function. Followers can handle the requests,
	// but they just need to proxy the request to the raft leader
	if pv.raftStore.raft == nil {
@@ -297,6 +310,7 @@
	}
	if pv.raftStore.raft.State() != raft.Leader {
		pv.logger.Debug("I am not the raft leader. Proxying request to the leader")
+		totalNotRaftLeader.Inc()
		signRes, err := pv.raftStore.LeaderSignBlock(CosignerSignBlockRequest{chainID, block})
		if err != nil {
			if _, ok := err.(*rpcTypes.RPCError); ok {
@@ -311,6 +325,7 @@
		return signRes.Signature, stamp, nil
	}
+	totalRaftLeader.Inc()
	pv.logger.Debug("I am the raft leader. 
Managing the sign process for this block") hrst := HRSTKey{ @@ -394,6 +409,7 @@ func (pv *ThresholdValidator) SignBlock(chainID string, block *Block) ([]byte, t encryptedEphemeralSharesThresholdMap[pv.cosigner] = ourEphemeralSecretParts.EncryptedSecrets thresholdPeersMutex.Unlock() + timedSignBlockThresholdLag.Observe(time.Since(timeStartSignBlock).Seconds()) pv.logger.Debug("Have threshold peers") setEphemeralAndSignWaitGroup := sync.WaitGroup{} @@ -421,6 +437,7 @@ func (pv *ThresholdValidator) SignBlock(chainID string, block *Block) ([]byte, t return nil, stamp, errors.New("timed out waiting for peers to sign") } + timedSignBlockCosignerLag.Observe(time.Since(timeStartSignBlock).Seconds()) pv.logger.Debug("Done waiting for cosigners, assembling signatures") // collect all valid responses into array of ids and signatures for the threshold lib @@ -438,6 +455,7 @@ func (pv *ThresholdValidator) SignBlock(chainID string, block *Block) ([]byte, t } if len(sigIds) < pv.threshold { + totalInsufficientCosigners.Inc() return nil, stamp, errors.New("not enough co-signers") } @@ -449,6 +467,7 @@ func (pv *ThresholdValidator) SignBlock(chainID string, block *Block) ([]byte, t // verify the combined signature before saving to watermark if !pv.pubkey.VerifySignature(signBytes, signature) { + totalInvalidSignature.Inc() return nil, stamp, errors.New("combined signature is not valid") } @@ -473,5 +492,8 @@ func (pv *ThresholdValidator) SignBlock(chainID string, block *Block) ([]byte, t pv.logger.Error("Error emitting LSS", err.Error()) } + timeSignBlock := time.Since(timeStartSignBlock).Seconds() + timedSignBlockLag.Observe(timeSignBlock) + return signature, stamp, nil } diff --git a/test/horcrux_test.go b/test/horcrux_test.go index b4b66a77..2671408d 100644 --- a/test/horcrux_test.go +++ b/test/horcrux_test.go @@ -27,17 +27,18 @@ func Test4Of7SignerTwoSentries(t *testing.T) { chain := getSimdChain() // setup a horcrux validator for us - ourValidator, err := NewHorcruxValidator(t, pool, home, chainID, 0, totalSentries, totalSigners, threshold, chain) + ourValidator, err := NewHorcruxValidator(t, pool, network, home, + chainID, 0, totalSentries, totalSigners, threshold, chain) require.NoError(t, err) // other vals are single node (non-horcrux) - otherValidatorNodes := GetValidators(1, totalValidators-1, 1, home, chainID, chain, pool, t) + otherValidatorNodes := GetValidators(1, totalValidators-1, 1, home, chainID, chain, pool, network, t) // start our validator's horcrux cluster - require.NoError(t, ourValidator.StartHorcruxCluster(ctx, network, sentriesPerSigner)) + require.NoError(t, ourValidator.StartHorcruxCluster(ctx, sentriesPerSigner)) // assemble and combine gentx to get genesis file, configure peering between sentries, then start the chain - require.NoError(t, Genesis(t, ctx, network, chain, otherValidatorNodes, []*TestNode{}, []*TestValidator{ourValidator})) + require.NoError(t, Genesis(t, ctx, chain, otherValidatorNodes, []*TestNode{}, []*TestValidator{ourValidator})) // Wait for all nodes to get to given block height require.NoError(t, GetAllNodes(otherValidatorNodes, ourValidator.Sentries).WaitForHeight(5)) @@ -61,17 +62,18 @@ func Test2Of3SignerTwoSentries(t *testing.T) { chain := getSimdChain() // setup a horcrux validator for us - ourValidator, err := NewHorcruxValidator(t, pool, home, chainID, 0, totalSentries, totalSigners, threshold, chain) + ourValidator, err := NewHorcruxValidator(t, pool, network, home, + chainID, 0, totalSentries, totalSigners, threshold, chain) require.NoError(t, 
err)

	// remaining validators are single-node non-horcrux
-	otherValidatorNodes := GetValidators(1, totalValidators-1, 1, home, chainID, chain, pool, t)
+	otherValidatorNodes := GetValidators(1, totalValidators-1, 1, home, chainID, chain, pool, network, t)

	// start our validator's horcrux cluster
-	require.NoError(t, ourValidator.StartHorcruxCluster(ctx, network, sentriesPerSigner))
+	require.NoError(t, ourValidator.StartHorcruxCluster(ctx, sentriesPerSigner))

	// assemble and combine gentx to get genesis file, configure peering between sentries, then start the chain
-	require.NoError(t, Genesis(t, ctx, network, chain, otherValidatorNodes, []*TestNode{}, []*TestValidator{ourValidator}))
+	require.NoError(t, Genesis(t, ctx, chain, otherValidatorNodes, []*TestNode{}, []*TestValidator{ourValidator}))

	// Wait for all nodes to get to given block height
	require.NoError(t, GetAllNodes(otherValidatorNodes, ourValidator.Sentries).WaitForHeight(5))
@@ -95,17 +97,18 @@ func Test2Of3SignerUniqueSentry(t *testing.T) {
	chain := getSimdChain()

	// setup a horcrux validator for us
-	ourValidator, err := NewHorcruxValidator(t, pool, home, chainID, 0, totalSentries, totalSigners, threshold, chain)
+	ourValidator, err := NewHorcruxValidator(t, pool, network, home,
+		chainID, 0, totalSentries, totalSigners, threshold, chain)
	require.NoError(t, err)

	// remaining validators are single-node non-horcrux
-	otherValidatorNodes := GetValidators(1, totalValidators-1, 1, home, chainID, chain, pool, t)
+	otherValidatorNodes := GetValidators(1, totalValidators-1, 1, home, chainID, chain, pool, network, t)

	// start our validator's horcrux cluster
-	require.NoError(t, ourValidator.StartHorcruxCluster(ctx, network, sentriesPerSigner))
+	require.NoError(t, ourValidator.StartHorcruxCluster(ctx, sentriesPerSigner))

	// assemble and combine gentx to get genesis file, configure peering between sentries, then start the chain
-	require.NoError(t, Genesis(t, ctx, network, chain, otherValidatorNodes, []*TestNode{}, []*TestValidator{ourValidator}))
+	require.NoError(t, Genesis(t, ctx, chain, otherValidatorNodes, []*TestNode{}, []*TestValidator{ourValidator}))

	// Wait for all nodes to get to given block height
	require.NoError(t, GetAllNodes(otherValidatorNodes, ourValidator.Sentries).WaitForHeight(5))
@@ -127,13 +130,13 @@ func TestSingleSignerTwoSentries(t *testing.T) {
	chain := getSimdChain()

	// get total sentries nodes for our validator
-	ourValidatorNodes := GetValidators(0, 1, totalSentries, home, chainID, chain, pool, t)
+	ourValidatorNodes := GetValidators(0, 1, totalSentries, home, chainID, chain, pool, network, t)

	// using the first node for account and consensus key to create gentx
	ourValidatorAccountNode := ourValidatorNodes[0]

	// other vals are single node (non-horcrux)
-	otherValidatorNodes := GetValidators(1, totalValidators-1, 1, home, chainID, chain, pool, t)
+	otherValidatorNodes := GetValidators(1, totalValidators-1, 1, home, chainID, chain, pool, network, t)

	// nodes that will be used for account and consensus key to create gentx
	validatorAccountNodes := GetAllNodes([]*TestNode{ourValidatorAccountNode}, otherValidatorNodes)
@@ -144,10 +147,10 @@ func TestSingleSignerTwoSentries(t *testing.T) {
	sentries := []*TestNode{ourValidatorNodes[1]}

	// initialize horcrux signer nodes for our validator
-	signers := MakeTestSigners(0, totalSigners, home, pool, t)
+	signers := MakeTestSigners(0, totalSigners, home, pool, network, t)

	// assemble and combine gentx to get genesis file, configure peering between sentries, then start the chain
-	require.NoError(t, Genesis(t, ctx, network, chain, validatorAccountNodes, sentries, []*TestValidator{}))
+	require.NoError(t, Genesis(t, ctx, chain, validatorAccountNodes, sentries, []*TestValidator{}))

	allNodes := GetAllNodes(validatorAccountNodes, sentries)
@@ -155,7 +158,7 @@ func TestSingleSignerTwoSentries(t *testing.T) {
	require.NoError(t, allNodes.WaitForHeight(5))

	// start remote signer
-	require.NoError(t, StartSingleSignerContainers(signers, ourValidatorAccountNode, ourValidatorNodes, network))
+	require.NoError(t, StartSingleSignerContainers(signers, ourValidatorAccountNode, ourValidatorNodes))

	// Stop the validator node and sentry node before upgrading to horcrux
	t.Logf("{%s} -> Stopping Node...", ourValidatorAccountNode.Name())
@@ -178,11 +181,8 @@ func TestSingleSignerTwoSentries(t *testing.T) {
	t.Logf("{%s} -> Restarting Node...", ourValidatorAccountNode.Name())
	t.Logf("{%s} -> Restarting Node...", sentries[0].Name())

-	require.NoError(t, ourValidatorAccountNode.CreateNodeContainer(network.ID))
-	require.NoError(t, sentries[0].CreateNodeContainer(network.ID))
-
-	require.NoError(t, ourValidatorAccountNode.StartContainer(ctx))
-	require.NoError(t, sentries[0].StartContainer(ctx))
+	require.NoError(t, ourValidatorAccountNode.Start(ctx, nil))
+	require.NoError(t, sentries[0].Start(ctx, nil))

	// wait for our validator and all sentries to be reachable
	hosts := ourValidatorAccountNode.GetHosts()
@@ -210,13 +210,13 @@ func TestUpgradeValidatorToHorcrux(t *testing.T) {
	chain := getSimdChain()

	// initially all vals are single node (non-horcrux)
-	validators := GetValidators(0, totalValidators, 1, home, chainID, chain, pool, t)
+	validators := GetValidators(0, totalValidators, 1, home, chainID, chain, pool, network, t)

	// for this test we will upgrade the first validator to horcrux
	ourValidatorNode := validators[0]

	// assemble and combine gentx to get genesis file, configure peering between sentries, then start the chain
-	require.NoError(t, Genesis(t, ctx, network, chain, validators, []*TestNode{}, []*TestValidator{}))
+	require.NoError(t, Genesis(t, ctx, chain, validators, []*TestNode{}, []*TestValidator{}))

	// Wait for all validators to get to given block height
	require.NoError(t, validators.WaitForHeight(5))
@@ -226,7 +226,7 @@ func TestUpgradeValidatorToHorcrux(t *testing.T) {
	require.NoError(t, err)

	// create horcrux validator with same consensus key
-	ourValidatorUpgradedToHorcrux, err := NewHorcruxValidatorWithPrivValKey(t, pool, home,
+	ourValidatorUpgradedToHorcrux, err := NewHorcruxValidatorWithPrivValKey(t, pool, network, home,
		chainID, 0, 0, totalSigners, threshold, getSimdChain(), ourValidatorPrivValKey)
	require.NoError(t, err)
@@ -247,11 +247,10 @@ func TestUpgradeValidatorToHorcrux(t *testing.T) {
	ourValidatorNode.GenNewPrivVal()

	// start our new validator
-	require.NoError(t, ourValidatorUpgradedToHorcrux.StartHorcruxCluster(ctx, network, sentriesPerSigner))
+	require.NoError(t, ourValidatorUpgradedToHorcrux.StartHorcruxCluster(ctx, sentriesPerSigner))

	t.Logf("{%s} -> Restarting Node...", ourValidatorNode.Name())
-	require.NoError(t, ourValidatorNode.CreateNodeContainer(network.ID))
-	require.NoError(t, ourValidatorNode.StartContainer(ctx))
+	require.NoError(t, ourValidatorNode.Start(ctx, nil))

	// wait for validator to be reachable
	require.NoError(t, ourValidatorNode.GetHosts().WaitForAllToStart(t, 10))
@@ -272,24 +271,22 @@ func TestDownedSigners2of3(t *testing.T) {
	chain := getSimdChain()

	// setup a horcrux validator for us
-	ourValidator, err := NewHorcruxValidator(t, pool, home, chainID, 0, totalSentries, totalSigners, threshold, chain)
+	ourValidator, err := NewHorcruxValidator(t, pool, network, home,
+		chainID, 0, totalSentries, totalSigners, threshold, chain)
	require.NoError(t, err)

	// remaining validators are single-node non-horcrux
-	otherValidatorNodes := GetValidators(1, totalValidators-1, 1, home, chainID, chain, pool, t)
+	otherValidatorNodes := GetValidators(1, totalValidators-1, 1, home, chainID, chain, pool, network, t)

	// start our validator's horcrux cluster
-	require.NoError(t, ourValidator.StartHorcruxCluster(ctx, network, sentriesPerSigner))
+	require.NoError(t, ourValidator.StartHorcruxCluster(ctx, sentriesPerSigner))

	// assemble and combine gentx to get genesis file, configure peering between sentries, then start the chain
-	require.NoError(t, Genesis(t, ctx, network, chain, otherValidatorNodes, []*TestNode{}, []*TestValidator{ourValidator}))
+	require.NoError(t, Genesis(t, ctx, chain, otherValidatorNodes, []*TestNode{}, []*TestValidator{ourValidator}))

	// Wait for all nodes to get to given block height
	require.NoError(t, GetAllNodes(otherValidatorNodes, ourValidator.Sentries).WaitForHeight(5))

-	t.Logf("{%s} -> Checking that slashing has not occurred...", ourValidator.Name())
-	require.NoError(t, ourValidator.EnsureNotSlashed())
-
	// Test taking down each node in the signer cluster for a period of time
	for _, signer := range ourValidator.Signers {
		t.Logf("{%s} -> Stopping signer...", signer.Name())
@@ -299,7 +296,7 @@ func TestDownedSigners2of3(t *testing.T) {
		require.NoError(t, ourValidator.WaitForConsecutiveBlocks(10))

		t.Logf("{%s} -> Restarting signer...", signer.Name())
-		require.NoError(t, signer.CreateCosignerContainer(network.ID))
+		require.NoError(t, signer.CreateCosignerContainer())
		require.NoError(t, signer.StartContainer())
		require.NoError(t, signer.GetHosts().WaitForAllToStart(t, 10)) // Wait to ensure signer is back up
		require.NoError(t, ourValidator.WaitForConsecutiveBlocks(10))
@@ -308,6 +305,56 @@ func TestDownedSigners2of3(t *testing.T) {
	require.NoError(t, ourValidator.EnsureNotSlashed())
 }

+func TestLeaderElection2of3(t *testing.T) {
+	t.Parallel()
+	ctx, home, pool, network := SetupTestRun(t)
+
+	const totalValidators = 4
+	const totalSigners = 3
+	const totalSentries = 2
+	const threshold = 2
+	const sentriesPerSigner = 3
+	chain := getSimdChain()
+
+	// setup a horcrux validator for us
+	ourValidator, err := NewHorcruxValidator(t, pool, network, home,
+		chainID, 0, totalSentries, totalSigners, threshold, chain)
+	require.NoError(t, err)
+
+	// remaining validators are single-node non-horcrux
+	otherValidatorNodes := GetValidators(1, totalValidators-1, 1, home, chainID, chain, pool, network, t)
+
+	// start our validator's horcrux cluster
+	require.NoError(t, ourValidator.StartHorcruxCluster(ctx, sentriesPerSigner))
+
+	// assemble and combine gentx to get genesis file, configure peering between sentries, then start the chain
+	require.NoError(t, Genesis(t, ctx, chain, otherValidatorNodes, []*TestNode{}, []*TestValidator{ourValidator}))
+
+	// Wait for all nodes to get to given block height
+	require.NoError(t, GetAllNodes(otherValidatorNodes, ourValidator.Sentries).WaitForHeight(5))
+
+	// Test electing each node in the signer cluster for a period of time
+	for _, signer := range ourValidator.Signers {
+		t.Logf("{%s} -> Electing leader...", signer.Name())
+		err := signer.TransferLeadership(ctx, signer.Index)
+		require.NoError(t, err, "failed to transfer leadership to %d", signer.Index)
+
+		t.Logf("{%s} -> Waiting for signed blocks with signer as leader {%s}", ourValidator.Name(), signer.Name())
+		require.NoError(t, ourValidator.WaitForConsecutiveBlocks(2))
+
+		// Make sure all cosigners have the same leader
+		for _, s := range ourValidator.Signers {
+			leader, err := s.GetLeader(ctx)
+			require.NoError(t, err, "failed to get leader from signer: %s", s.Name())
+			require.Equal(t, signer.Name()+":"+signerPort, leader)
+		}
+
+		require.NoError(t, ourValidator.WaitForConsecutiveBlocks(8))
+	}
+	t.Logf("{%s} -> Checking that slashing has not occurred...", ourValidator.Name())
+	require.NoError(t, ourValidator.EnsureNotSlashed())
+}
+
 func TestDownedSigners3of5(t *testing.T) {
	t.Parallel()
	ctx, home, pool, network := SetupTestRun(t)
@@ -320,24 +367,22 @@ func TestDownedSigners3of5(t *testing.T) {
	chain := getSimdChain()

	// setup a horcrux validator for us
-	ourValidator, err := NewHorcruxValidator(t, pool, home, chainID, 0, totalSentries, totalSigners, threshold, chain)
+	ourValidator, err := NewHorcruxValidator(t, pool, network, home,
+		chainID, 0, totalSentries, totalSigners, threshold, chain)
	require.NoError(t, err)

	// remaining validators are single-node non-horcrux
-	otherValidatorNodes := GetValidators(1, totalValidators-1, 1, home, chainID, chain, pool, t)
+	otherValidatorNodes := GetValidators(1, totalValidators-1, 1, home, chainID, chain, pool, network, t)

	// start our validator's horcrux cluster
-	require.NoError(t, ourValidator.StartHorcruxCluster(ctx, network, sentriesPerSigner))
+	require.NoError(t, ourValidator.StartHorcruxCluster(ctx, sentriesPerSigner))

	// assemble and combine gentx to get genesis file, configure peering between sentries, then start the chain
-	require.NoError(t, Genesis(t, ctx, network, chain, otherValidatorNodes, []*TestNode{}, []*TestValidator{ourValidator}))
+	require.NoError(t, Genesis(t, ctx, chain, otherValidatorNodes, []*TestNode{}, []*TestValidator{ourValidator}))

	// Wait for all nodes to get to given block height
	require.NoError(t, GetAllNodes(otherValidatorNodes, ourValidator.Sentries).WaitForHeight(5))

-	t.Logf("{%s} -> Checking that slashing has not occurred...", ourValidator.Name())
-	require.NoError(t, ourValidator.EnsureNotSlashed())
-
	// Test taking down 2 nodes at a time in the signer cluster for a period of time
	for i := 0; i < len(ourValidator.Signers); i++ {
		signer1 := ourValidator.Signers[i]
@@ -362,7 +407,7 @@ func TestDownedSigners3of5(t *testing.T) {
		require.NoError(t, ourValidator.WaitForConsecutiveBlocks(10))

		t.Logf("{%s} -> Restarting signer...", signer1.Name())
-		require.NoError(t, signer1.CreateCosignerContainer(network.ID))
+		require.NoError(t, signer1.CreateCosignerContainer())
		require.NoError(t, signer1.StartContainer())
		require.NoError(t, signer1.GetHosts().WaitForAllToStart(t, 10)) // Wait to ensure signer is back up
		require.NoError(t, ourValidator.WaitForConsecutiveBlocks(10))
@@ -396,20 +441,20 @@ func TestChainPureHorcrux(t *testing.T) {

	// start horcrux cluster for each validator
	for i := 0; i < totalValidators; i++ {
-		validator, err := NewHorcruxValidator(t, pool, home, chainID, i,
+		validator, err := NewHorcruxValidator(t, pool, network, home, chainID, i,
			sentriesPerValidator, signersPerValidator, threshold, chain)
		require.NoError(t, err)
		validators = append(validators, validator)
		allNodes = append(allNodes, validator.Sentries...)
		startValidatorsErrGroup.Go(func() error {
-			return validator.StartHorcruxCluster(ctx, network, sentriesPerSigner)
+			return validator.StartHorcruxCluster(ctx, sentriesPerSigner)
		})
	}

	require.NoError(t, startValidatorsErrGroup.Wait())

	// assemble and combine gentx to get genesis file, configure peering between sentries, then start the chain
-	require.NoError(t, Genesis(t, ctx, network, chain, []*TestNode{}, []*TestNode{}, validators))
+	require.NoError(t, Genesis(t, ctx, chain, []*TestNode{}, []*TestNode{}, validators))

	require.NoError(t, allNodes.WaitForHeight(5))
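The leader string asserted in TestLeaderElection2of3 is the elected cosigner's container name plus its signer port, e.g. val-0-sgn-1-TestLeaderElection2of3:2222 for cosigner 1 of validator 0. A standalone sketch of that convergence check (the helper name is illustrative, not part of this change):

func requireSameLeader(t *testing.T, ctx context.Context, signers TestSigners, elected *TestSigner) {
	// every cosigner must agree on "<container-name>:<signerPort>"
	expected := elected.Name() + ":" + signerPort
	for _, s := range signers {
		leader, err := s.GetLeader(ctx)
		require.NoError(t, err, "failed to get leader from signer: %s", s.Name())
		require.Equal(t, expected, leader)
	}
}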
diff --git a/test/test_node.go b/test/test_node.go
index 04898dd1..6d5cfa54 100644
--- a/test/test_node.go
+++ b/test/test_node.go
@@ -96,7 +96,8 @@ func getSentinelChain(ctx context.Context, version string) *ChainType {
			return err
		}
		command := []string{"sed", "-i", fmt.Sprintf(`s/"approve_by": ""/"approve_by": "%s"/g`, address), genesisJSON}
-		return handleNodeJobError(tn.NodeJob(ctx, command))
+		_, _, err = tn.Exec(ctx, command)
+		return err
	}

	return getHeighlinerChain("sentinel", version, "sentinelhub", "sent", true, sentinelGenesisJSONModification)
@@ -127,6 +128,7 @@ type TestNode struct {
	GenesisCoins string
	Validator    bool
	Pool         *dockertest.Pool
+	networkID    string
	Client       rpcclient.Client
	Container    *docker.Container
	tl           TestLogger
@@ -162,6 +164,7 @@ func MakeTestNodes(
	chainID string,
	chainType *ChainType,
	pool *dockertest.Pool,
+	networkID string,
	tl TestLogger,
 ) (out TestNodes) {
	err := pool.Client.PullImage(docker.PullImageOptions{
@@ -173,7 +176,7 @@ func MakeTestNodes(
	}
	for i := 0; i < count; i++ {
		tn := &TestNode{Home: home, Index: i, ValidatorIndex: validatorIndex, Chain: chainType, ChainID: chainID,
-			Pool: pool, tl: tl, ec: simapp.MakeTestEncodingConfig()}
+			Pool: pool, networkID: networkID, tl: tl, ec: simapp.MakeTestEncodingConfig()}
		tn.MkDir()
		out = append(out, tn)
	}
@@ -189,10 +192,11 @@ func GetValidators(
	chainID string,
	chain *ChainType,
	pool *dockertest.Pool,
+	networkID string,
	t *testing.T,
 ) (out TestNodes) {
	for i := startingValidatorIndex; i < startingValidatorIndex+count; i++ {
-		out = append(out, MakeTestNodes(i, sentriesPerValidator, home, chainID, chain, pool, t)...)
+		out = append(out, MakeTestNodes(i, sentriesPerValidator, home, chainID, chain, pool, networkID, t)...)
	}
	return
 }
@@ -383,7 +387,7 @@ func (tn *TestNode) GetMostRecentConsecutiveSignedBlocks(
	var status *ctypes.ResultStatus
	status, err = tn.Client.Status(context.Background())
	if err != nil {
-		return
+		return 0, 0, err
	}

	latestHeight = status.SyncInfo.LatestBlockHeight
@@ -392,16 +396,21 @@ func (tn *TestNode) GetMostRecentConsecutiveSignedBlocks(
		var block *ctypes.ResultBlock
		block, err = tn.Client.Block(context.Background(), &i)
		if err != nil {
-			return
+			return 0, 0, err
		}
+		found := false
		for _, voter := range block.Block.LastCommit.Signatures {
			if reflect.DeepEqual(voter.ValidatorAddress, address) {
				count++
+				found = true
				break
			}
		}
+		if !found {
+			return count, latestHeight, nil
+		}
	}
-	return
+	return count, latestHeight, nil
 }

 func (tn *TestNode) getMissingBlocks(address tmBytes.HexBytes) (int64, error) {
@@ -471,6 +480,10 @@ func (tn *TestNode) WaitForConsecutiveBlocks(blocks int64, address tmBytes.HexBy
		if err != nil {
			continue
		}
+		if recentSignedBlocksCount > 0 {
+			// we signed a block within window, so restart counter
+			i = -1
+		}
		deltaMissed := min(blocks, checkingBlock-1) - recentSignedBlocksCount
		deltaBlocks := checkingBlock - startingBlock
@@ -507,9 +520,9 @@ func stdconfigchanges(cfg *tmconfig.Config, peers string, enablePrivVal bool) {
	cfg.P2P.PersistentPeers = peers
 }

-// NodeJob run a container for a specific job and block until the container exits
+// Exec runs a container for a specific job and blocks until the container exits
 // NOTE: on job containers generate random name
-func (tn *TestNode) NodeJob(ctx context.Context, cmd []string) (string, int, string, string, error) {
+func (tn *TestNode) Exec(ctx context.Context, cmd []string) (string, string, error) {
	container := RandLowerCaseLetterString(10)
	tn.tl.Logf("{%s}[%s] -> '%s'", tn.Name(), container, strings.Join(cmd, " "))
	cont, err := tn.Pool.Client.CreateContainer(docker.CreateContainerOptions{
@@ -534,76 +547,136 @@
		Context: nil,
	})
	if err != nil {
-		return container, 1, "", "", err
+		return "", "", err
	}
	if err := tn.Pool.Client.StartContainer(cont.ID, nil); err != nil {
-		return container, 1, "", "", err
	}

	exitCode, err := tn.Pool.Client.WaitContainerWithContext(cont.ID, ctx)
-	stdout := new(bytes.Buffer)
-	stderr := new(bytes.Buffer)
+	outputStream := new(bytes.Buffer)
+	errorStream := new(bytes.Buffer)
	_ = tn.Pool.Client.Logs(docker.LogsOptions{
		Context:      ctx,
		Container:    cont.ID,
-		OutputStream: stdout,
-		ErrorStream:  stderr,
+		OutputStream: outputStream,
+		ErrorStream:  errorStream,
		Stdout:       true,
		Stderr:       true,
		Tail:         "100",
		Follow:       false,
		Timestamps:   false,
	})
+	stdout := outputStream.String()
+	stderr := errorStream.String()
	_ = tn.Pool.Client.RemoveContainer(docker.RemoveContainerOptions{ID: cont.ID})
-	return container, exitCode, stdout.String(), stderr.String(), err
+	return stdout, stderr, containerExitError(container, exitCode, stdout, stderr, err)
 }

 // InitHomeFolder initializes a home folder for the given node
 func (tn *TestNode) InitHomeFolder(ctx context.Context) error {
-	command := []string{tn.Chain.Bin, "init", tn.Name(),
+	cmd := []string{tn.Chain.Bin, "init", tn.Name(),
		"--chain-id", tn.ChainID,
		"--home", tn.NodeHome(),
	}
-	return handleNodeJobError(tn.NodeJob(ctx, command))
+	_, _, err := tn.Exec(ctx, cmd)
+	return err
 }

 // CreateKey creates a key in the keyring backend test for the given node
 func (tn *TestNode) CreateKey(ctx context.Context, name string) error {
-	command := []string{tn.Chain.Bin, "keys", "add", name,
+	cmd := []string{tn.Chain.Bin, "keys", "add", name,
		"--keyring-backend", "test",
		"--output", "json",
		"--home", tn.NodeHome(),
	}
-	return handleNodeJobError(tn.NodeJob(ctx, command))
+	_, _, err := tn.Exec(ctx, cmd)
+	return err
 }

 // AddGenesisAccount adds a genesis account for each key
 func (tn *TestNode) AddGenesisAccount(ctx context.Context, address string) error {
-	command := []string{tn.Chain.Bin, "add-genesis-account", address, "1000000000000stake",
+	cmd := []string{tn.Chain.Bin, "add-genesis-account", address, "1000000000000stake",
		"--home", tn.NodeHome(),
	}
-	return handleNodeJobError(tn.NodeJob(ctx, command))
+	_, _, err := tn.Exec(ctx, cmd)
+	return err
 }

 // Gentx generates the gentx for a given node
 func (tn *TestNode) Gentx(ctx context.Context, name, pubKey string) error {
-	command := []string{tn.Chain.Bin, "gentx", valKey, "100000000000stake",
+	cmd := []string{tn.Chain.Bin, "gentx", valKey, "100000000000stake",
		"--pubkey", pubKey,
		"--keyring-backend", "test",
		"--home", tn.NodeHome(),
		"--chain-id", tn.ChainID,
	}
-	return handleNodeJobError(tn.NodeJob(ctx, command))
+	_, _, err := tn.Exec(ctx, cmd)
+	return err
 }

 // CollectGentxs runs collect gentxs on the node's home folders
 func (tn *TestNode) CollectGentxs(ctx context.Context) error {
-	command := []string{tn.Chain.Bin, "collect-gentxs",
+	cmd := []string{tn.Chain.Bin, "collect-gentxs",
		"--home", tn.NodeHome(),
	}
-	return handleNodeJobError(tn.NodeJob(ctx, command))
+	_, _, err := tn.Exec(ctx, cmd)
+	return err
+}
+
+func (tn *TestNode) Start(ctx context.Context, preStart func()) error {
+	// Retry loop for running container.
+	err := retry.Do(func() error {
+		// forcefully remove existing container, ignoring error
+		_ = tn.StopAndRemoveContainer(true)
+		if err := tn.createContainer(); err != nil {
+			return err
+		}
+		if preStart != nil {
+			preStart()
+		}
+		if err := tn.startContainer(ctx); err != nil {
+			return err
+		}
+
+		for i := 0; i < 10; i++ {
+			container, err := tn.Pool.Client.InspectContainer(tn.Container.ID)
+			if err != nil {
+				return err
+			}
+			if !container.State.Running {
+				return fmt.Errorf("container is not running")
+			}
+
+			ctx, cancel := context.WithTimeout(ctx, 1*time.Second)
+			_, err = tn.Client.Status(ctx)
+			cancel()
+			if err == nil {
+				return nil
+			}
+			time.Sleep(1 * time.Second)
+		}
+
+		return fmt.Errorf("node is running but not responding with status")
+	}, retry.DelayType(retry.FixedDelay), retry.Attempts(5))
+	if err != nil {
+		return fmt.Errorf("error starting node container after max retries: %w", err)
+	}

+	// Retry loop until the node is in sync with the chain
+	return retry.Do(func() error {
+		stat, err := tn.Client.Status(ctx)
+		if err != nil {
+			return err
+		}
+		if stat != nil && stat.SyncInfo.CatchingUp {
+			return fmt.Errorf("still catching up: height(%d) catching-up(%t)",
+				stat.SyncInfo.LatestBlockHeight, stat.SyncInfo.CatchingUp)
+		}
+		return nil
+	}, retry.DelayType(retry.BackOffDelay))
+}

-func (tn *TestNode) CreateNodeContainer(networkID string) error {
+func (tn *TestNode) createContainer() error {
	cont, err := tn.Pool.Client.CreateContainer(docker.CreateContainerOptions{
		Name: tn.Name(),
		Config: &docker.Config{
@@ -622,7 +695,7 @@
		},
		NetworkingConfig: &docker.NetworkingConfig{
			EndpointsConfig: map[string]*docker.EndpointConfig{
-				networkID: {},
+				tn.networkID: {},
			},
		},
		Context: nil,
@@ -634,12 +707,14 @@
	return nil
 }

-func (tn *TestNode) StopContainer() error {
-	return tn.Pool.Client.StopContainer(tn.Container.ID, 60)
-}
-
+// StopAndRemoveContainer stops and removes a TestNode's docker container.
+// If force is true, the error from stopping the container is ignored and the
+// container is forcefully removed.
 func (tn *TestNode) StopAndRemoveContainer(force bool) error {
-	if err := tn.StopContainer(); err != nil && !force {
+	if tn.Container == nil {
+		return nil
+	}
+	if err := tn.Pool.Client.StopContainer(tn.Container.ID, 60); err != nil && !force {
		return err
	}
	return tn.Pool.Client.RemoveContainer(docker.RemoveContainerOptions{
@@ -648,7 +723,7 @@
	})
 }

-func (tn *TestNode) StartContainer(ctx context.Context) error {
+func (tn *TestNode) startContainer(ctx context.Context) error {
	if err := tn.Pool.Client.StartContainer(tn.Container.ID, nil); err != nil {
		return err
	}
@@ -662,25 +737,7 @@
	port := GetHostPort(c, "26657/tcp")
	tn.tl.Logf("{%s} RPC => %s", tn.Name(), port)

-	err = tn.NewClient(fmt.Sprintf("tcp://%s", port))
-	if err != nil {
-		return err
-	}
-
-	time.Sleep(5 * time.Second)
-	return retry.Do(func() error {
-		stat, err := tn.Client.Status(ctx)
-		if err != nil {
-			// tn.t.Log(err)
-			return err
-		}
-		// TODO: reenable this check, having trouble with it for some reason
-		if stat != nil && stat.SyncInfo.CatchingUp {
-			return fmt.Errorf("still catching up: height(%d) catching-up(%t)",
-				stat.SyncInfo.LatestBlockHeight, stat.SyncInfo.CatchingUp)
-		}
-		return nil
-	}, retry.DelayType(retry.BackOffDelay))
+	return tn.NewClient(fmt.Sprintf("tcp://%s", port))
 }

 func (tn *TestNode) Bech32AddressForKey(keyName string) (string, error) {
@@ -738,7 +795,7 @@ func (tn *TestNode) InitFullNodeFiles(ctx context.Context) error {
	return tn.InitHomeFolder(ctx)
 }

-func handleNodeJobError(container string, i int, stdout string, stderr string, err error) error {
+func containerExitError(container string, i int, stdout string, stderr string, err error) error {
	if err != nil {
		return fmt.Errorf("%v\n%s\n%s", err, stdout, stderr)
	}
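With NodeJob renamed to Exec, exit-code handling moves into containerExitError, and callers receive stdout/stderr directly while non-zero exits surface through the returned error. A minimal sketch of the resulting call pattern (the subcommand and helper name here are illustrative, not part of this change):

func (tn *TestNode) ShowNodeID(ctx context.Context) (string, error) {
	// Exec runs the command in a throwaway job container and blocks until it exits.
	stdout, _, err := tn.Exec(ctx, []string{tn.Chain.Bin, "tendermint", "show-node-id", "--home", tn.NodeHome()})
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(stdout), nil
}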
diff --git a/test/test_setup.go b/test/test_setup.go
index aa2a68e6..96c7a161 100644
--- a/test/test_setup.go
+++ b/test/test_setup.go
@@ -24,17 +24,17 @@ type TestLogger interface {
	Logf(string, ...interface{})
 }

-func SetupTestRun(t *testing.T) (context.Context, string, *dockertest.Pool, *docker.Network) {
+func SetupTestRun(t *testing.T) (context.Context, string, *dockertest.Pool, string) {
	home := t.TempDir()

	pool, err := dockertest.NewPool("")
	require.NoError(t, err)

	// set the test cleanup function
-	t.Cleanup(Cleanup(pool, t.Name(), home))
+	t.Cleanup(Cleanup(pool, t, home))

	// run cleanup to cleanup stale resources from any killed tests
-	Cleanup(pool, t.Name(), home)()
+	Cleanup(pool, t, home)()

	network, err := CreateTestNetwork(pool, fmt.Sprintf("horcrux-%s", RandLowerCaseLetterString(8)), t)
	require.NoError(t, err)
@@ -42,14 +42,13 @@
	// build the horcrux image
	require.NoError(t, BuildTestSignerImage(pool))

-	return context.Background(), home, pool, network
+	return context.Background(), home, pool, network.ID
 }

 // assemble gentx, build genesis file, configure peering, and start chain
 func Genesis(
	tl TestLogger,
	ctx context.Context,
-	net *docker.Network,
	chain *ChainType,
	nonHorcruxValidators,
	fullnodes []*TestNode,
@@ -166,16 +165,6 @@
		return err
	}

-	for _, n := range nodes {
-		n := n
-		eg.Go(func() error {
-			return n.CreateNodeContainer(net.ID)
-		})
-	}
-	if err := eg.Wait(); err != nil {
-		return err
-	}
-
	peers := nodes.PeerString()

	// start horcrux sentries. privval listener enabled
@@ -184,8 +173,9 @@
			s := sentry
			tl.Logf("{%s} => starting container...", s.Name())
			eg.Go(func() error {
-				s.SetValidatorConfigAndPeers(peers, true)
-				return s.StartContainer(ctx)
+				return s.Start(ctx, func() {
+					s.SetValidatorConfigAndPeers(peers, true)
+				})
			})
		}
	}
@@ -195,8 +185,9 @@
		v := v
		tl.Logf("{%s} => starting container...", v.Name())
		eg.Go(func() error {
-			v.SetValidatorConfigAndPeers(peers, false)
-			return v.StartContainer(ctx)
+			return v.Start(ctx, func() {
+				v.SetValidatorConfigAndPeers(peers, false)
+			})
		})
	}

@@ -205,8 +196,9 @@
		n := n
		tl.Logf("{%s} => starting container...", n.Name())
		eg.Go(func() error {
-			n.SetValidatorConfigAndPeers(peers, false)
-			return n.StartContainer(ctx)
+			return n.Start(ctx, func() {
+				n.SetValidatorConfigAndPeers(peers, false)
+			})
		})
	}

@@ -244,16 +236,16 @@
 }

 // Cleanup will clean up Docker containers, networks, and the other various config files generated in testing
-func Cleanup(pool *dockertest.Pool, testName, testDir string) func() {
+func Cleanup(pool *dockertest.Pool, t *testing.T, testDir string) func() {
	return func() {
		cont, _ := pool.Client.ListContainers(docker.ListContainersOptions{All: true})
		ctx := context.Background()
		for _, c := range cont {
			for k, v := range c.Labels {
-				if k == "horcrux-test" && v == testName {
+				if k == "horcrux-test" && v == t.Name() {
					_ = pool.Client.StopContainer(c.ID, 10)
-					_, err := pool.Client.WaitContainerWithContext(c.ID, ctx)
-					if err != nil {
+					_, _ = pool.Client.WaitContainerWithContext(c.ID, ctx)
+					if t.Failed() {
						stdout := new(bytes.Buffer)
						stderr := new(bytes.Buffer)
						_ = pool.Client.Logs(docker.LogsOptions{
@@ -276,7 +268,7 @@
		nets, _ := pool.Client.ListNetworks()
		for _, n := range nets {
			for k, v := range n.Labels {
-				if k == "horcrux-test" && v == testName {
+				if k == "horcrux-test" && v == t.Name() {
					_ = pool.Client.RemoveNetwork(n.ID)
				}
			}
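Note that Genesis no longer pre-creates node containers: TestNode.Start owns the whole lifecycle, and the preStart callback runs inside the retry loop, after the container is (re)created and before it boots, so any configuration written there must be safe to apply repeatedly. The call shape, as used in the loops above:

// sketch of a caller: config is (re)written on every retry attempt, before the node boots
eg.Go(func() error {
	return n.Start(ctx, func() {
		n.SetValidatorConfigAndPeers(peers, false)
	})
})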
diff --git a/test/test_signer.go b/test/test_signer.go
index 0a04c26b..f22c16b9 100644
--- a/test/test_signer.go
+++ b/test/test_signer.go
@@ -8,18 +8,25 @@ import (
	"os"
	"path"
	"path/filepath"
+	"strconv"
	"strings"
+	"time"

	"github.com/ory/dockertest"
	"github.com/ory/dockertest/docker"
	"github.com/strangelove-ventures/horcrux/signer"
+	"github.com/strangelove-ventures/horcrux/signer/proto"
	tmjson "github.com/tendermint/tendermint/libs/json"
	"golang.org/x/sync/errgroup"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials/insecure"
 )

-var (
-	signerPort  = "2222"
-	signerImage = "horcrux-test"
+const (
+	signerPort       = "2222"
+	signerImage      = "horcrux-test"
+	binary           = "horcrux"
+	signerPortDocker = signerPort + "/tcp"
 )

 // TestSigner represents a remote signer instance
@@ -28,6 +35,7 @@ type TestSigner struct {
	Index          int
	ValidatorIndex int
	Pool           *dockertest.Pool
+	networkID      string
	Container      *docker.Container
	Key            signer.CosignerKey
	tl             TestLogger
@@ -62,7 +70,6 @@ func StartSingleSignerContainers(
	testSigners TestSigners,
	validator *TestNode,
	sentryNodes TestNodes,
-	network *docker.Network,
 ) error {
	eg := new(errgroup.Group)
	ctx := context.Background()
@@ -96,7 +103,7 @@ func StartSingleSignerContainers(
	for _, s := range testSigners {
		s := s
		eg.Go(func() error {
-			return s.CreateSingleSignerContainer(network.ID)
+			return s.CreateSingleSignerContainer()
		})
	}
	if err := eg.Wait(); err != nil {
@@ -123,7 +130,6 @@ func StartCosignerContainers(
	sentries TestNodes,
	threshold,
	total,
	sentriesPerSigner int,
-	network *docker.Network,
 ) error {
	eg := new(errgroup.Group)
	ctx := context.Background()
@@ -201,7 +207,7 @@ func StartCosignerContainers(
	for _, s := range signers {
		s := s
		eg.Go(func() error {
-			return s.CreateCosignerContainer(network.ID)
+			return s.CreateCosignerContainer()
		})
	}
	err = eg.Wait()
@@ -233,13 +239,21 @@ func (ts TestSigners) PeerString(skip int) string {
 }

 // MakeTestSigners creates the TestSigner objects required for bootstrapping tests
-func MakeTestSigners(validatorIndex, count int, home string, pool *dockertest.Pool, tl TestLogger) (out TestSigners) {
+func MakeTestSigners(
+	validatorIndex int,
+	count int,
+	home string,
+	pool *dockertest.Pool,
+	networkID string,
+	tl TestLogger,
+) (out TestSigners) {
	for i := 0; i < count; i++ {
		ts := &TestSigner{
			Home:           home,
			Index:          i + 1, // +1 is to ensure all cosigner IDs end up being >0 as required in cosigner.go
			ValidatorIndex: validatorIndex,
			Pool:           pool,
+			networkID:      networkID,
			Container:      nil,
			Key:            signer.CosignerKey{},
			tl:             tl,
@@ -253,7 +267,7 @@ func (ts *TestSigner) GetHosts() (out Hosts) {
	host := ContainerPort{
		Name:      ts.Name(),
		Container: ts.Container,
-		Port:      docker.Port(fmt.Sprintf("%s/tcp", signerPort)),
+		Port:      docker.Port(signerPortDocker),
	}
	out = append(out, host)
	return
@@ -288,15 +302,17 @@ func (ts *TestSigner) Name() string {
	return fmt.Sprintf("val-%d-sgn-%d-%s", ts.ValidatorIndex, ts.Index, ts.tl.Name())
 }

-// InitSingleSignerConfig creates and runs a container to init a single signers config files
-// blocks until the container exits
-func (ts *TestSigner) InitSingleSignerConfig(ctx context.Context, listenNodes TestNodes) error {
+// GRPCAddress returns the TCP address of the GRPC server,
+// reachable from within the docker network.
+func (ts *TestSigner) GRPCAddress() string {
+	return fmt.Sprintf("tcp://%s:%s", ts.Name(), signerPort)
+}
+
+// ExecHorcruxCmd executes a CLI subcommand of the horcrux binary against this specific cosigner.
+// The config home directory will be appended as a flag.
+func (ts *TestSigner) ExecHorcruxCmd(ctx context.Context, cmd ...string) error {
+	cmd = ts.horcruxCmd(cmd)
	container := RandLowerCaseLetterString(10)
-	cmd := []string{
-		"horcrux", "config", "init",
-		listenNodes[0].ChainID, listenNodes.ListenAddrs(),
-		fmt.Sprintf("--home=%s", ts.Dir()),
-	}
	ts.tl.Logf("{%s}[%s] -> '%s'", ts.Name(), container, strings.Join(cmd, " "))
	cont, err := ts.Pool.Client.CreateContainer(docker.CreateContainerOptions{
		Name: container,
@@ -304,7 +320,7 @@
			User:     getDockerUserString(),
			Hostname: container,
			ExposedPorts: map[docker.Port]struct{}{
-				docker.Port(fmt.Sprintf("%s/tcp", signerPort)): {},
+				docker.Port(signerPortDocker): {},
			},
			Image: signerImage,
			Cmd:   cmd,
@@ -324,7 +340,9 @@
			},
		},
		NetworkingConfig: &docker.NetworkingConfig{
-			EndpointsConfig: map[string]*docker.EndpointConfig{},
+			EndpointsConfig: map[string]*docker.EndpointConfig{
+				ts.networkID: {},
+			},
		},
		Context: nil,
	})
@@ -335,13 +353,13 @@
		return err
	}
	exitCode, err := ts.Pool.Client.WaitContainerWithContext(cont.ID, ctx)
-	stdout := new(bytes.Buffer)
-	stderr := new(bytes.Buffer)
+	outputStream := new(bytes.Buffer)
+	errorStream := new(bytes.Buffer)
	_ = ts.Pool.Client.Logs(docker.LogsOptions{
		Context:      ctx,
		Container:    cont.ID,
-		OutputStream: stdout,
-		ErrorStream:  stderr,
+		OutputStream: outputStream,
+		ErrorStream:  errorStream,
		Stdout:       true,
		Stderr:       true,
		Tail:         "100",
@@ -349,77 +367,31 @@
		Timestamps:   false,
	})
	_ = ts.Pool.Client.RemoveContainer(docker.RemoveContainerOptions{ID: cont.ID})
-	return handleNodeJobError(container, exitCode, stdout.String(), stderr.String(), err)
+	stdout := outputStream.String()
+	stderr := errorStream.String()
+	return containerExitError(container, exitCode, stdout, stderr, err)
+}
+
+// InitSingleSignerConfig creates and runs a container to init a single signer's config files;
+// blocks until the container exits.
+func (ts *TestSigner) InitSingleSignerConfig(ctx context.Context, listenNodes TestNodes) error {
+	return ts.ExecHorcruxCmd(ctx,
+		"config", "init",
+		listenNodes[0].ChainID, listenNodes.ListenAddrs())
 }

 // InitCosignerConfig creates and runs a container to init a signer nodes config files
 // blocks until the container exits
 func (ts *TestSigner) InitCosignerConfig(
	ctx context.Context, listenNodes TestNodes, peers TestSigners, skip, threshold int) error {
-	container := RandLowerCaseLetterString(10)
-	cmd := []string{
-		"horcrux", "config", "init",
+	return ts.ExecHorcruxCmd(ctx,
+		"config", "init",
		listenNodes[0].ChainID, listenNodes.ListenAddrs(),
		"--cosigner",
		fmt.Sprintf("--peers=%s", peers.PeerString(skip)),
		fmt.Sprintf("--threshold=%d", threshold),
-		fmt.Sprintf("--home=%s", ts.Dir()),
-		fmt.Sprintf("--listen=tcp://%s:%s", ts.Name(), signerPort),
-	}
-	ts.tl.Logf("{%s}[%s] -> '%s'", ts.Name(), container, strings.Join(cmd, " "))
-	cont, err := ts.Pool.Client.CreateContainer(docker.CreateContainerOptions{
-		Name: container,
-		Config: &docker.Config{
-			User:     getDockerUserString(),
-			Hostname: container,
-			ExposedPorts: map[docker.Port]struct{}{
-				docker.Port(fmt.Sprintf("%s/tcp", signerPort)): {},
-			},
-			Image:  signerImage,
-			Cmd:    cmd,
-			Labels: map[string]string{"horcrux-test": ts.tl.Name()},
-		},
-		HostConfig: &docker.HostConfig{
-			PublishAllPorts: true,
-			AutoRemove:      false,
-			Mounts: []docker.HostMount{
-				{
-					Type:        "bind",
-					Source:      ts.Home,
-					Target:      ts.Home,
-					ReadOnly:    false,
-					BindOptions: nil,
-				},
-			},
-		},
-		NetworkingConfig: &docker.NetworkingConfig{
-			EndpointsConfig: map[string]*docker.EndpointConfig{},
-		},
-		Context: nil,
-	})
-	if err != nil {
-		return err
-	}
-	if err := ts.Pool.Client.StartContainer(cont.ID, nil); err != nil {
-		return err
-	}
-
-	exitCode, err := ts.Pool.Client.WaitContainerWithContext(cont.ID, ctx)
-	stdout := new(bytes.Buffer)
-	stderr := new(bytes.Buffer)
-	_ = ts.Pool.Client.Logs(docker.LogsOptions{
-		Context:      ctx,
-		Container:    cont.ID,
-		OutputStream: stdout,
-		ErrorStream:  stderr,
-		Stdout:       true,
-		Stderr:       true,
-		Tail:         "100",
-		Follow:       false,
-		Timestamps:   false,
-	})
-	_ = ts.Pool.Client.RemoveContainer(docker.RemoveContainerOptions{ID: cont.ID})
-	return handleNodeJobError(container, exitCode, stdout.String(), stderr.String(), err)
+		fmt.Sprintf("--listen=%s", ts.GRPCAddress()),
+	)
 }

 // StartContainer starts a TestSigners container and assigns the new running container to replace the old one
@@ -437,13 +409,11 @@
	return nil
 }

-// StopContainer stops a TestSigners docker container
-func (ts *TestSigner) StopContainer() error {
-	return ts.Pool.Client.StopContainer(ts.Container.ID, 60)
-}
-
+// StopAndRemoveContainer stops and removes a TestSigner's docker container.
+// If force is true, the error from stopping the container is ignored and the
+// container is forcefully removed.
 func (ts *TestSigner) StopAndRemoveContainer(force bool) error {
-	if err := ts.StopContainer(); err != nil && !force {
+	if err := ts.Pool.Client.StopContainer(ts.Container.ID, 60); err != nil && !force {
		return err
	}
	return ts.Pool.Client.RemoveContainer(docker.RemoveContainerOptions{
@@ -461,15 +431,15 @@
 }

 // CreateSingleSignerContainer creates a docker container to run a single signer
-func (ts *TestSigner) CreateSingleSignerContainer(networkID string) error {
+func (ts *TestSigner) CreateSingleSignerContainer() error {
	cont, err := ts.Pool.Client.CreateContainer(docker.CreateContainerOptions{
		Name: ts.Name(),
		Config: &docker.Config{
			User:     getDockerUserString(),
-			Cmd:      []string{"horcrux", "signer", "start", fmt.Sprintf("--home=%s", ts.Dir())},
+			Cmd:      []string{binary, "signer", "start", fmt.Sprintf("--home=%s", ts.Dir())},
			Hostname: ts.Name(),
			ExposedPorts: map[docker.Port]struct{}{
-				docker.Port(fmt.Sprintf("%s/tcp", signerPort)): {},
+				docker.Port(signerPortDocker): {},
			},
			DNS:   []string{},
			Image: signerImage,
@@ -490,7 +460,7 @@
		},
		NetworkingConfig: &docker.NetworkingConfig{
			EndpointsConfig: map[string]*docker.EndpointConfig{
-				networkID: {},
+				ts.networkID: {},
			},
		},
		Context: nil,
@@ -503,15 +473,15 @@
 }

 // CreateCosignerContainer creates a docker container to run a mpc validator node
-func (ts *TestSigner) CreateCosignerContainer(networkID string) error {
+func (ts *TestSigner) CreateCosignerContainer() error {
	cont, err := ts.Pool.Client.CreateContainer(docker.CreateContainerOptions{
		Name: ts.Name(),
		Config: &docker.Config{
			User:     getDockerUserString(),
-			Cmd:      []string{"horcrux", "cosigner", "start", fmt.Sprintf("--home=%s", ts.Dir())},
+			Cmd:      []string{binary, "cosigner", "start", fmt.Sprintf("--home=%s", ts.Dir())},
			Hostname: ts.Name(),
			ExposedPorts: map[docker.Port]struct{}{
-				docker.Port(fmt.Sprintf("%s/tcp", signerPort)): {},
+				docker.Port(signerPortDocker): {},
			},
			DNS:   []string{},
			Image: signerImage,
@@ -532,7 +502,7 @@
		},
		NetworkingConfig: &docker.NetworkingConfig{
			EndpointsConfig: map[string]*docker.EndpointConfig{
-				networkID: {},
+				ts.networkID: {},
			},
		},
		Context: nil,
@@ -543,3 +513,41 @@
	ts.Container = cont
	return nil
 }
+
+// TransferLeadership elects a new raft leader.
+func (ts *TestSigner) TransferLeadership(ctx context.Context, newLeaderID int) error {
+	return ts.ExecHorcruxCmd(ctx,
+		"elect", strconv.FormatInt(int64(newLeaderID), 10),
+	)
+}
+
+// GetLeader returns the current raft leader.
+func (ts *TestSigner) GetLeader(ctx context.Context) (string, error) {
+	grpcAddress := GetHostPort(ts.Container, signerPortDocker)
+	conn, err := grpc.Dial(grpcAddress,
+		grpc.WithTransportCredentials(insecure.NewCredentials()),
+		grpc.WithDefaultCallOptions(grpc.WaitForReady(true)),
+	)
+	if err != nil {
+		return "", fmt.Errorf("dialing failed: %w", err)
+	}
+	defer conn.Close()
+
+	ctx, cancelFunc := context.WithTimeout(ctx, 10*time.Second)
+	defer cancelFunc()
+
+	grpcClient := proto.NewCosignerGRPCClient(conn)
+
+	res, err := grpcClient.GetLeader(ctx, &proto.CosignerGRPCGetLeaderRequest{})
+	if err != nil {
+		return "", err
+	}
+	return res.GetLeader(), nil
+}
+
+func (ts *TestSigner) horcruxCmd(cmd []string) (out []string) {
+	out = append(out, binary)
+	out = append(out, cmd...)
+	out = append(out, fmt.Sprintf("--home=%s", ts.Dir()))
+	return out
+}
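GetLeader dials the host-mapped gRPC port of the cosigner container, so tests can also poll it until the cluster settles on one leader. A sketch of such a wait helper, built only on the GetLeader method above (the helper itself is assumed, not part of this change):

func waitForLeaderConvergence(ctx context.Context, signers TestSigners, expected string) error {
	for {
		converged := true
		for _, s := range signers {
			// any error or mismatch means the cluster has not converged yet
			if leader, err := s.GetLeader(ctx); err != nil || leader != expected {
				converged = false
				break
			}
		}
		if converged {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(time.Second):
		}
	}
}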
diff --git a/test/test_validator.go b/test/test_validator.go
index 99e465eb..d3015919 100644
--- a/test/test_validator.go
+++ b/test/test_validator.go
@@ -7,7 +7,6 @@ import (
	"path/filepath"

	"github.com/ory/dockertest"
-	"github.com/ory/dockertest/docker"
	"github.com/strangelove-ventures/horcrux/signer"
	crypto "github.com/tendermint/tendermint/crypto"
	ed25519 "github.com/tendermint/tendermint/crypto/ed25519"
@@ -28,6 +27,7 @@ type TestValidator struct {
 func NewHorcruxValidator(
	tl TestLogger,
	pool *dockertest.Pool,
+	networkID string,
	home string,
	chainID string,
	index int,
@@ -38,8 +38,8 @@ func NewHorcruxValidator(
 ) (*TestValidator, error) {
	testValidator := &TestValidator{
		Index:     index,
-		Sentries:  MakeTestNodes(index, numSentries, home, chainID, chainType, pool, tl),
-		Signers:   MakeTestSigners(index, numSigners, home, pool, tl),
+		Sentries:  MakeTestNodes(index, numSentries, home, chainID, chainType, pool, networkID, tl),
+		Signers:   MakeTestSigners(index, numSigners, home, pool, networkID, tl),
		tl:        tl,
		Home:      home,
		Threshold: threshold,
@@ -53,6 +53,7 @@ func NewHorcruxValidatorWithPrivValKey(
	tl TestLogger,
	pool *dockertest.Pool,
+	networkID string,
	home string,
	chainID string,
	index int,
@@ -64,8 +65,8 @@ func NewHorcruxValidatorWithPrivValKey(
 ) (*TestValidator, error) {
	testValidator := &TestValidator{
		Index:     index,
-		Sentries:  MakeTestNodes(index, numSentries, home, chainID, chainType, pool, tl),
-		Signers:   MakeTestSigners(index, numSigners, home, pool, tl),
+		Sentries:  MakeTestNodes(index, numSentries, home, chainID, chainType, pool, networkID, tl),
+		Signers:   MakeTestSigners(index, numSigners, home, pool, networkID, tl),
		tl:        tl,
		Home:      home,
		Threshold: threshold,
@@ -120,11 +121,10 @@ func (tv *TestValidator) generateShares(filePVKey privval.FilePVKey) error {

 func (tv *TestValidator) StartHorcruxCluster(
	ctx context.Context,
-	network *docker.Network,
	sentriesPerSigner int,
 ) error {
	return StartCosignerContainers(tv.Signers, tv.Sentries,
-		tv.Threshold, len(tv.Signers), sentriesPerSigner, network)
+		tv.Threshold, len(tv.Signers), sentriesPerSigner)
 }

 func (tv *TestValidator) WaitForConsecutiveBlocks(blocks int64) error {
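With networkID threaded through every constructor, a test assembles its topology without touching *docker.Network at any call site. Condensed, the pieces above compose as follows (a sketch mirroring TestLeaderElection2of3, not new functionality):

func exampleClusterSetup(t *testing.T) {
	ctx, home, pool, network := SetupTestRun(t) // network is now the docker network ID string
	chain := getSimdChain()

	// 2 sentries, 3 signers, threshold 2
	validator, err := NewHorcruxValidator(t, pool, network, home,
		chainID, 0, 2, 3, 2, chain)
	require.NoError(t, err)

	require.NoError(t, validator.StartHorcruxCluster(ctx, 3)) // 3 sentries per signer
	require.NoError(t, Genesis(t, ctx, chain, []*TestNode{}, []*TestNode{}, []*TestValidator{validator}))
}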