diff --git a/cmd/rpcdaemon/README.md b/cmd/rpcdaemon/README.md index 31c0dbb5cbd..45e6a2dce06 100644 --- a/cmd/rpcdaemon/README.md +++ b/cmd/rpcdaemon/README.md @@ -2,6 +2,7 @@ - [Getting Started](#getting-started) * [Running locally](#running-locally) * [Running remotely](#running-remotely) + * [Healthcheck](#healthcheck) * [Testing](#testing) - [FAQ](#faq) * [Relations between prune options and rpc methods](#relations-between-prune-options-and-rpc-method) @@ -63,6 +64,44 @@ The daemon should respond with something like: INFO [date-time] HTTP endpoint opened url=localhost:8545... ``` +### Healthcheck + +Running the daemon also opens an endpoint `/health` that provides a basic +health check. + +If the health check is successful it returns 200 OK. + +If the health check fails it returns 500 Internal Server Error. + +The health check is configured with a JSON object sent as the body of the POST request. + +``` +{ + "min_peer_count": <minimum number of peers>, + "known_block": <number of a block the node must know> +} +``` + +Leaving a field out disables the corresponding check. + +**`min_peer_count`** -- requires the node to have at least this many peers. Requires +`net` namespace to be listed in `http.api`. + +**`known_block`** -- the number of a block that the node has to know about. Requires +`eth` namespace to be listed in `http.api`. + +Example request +```http POST http://localhost:8545/health --raw '{"min_peer_count": 3, "known_block": "0x1F"}'``` +Example response +``` +{ + "check_block": "HEALTHY", + "healthcheck_query": "HEALTHY", + "min_peer_count": "HEALTHY" +} +``` + + ### Testing By default, the `rpcdaemon` serves data from `localhost:8545`. 
You may send `curl` commands to see if things are
diff --git a/cmd/rpcdaemon/cli/config.go b/cmd/rpcdaemon/cli/config.go
index f7cd6655f88..e3433dd28cd 100644
--- a/cmd/rpcdaemon/cli/config.go
+++ b/cmd/rpcdaemon/cli/config.go
@@ -15,6 +15,7 @@ import (
 	kv2 "github.com/ledgerwatch/erigon-lib/kv/mdbx"
 	"github.com/ledgerwatch/erigon-lib/kv/remotedb"
 	"github.com/ledgerwatch/erigon-lib/kv/remotedbserver"
+	"github.com/ledgerwatch/erigon/cmd/rpcdaemon/health"
 	"github.com/ledgerwatch/erigon/cmd/rpcdaemon/services"
 	"github.com/ledgerwatch/erigon/cmd/utils"
 	"github.com/ledgerwatch/erigon/common/paths"
@@ -252,6 +253,10 @@ func StartRpcServer(ctx context.Context, cfg Flags, rpcAPI []rpc.API) error {
 	}
 
 	var handler http.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Give the health check first refusal; it returns true once it has answered the request.
+		if health.ProcessHealthcheckIfNeeded(w, r, rpcAPI) {
+			return
+		}
 		if cfg.WebsocketEnabled && r.Method == "GET" {
 			wsHandler.ServeHTTP(w, r)
 			return
diff --git a/cmd/rpcdaemon/health/check_block.go b/cmd/rpcdaemon/health/check_block.go
new file mode 100644
index 00000000000..8978b6ffc4e
--- /dev/null
+++ b/cmd/rpcdaemon/health/check_block.go
@@ -0,0 +1,26 @@
+package health
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/ledgerwatch/erigon/rpc"
+)
+
+// checkBlockNumber verifies that the node can return the block with the given
+// number. It returns an error when api is nil, when the lookup itself fails,
+// or when the lookup yields an empty result.
+func checkBlockNumber(blockNumber rpc.BlockNumber, api EthAPI) error {
+	if api == nil {
+		return fmt.Errorf("no connection to the Erigon server or `eth` namespace isn't enabled")
+	}
+	data, err := api.GetBlockByNumber(context.TODO(), blockNumber, false)
+	if err != nil {
+		return err
+	}
+	if len(data) == 0 { // block not found
+		return fmt.Errorf("no known block with number %v (%x hex)", blockNumber, blockNumber)
+	}
+
+	return nil
+}
diff --git a/cmd/rpcdaemon/health/check_peers.go b/cmd/rpcdaemon/health/check_peers.go
new file mode 100644
index 00000000000..818152b668b
--- /dev/null
+++ b/cmd/rpcdaemon/health/check_peers.go
@@ -0,0 +1,26 @@
+package health
+
+import (
+	"context"
+	"fmt"
+)
+
+// checkMinPeers verifies that the node reports at least minPeerCount peers.
+// It returns an error when api is nil, when the peer count cannot be fetched,
+// or when the reported count is below the minimum.
+func checkMinPeers(minPeerCount uint, api NetAPI) error {
+	if api == nil {
+		return fmt.Errorf("no connection to the Erigon server or `net` namespace isn't enabled")
+	}
+
+	peerCount, err := api.PeerCount(context.TODO())
+	if err != nil {
+		return err
+	}
+
+	if uint64(peerCount) < uint64(minPeerCount) {
+		return fmt.Errorf("not enough peers: %d (minimum %d)", peerCount, minPeerCount)
+	}
+
+	return nil
+}
diff --git a/cmd/rpcdaemon/health/health.go b/cmd/rpcdaemon/health/health.go
new file mode 100644
index 00000000000..69236d47cd5
--- /dev/null
+++ b/cmd/rpcdaemon/health/health.go
@@ -0,0 +1,146 @@
+package health
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"net/http"
+	"strings"
+
+	"github.com/ledgerwatch/erigon/rpc"
+	"github.com/ledgerwatch/log/v3"
+)
+
+// requestBody is the JSON configuration sent in the body of a health check
+// request. A field left out of the request stays nil and disables its check.
+type requestBody struct {
+	MinPeerCount *uint            `json:"min_peer_count"`
+	BlockNumber  *rpc.BlockNumber `json:"known_block"`
+}
+
+const (
+	urlPath = "/health"
+
+	// maxBodySize caps how many bytes of the request body are read. The
+	// expected configuration is a tiny JSON object; a longer body is cut
+	// off at this size instead of being buffered in full.
+	maxBodySize = 1 << 20
+)
+
+var (
+	// errCheckDisabled marks a check that the request did not ask for.
+	errCheckDisabled = errors.New("error check disabled")
+)
+
+// ProcessHealthcheckIfNeeded serves the request when its path equals /health
+// (compared case-insensitively) and reports whether it did so. When it
+// returns false, nothing has been written to w.
+func ProcessHealthcheckIfNeeded(
+	w http.ResponseWriter,
+	r *http.Request,
+	rpcAPI []rpc.API,
+) bool {
+	if !strings.EqualFold(r.URL.Path, urlPath) {
+		return false
+	}
+	defer r.Body.Close()
+
+	netAPI, ethAPI := parseAPI(rpcAPI)
+
+	var errMinPeerCount = errCheckDisabled
+	var errCheckBlock = errCheckDisabled
+
+	body, errParse := parseHealthCheckBody(io.LimitReader(r.Body, maxBodySize))
+	if errParse != nil {
+		log.Root().Warn("unable to process healthcheck request", "error", errParse)
+	} else {
+		// 1. net_peerCount
+		if body.MinPeerCount != nil {
+			errMinPeerCount = checkMinPeers(*body.MinPeerCount, netAPI)
+		}
+		// 2. known block
+		if body.BlockNumber != nil {
+			errCheckBlock = checkBlockNumber(*body.BlockNumber, ethAPI)
+		}
+		// TODO add time from the last sync cycle
+	}
+
+	err := reportHealth(errParse, errMinPeerCount, errCheckBlock, w)
+	if err != nil {
+		log.Root().Warn("unable to process healthcheck request", "error", err)
+	}
+
+	return true
+}
+
+// parseHealthCheckBody reads reader to the end and decodes the bytes as a
+// JSON requestBody. On failure it returns a zero requestBody and the error.
+func parseHealthCheckBody(reader io.Reader) (requestBody, error) {
+	bodyBytes, err := ioutil.ReadAll(reader)
+	if err != nil {
+		return requestBody{}, err
+	}
+
+	var body requestBody
+	if err := json.Unmarshal(bodyBytes, &body); err != nil {
+		return requestBody{}, err
+	}
+
+	return body, nil
+}
+
+// reportHealth writes the outcome of every check to w as a JSON object keyed
+// by check name. The status is 200 unless at least one check failed with an
+// error other than errCheckDisabled, in which case it is 500.
+func reportHealth(errParse, errMinPeerCount, errCheckBlock error, w http.ResponseWriter) error {
+	checks := map[string]error{
+		"healthcheck_query": errParse,
+		"min_peer_count":    errMinPeerCount,
+		"check_block":       errCheckBlock,
+	}
+
+	statusCode := http.StatusOK
+	statuses := make(map[string]string, len(checks))
+	for name, checkErr := range checks {
+		if shouldChangeStatusCode(checkErr) {
+			statusCode = http.StatusInternalServerError
+		}
+		statuses[name] = errorStringOrOK(checkErr)
+	}
+
+	// Encode before committing the status line so that an encoding failure
+	// can still be reported as a server error.
+	bodyJSON, err := json.Marshal(statuses)
+	if err != nil {
+		w.WriteHeader(http.StatusInternalServerError)
+		return err
+	}
+
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(statusCode)
+
+	_, err = w.Write(bodyJSON)
+	return err
+}
+
+// shouldChangeStatusCode reports whether err is a real check failure, i.e.
+// neither nil nor errCheckDisabled.
+func shouldChangeStatusCode(err error) bool {
+	return err != nil && !errors.Is(err, errCheckDisabled)
+}
+
+// errorStringOrOK renders a check result for the response body: "HEALTHY" for
+// nil, "DISABLED" for errCheckDisabled and "ERROR: <message>" otherwise.
+func errorStringOrOK(err error) string {
+	if err == nil {
+		return "HEALTHY"
+	}
+
+	if errors.Is(err, errCheckDisabled) {
+		return "DISABLED"
+	}
+
+	return fmt.Sprintf("ERROR: %v", err)
+}
diff --git a/cmd/rpcdaemon/health/interfaces.go b/cmd/rpcdaemon/health/interfaces.go
new file mode 100644
index 00000000000..4cf0fc6892b
--- /dev/null
+++ b/cmd/rpcdaemon/health/interfaces.go
@@ -0,0 +1,18 @@
+package health
+
+import (
+	"context"
+
+	"github.com/ledgerwatch/erigon/common/hexutil"
+	"github.com/ledgerwatch/erigon/rpc"
+)
+
+// NetAPI is the single method the peer-count check calls on a service.
+type NetAPI interface {
+	PeerCount(_ context.Context) (hexutil.Uint, error)
+}
+
+// EthAPI is the single method the known-block check calls on a service.
+type EthAPI interface {
+	GetBlockByNumber(_ context.Context, number rpc.BlockNumber, fullTx bool) (map[string]interface{}, error)
+}
diff --git a/cmd/rpcdaemon/health/parse_api.go b/cmd/rpcdaemon/health/parse_api.go
new file mode 100644
index 00000000000..21e003e5a59
--- /dev/null
+++ b/cmd/rpcdaemon/health/parse_api.go
@@ -0,0 +1,27 @@
+package health
+
+import (
+	"github.com/ledgerwatch/erigon/rpc"
+)
+
+// parseAPI scans the registered RPC services for ones that implement NetAPI
+// and EthAPI. A result is nil when no service implements that interface; when
+// several do, the last one in api wins.
+func parseAPI(api []rpc.API) (netAPI NetAPI, ethAPI EthAPI) {
+	// The loop variable is deliberately not called rpc: that name would
+	// shadow the imported package inside the loop body.
+	for _, candidate := range api {
+		if candidate.Service == nil {
+			continue
+		}
+
+		if svc, ok := candidate.Service.(NetAPI); ok {
+			netAPI = svc
+		}
+
+		if svc, ok := candidate.Service.(EthAPI); ok {
+			ethAPI = svc
+		}
+	}
+	return netAPI, ethAPI
+}