Skip to content

Commit

Permalink
Implement a simple healthcheck (#2740)
Browse files Browse the repository at this point in the history
  • Loading branch information
mandrigin authored Sep 28, 2021
1 parent d027d71 commit 9848028
Show file tree
Hide file tree
Showing 7 changed files with 260 additions and 0 deletions.
39 changes: 39 additions & 0 deletions cmd/rpcdaemon/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
- [Getting Started](#getting-started)
* [Running locally](#running-locally)
* [Running remotely](#running-remotely)
* [Healthcheck](#healthcheck)
* [Testing](#testing)
- [FAQ](#faq)
* [Relations between prune options and rpc methods](#relations-between-prune-options-and-rpc-method)
Expand Down Expand Up @@ -63,6 +64,44 @@ The daemon should respond with something like:
INFO [date-time] HTTP endpoint opened url=localhost:8545...
```

### Healthcheck

Running the daemon also opens an endpoint `/health` that provides a basic
health check.

If the health check is successful it returns 200 OK.

If the health check fails it returns 500 Internal Server Error.

The health check is configured through the JSON body of the POST request.

```
{
"min_peer_count": <minimal number of the node peers>,
"known_block": <number_of_block_that_node_should_know>
}
```

Omitting a field from the request body disables the corresponding check.

**`min_peer_count`** -- checks for a minimum number of healthy node peers. Requires
`net` namespace to be listed in `http.api`.

**`known_block`** -- sets the block that the node has to know about. Requires
`eth` namespace to be listed in `http.api`.

Example request
```http POST http://localhost:8545/health --raw '{"min_peer_count": 3, "known_block": "0x1F"}'```
Example response
```
{
"check_block": "HEALTHY",
"healthcheck_query": "HEALTHY",
"min_peer_count": "HEALTHY"
}
```


### Testing

By default, the `rpcdaemon` serves data from `localhost:8545`. You may send `curl` commands to see if things are
Expand Down
5 changes: 5 additions & 0 deletions cmd/rpcdaemon/cli/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
kv2 "github.com/ledgerwatch/erigon-lib/kv/mdbx"
"github.com/ledgerwatch/erigon-lib/kv/remotedb"
"github.com/ledgerwatch/erigon-lib/kv/remotedbserver"
"github.com/ledgerwatch/erigon/cmd/rpcdaemon/health"
"github.com/ledgerwatch/erigon/cmd/rpcdaemon/services"
"github.com/ledgerwatch/erigon/cmd/utils"
"github.com/ledgerwatch/erigon/common/paths"
Expand Down Expand Up @@ -252,6 +253,10 @@ func StartRpcServer(ctx context.Context, cfg Flags, rpcAPI []rpc.API) error {
}

var handler http.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// adding a healthcheck here
if health.ProcessHealthcheckIfNeeded(w, r, rpcAPI) {
return
}
if cfg.WebsocketEnabled && r.Method == "GET" {
wsHandler.ServeHTTP(w, r)
return
Expand Down
23 changes: 23 additions & 0 deletions cmd/rpcdaemon/health/check_block.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package health

import (
"context"
"fmt"

"github.com/ledgerwatch/erigon/rpc"
)

// checkBlockNumber asks api for the block with the given number and reports
// an error when the block cannot be served.
//
// It fails when api is nil, when the lookup itself returns an error, or when
// the lookup succeeds but yields an empty result. It returns nil otherwise.
func checkBlockNumber(blockNumber rpc.BlockNumber, api EthAPI) error {
	if api == nil {
		return fmt.Errorf("no connection to the Erigon server or `eth` namespace isn't enabled")
	}

	// Transactions are not needed to establish that the block is known, so
	// the lookup is made with fullTx set to false.
	block, lookupErr := api.GetBlockByNumber(context.TODO(), blockNumber, false)

	switch {
	case lookupErr != nil:
		return lookupErr
	case len(block) == 0:
		// An empty result without an error means the block was not found.
		return fmt.Errorf("no known block with number %v (%x hex)", blockNumber, blockNumber)
	default:
		return nil
	}
}
23 changes: 23 additions & 0 deletions cmd/rpcdaemon/health/check_peers.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package health

import (
"context"
"fmt"
)

// checkMinPeers verifies that the peer count reported by api is at least
// minPeerCount.
//
// It returns an error when api is nil, when the peer count cannot be
// retrieved, or when the reported count is below minPeerCount. It returns nil
// when the requirement is met.
func checkMinPeers(minPeerCount uint, api NetAPI) error {
	if api == nil {
		return fmt.Errorf("no connection to the Erigon server or `net` namespace isn't enabled")
	}

	peerCount, err := api.PeerCount(context.TODO())
	if err != nil {
		return err
	}

	// Both operands are widened to uint64 so that values of different
	// unsigned types can be compared directly.
	if uint64(peerCount) < uint64(minPeerCount) {
		return fmt.Errorf("not enough peers: %d (minimum %d)", peerCount, minPeerCount)
	}

	return nil
}
132 changes: 132 additions & 0 deletions cmd/rpcdaemon/health/health.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
package health

import (
"encoding/json"
"errors"
"fmt"
"io"
"io/ioutil"
"net/http"
"strings"

"github.com/ledgerwatch/erigon/rpc"
"github.com/ledgerwatch/log/v3"
)

// requestBody is the JSON payload of a health check request. Both fields are
// pointers so that a key missing from the payload (nil) can be told apart from
// an explicitly supplied value; a nil field leaves its check disabled.
type requestBody struct {
// MinPeerCount is the minimum peer count handed to checkMinPeers.
MinPeerCount *uint `json:"min_peer_count"`
// BlockNumber is the block number handed to checkBlockNumber.
BlockNumber *rpc.BlockNumber `json:"known_block"`
}

// urlPath is the request path on which the health check is served.
const urlPath = "/health"

// errCheckDisabled is the sentinel that marks a check the request did not ask
// for. It is reported as "DISABLED" and never turns the response into a failure.
var errCheckDisabled = errors.New("error check disabled")

// ProcessHealthcheckIfNeeded serves the health check when the request path
// matches urlPath (compared case-insensitively) and reports whether it did.
//
// For any other path it returns false without touching w, so the caller can
// carry on with its regular handling. Otherwise it parses the request body,
// runs every check the body asks for, writes the report to w and returns true.
// A body that cannot be parsed is logged and reported as a failed
// "healthcheck_query"; no checks are run in that case.
func ProcessHealthcheckIfNeeded(
	w http.ResponseWriter,
	r *http.Request,
	rpcAPI []rpc.API,
) bool {
	if !strings.EqualFold(r.URL.Path, urlPath) {
		return false
	}

	netAPI, ethAPI := parseAPI(rpcAPI)

	// Every check starts out disabled and is only run when the request
	// body explicitly asks for it.
	peersErr, blockErr := errCheckDisabled, errCheckDisabled

	query, parseErr := parseHealthCheckBody(r.Body)
	defer r.Body.Close()

	if parseErr == nil {
		// 1. net_peerCount
		if query.MinPeerCount != nil {
			peersErr = checkMinPeers(*query.MinPeerCount, netAPI)
		}
		// 2. custom query (shouldn't fail)
		if query.BlockNumber != nil {
			blockErr = checkBlockNumber(*query.BlockNumber, ethAPI)
		}
		// TODO add time from the last sync cycle
	} else {
		log.Root().Warn("unable to process healthcheck request", "error", parseErr)
	}

	if reportErr := reportHealth(parseErr, peersErr, blockErr, w); reportErr != nil {
		log.Root().Warn("unable to process healthcheck request", "error", reportErr)
	}

	return true
}

// parseHealthCheckBody reads reader to the end and decodes its content as a
// JSON-encoded requestBody.
//
// The payload comes straight from an HTTP client, so the read is capped:
// anything larger than maxBodyBytes is rejected instead of being buffered in
// memory. On any failure the zero requestBody is returned together with the
// error, so callers never see a partially decoded value.
func parseHealthCheckBody(reader io.Reader) (requestBody, error) {
	// A valid health check query is a handful of bytes; 1 MiB is far more
	// than any legitimate request needs.
	const maxBodyBytes = 1 << 20

	// Read one byte past the limit so that an oversized body can be told
	// apart from one that is exactly maxBodyBytes long.
	bodyBytes, err := ioutil.ReadAll(io.LimitReader(reader, maxBodyBytes+1))
	if err != nil {
		return requestBody{}, err
	}
	if len(bodyBytes) > maxBodyBytes {
		return requestBody{}, fmt.Errorf("healthcheck request body exceeds %d bytes", maxBodyBytes)
	}

	var body requestBody
	if err := json.Unmarshal(bodyBytes, &body); err != nil {
		return requestBody{}, err
	}

	return body, nil
}

// reportHealth writes the outcome of the health check to w as a JSON object
// with one entry per check: "healthcheck_query" (errParse), "min_peer_count"
// (errMinPeerCount) and "check_block" (errCheckBlock). Each value is the
// string produced by errorStringOrOK for the corresponding error.
//
// The response status is 200 OK unless at least one of the errors counts as a
// failure according to shouldChangeStatusCode, in which case it is
// 500 Internal Server Error. The returned error is non-nil when the report
// could not be encoded or written.
func reportHealth(errParse, errMinPeerCount, errCheckBlock error, w http.ResponseWriter) error {
	checks := [...]struct {
		name string
		err  error
	}{
		{"healthcheck_query", errParse},
		{"min_peer_count", errMinPeerCount},
		{"check_block", errCheckBlock},
	}

	statusCode := http.StatusOK
	report := make(map[string]string, len(checks))
	for _, check := range checks {
		if shouldChangeStatusCode(check.err) {
			statusCode = http.StatusInternalServerError
		}
		report[check.name] = errorStringOrOK(check.err)
	}

	// Encode before touching the response so that an encoding failure is
	// not sent out under the status code computed for the checks.
	bodyJSON, err := json.Marshal(report)
	if err != nil {
		w.WriteHeader(http.StatusInternalServerError)
		return err
	}

	// Headers have to be set before WriteHeader; without an explicit type
	// net/http would sniff the JSON payload as plain text.
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(statusCode)

	_, err = w.Write(bodyJSON)
	return err
}

// shouldChangeStatusCode reports whether err must turn the response into a
// failure: any non-nil error does, except one that matches errCheckDisabled.
func shouldChangeStatusCode(err error) bool {
	if err == nil {
		return false
	}
	return !errors.Is(err, errCheckDisabled)
}

// errorStringOrOK renders the outcome of a single check for the report:
// "HEALTHY" for a nil error, "DISABLED" for an error matching
// errCheckDisabled, and "ERROR: " followed by the error text for anything else.
func errorStringOrOK(err error) string {
	switch {
	case err == nil:
		return "HEALTHY"
	case errors.Is(err, errCheckDisabled):
		return "DISABLED"
	default:
		return fmt.Sprintf("ERROR: %v", err)
	}
}
16 changes: 16 additions & 0 deletions cmd/rpcdaemon/health/interfaces.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package health

import (
"context"

"github.com/ledgerwatch/erigon/common/hexutil"
"github.com/ledgerwatch/erigon/rpc"
)

// NetAPI is the single method the peer count check needs from an RPC service.
// parseAPI picks the service to use by asserting registered services against it.
type NetAPI interface {
// PeerCount returns the peer count that checkMinPeers compares with the
// requested minimum.
PeerCount(_ context.Context) (hexutil.Uint, error)
}

// EthAPI is the single method the known-block check needs from an RPC service.
// parseAPI picks the service to use by asserting registered services against it.
type EthAPI interface {
// GetBlockByNumber looks up the block with the given number; checkBlockNumber
// treats an empty result without an error as "block not found".
GetBlockByNumber(_ context.Context, number rpc.BlockNumber, fullTx bool) (map[string]interface{}, error)
}
22 changes: 22 additions & 0 deletions cmd/rpcdaemon/health/parse_api.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package health

import (
"github.com/ledgerwatch/erigon/rpc"
)

// parseAPI scans the registered RPC services for implementations of NetAPI
// and EthAPI.
//
// Entries without a service are skipped. When several services satisfy the
// same interface the last one in api wins, and a result is left nil when no
// service satisfies its interface.
func parseAPI(api []rpc.API) (netAPI NetAPI, ethAPI EthAPI) {
	for i := range api {
		service := api[i].Service
		if service == nil {
			continue
		}

		// The two assertions are independent: a single service may
		// satisfy both interfaces.
		if candidate, ok := service.(NetAPI); ok {
			netAPI = candidate
		}

		if candidate, ok := service.(EthAPI); ok {
			ethAPI = candidate
		}
	}
	return netAPI, ethAPI
}

0 comments on commit 9848028

Please sign in to comment.