From e441b45d717e26b6f447c55b33110c9e39e50371 Mon Sep 17 00:00:00 2001 From: mDuo13 Date: Mon, 29 Jun 2020 17:10:44 -0700 Subject: [PATCH 1/5] Health Check method: set up IA --- .../the-rippled-server/peer-protocol.md | 2 +- .../peer-port-methods/health-check.md | 5 +++ .../{ => peer-port-methods}/peer-crawler.md | 2 +- .../{ => peer-port-methods}/validator-list.md | 0 dactyl-config.yml | 37 ++++++++++++++++--- 5 files changed, 38 insertions(+), 8 deletions(-) create mode 100644 content/references/rippled-api/peer-port-methods/health-check.md rename content/references/rippled-api/{ => peer-port-methods}/peer-crawler.md (93%) rename content/references/rippled-api/{ => peer-port-methods}/validator-list.md (100%) diff --git a/content/concepts/the-rippled-server/peer-protocol.md b/content/concepts/the-rippled-server/peer-protocol.md index f61e95ce8cc..6a89c2b734a 100644 --- a/content/concepts/the-rippled-server/peer-protocol.md +++ b/content/concepts/the-rippled-server/peer-protocol.md @@ -39,7 +39,7 @@ ip = 0.0.0.0 protocol = peer ``` -The peer protocol port also serves the [special Peer Crawler API method](peer-crawler.html). +The peer protocol port also serves [special peer port methods](peer-port-methods.html). ## Node Key Pair diff --git a/content/references/rippled-api/peer-port-methods/health-check.md b/content/references/rippled-api/peer-port-methods/health-check.md new file mode 100644 index 00000000000..15bea70139b --- /dev/null +++ b/content/references/rippled-api/peer-port-methods/health-check.md @@ -0,0 +1,5 @@ +# Health Check + +The Health Check is a special [peer port method](peer-port-methods.html) for reporting on the health of an individual `rippled` server. + +***TODO: detailed description. 
PR: *** diff --git a/content/references/rippled-api/peer-crawler.md b/content/references/rippled-api/peer-port-methods/peer-crawler.md similarity index 93% rename from content/references/rippled-api/peer-crawler.md rename to content/references/rippled-api/peer-port-methods/peer-crawler.md index d81fb93ab60..36093f41dd7 100644 --- a/content/references/rippled-api/peer-crawler.md +++ b/content/references/rippled-api/peer-port-methods/peer-crawler.md @@ -1,6 +1,6 @@ # Peer Crawler -The Peer Crawler is a special API endpoint for reporting on the health and topology of the peer-to-peer network. This API method is available by default on a non-privileged basis through the [Peer Protocol](peer-protocol.html) port, which is also used for `rippled` servers' peer-to-peer communications about consensus, ledger history, and other necessary information. +The Peer Crawler is a special [peer port method](peer-port-methods.html) for reporting on the health and topology of the peer-to-peer network. This API method is available by default on a non-privileged basis through the [Peer Protocol](peer-protocol.html) port, which is also used for `rippled` servers' peer-to-peer communications about consensus, ledger history, and other necessary information. The information reported by the peer crawler is effectively public, and can be used to report on the overall XRP Ledger network, its health, and topology. 
diff --git a/content/references/rippled-api/validator-list.md b/content/references/rippled-api/peer-port-methods/validator-list.md similarity index 100% rename from content/references/rippled-api/validator-list.md rename to content/references/rippled-api/peer-port-methods/validator-list.md diff --git a/dactyl-config.yml b/dactyl-config.yml index 69c62815d3d..0011973f0e5 100644 --- a/dactyl-config.yml +++ b/dactyl-config.yml @@ -5868,35 +5868,60 @@ pages: targets: - ja - - md: references/rippled-api/peer-crawler.md + # TODO: translate title & blurb + - name: Peer Port Methods + html: peer-port-methods.html + funnel: Docs + doc_type: References + supercategory: rippled API + category: Peer Port Methods + template: template-landing-children.html + blurb: Special API methods for sharing network topology and status metrics. + targets: + - en + - ja + + # TODO: translate page & blurb + - md: references/rippled-api/peer-port-methods/health-check.md + html: health-check.html + funnel: Docs + doc_type: References + supercategory: rippled API + category: Peer Port Methods + blurb: Special API method for reporting server health. + targets: + - en + - ja + + - md: references/rippled-api/peer-port-methods/peer-crawler.md html: peer-crawler.html funnel: Docs doc_type: References supercategory: rippled API - category: Peer Crawler + category: Peer Port Methods blurb: Special API method for sharing network topology and status metrics. 
targets: - en # TODO: translate page - - md: references/rippled-api/peer-crawler.md + - md: references/rippled-api/peer-port-methods/peer-crawler.md html: peer-crawler.html funnel: Docs doc_type: References supercategory: rippled API - category: Peer Crawler + category: Peer Port Methods blurb: ネットワークトポロジーとステータスメトリックを共有するための特殊なAPIメソッドです。 untranslated_warning: true targets: - ja # TODO: translate page & blurb - - md: references/rippled-api/validator-list.md + - md: references/rippled-api/peer-port-methods/validator-list.md html: validator-list.html funnel: Docs doc_type: References supercategory: rippled API - category: Validator List + category: Peer Port Methods blurb: Special API method for sharing recommended validator lists. targets: - en From 0f807c6e09b51f56f553ef461249e698c9d04d15 Mon Sep 17 00:00:00 2001 From: mDuo13 Date: Wed, 1 Jul 2020 20:43:08 -0700 Subject: [PATCH 2/5] Health check details (+related edits) --- .../amendments/amendments.md | 2 + .../peer-port-methods/health-check.md | 65 ++++++++++- ...onnect-your-rippled-to-the-xrp-test-net.md | 4 +- .../installation/build-run-rippled-macos.md | 12 +- .../installation/capacity-planning.md | 8 +- ...install-rippled-on-centos-rhel-with-yum.md | 37 +++++++ .../health-check-interventions.md | 104 ++++++++++++++++++ dactyl-config.yml | 12 ++ 8 files changed, 233 insertions(+), 11 deletions(-) create mode 100644 content/tutorials/manage-the-rippled-server/troubleshooting/health-check-interventions.md diff --git a/content/concepts/consensus-network/amendments/amendments.md b/content/concepts/consensus-network/amendments/amendments.md index de5008f0e09..528eb9dbb87 100644 --- a/content/concepts/consensus-network/amendments/amendments.md +++ b/content/concepts/consensus-network/amendments/amendments.md @@ -92,6 +92,8 @@ The amendments that a `rippled` server is configured to vote for or against have If your server is amendment blocked, you must [upgrade to a new version](install-rippled.html) to sync with the 
network. +It is also possible to be amendment blocked because you connected your server to a [parallel network](parallel-networks.html) that has different amendments enabled. For example, the XRP Ledger Devnet typically has upcoming and experimental amendments enabled. If you are using the latest production release, your server is likely to be amendment blocked when connecting to Devnet. You could resolve this issue by upgrading to an unstable pre-release or nightly build, or you could [connect to a different network such as Testnet](connect-your-rippled-to-the-xrp-test-net.html) instead. + #### How to Tell If Your `rippled` Server Is Amendment Blocked diff --git a/content/references/rippled-api/peer-port-methods/health-check.md b/content/references/rippled-api/peer-port-methods/health-check.md index 15bea70139b..268834aacf5 100644 --- a/content/references/rippled-api/peer-port-methods/health-check.md +++ b/content/references/rippled-api/peer-port-methods/health-check.md @@ -1,5 +1,66 @@ # Health Check +[[Source]](https://github.com/ripple/rippled/blob/de0c52738785de8bf837f9124da65c7905e7bb5a/src/ripple/overlay/impl/OverlayImpl.cpp#L1084-L1168 "Source") -The Health Check is a special [peer port method](peer-port-methods.html) for reporting on the health of an individual `rippled` server. +The Health Check is a special [peer port method](peer-port-methods.html) for reporting on the health of an individual `rippled` server. This method is intended for use in automated monitoring to recognize outages and prompt automated or manual interventions such as restarting the server. [New in: rippled 1.6.0][] -***TODO: detailed description. PR: *** +This method checks several metrics to see if they are in ranges generally considered healthy. If all metrics are in normal ranges, this method reports that the server is healthy. If any metric is outside normal ranges, this method reports that the server is unhealthy and reports the metric(s) that were unhealthy. 
Since some metrics may rapidly fluctuate into and out of unhealthy ranges, it is recommended not to raise alerts unless the health check fails multiple times in a row. + +**Note:** Since the health check is a [peer port method](peer-port-methods.html), it is not available when testing the server in [stand-alone mode](rippled-server-modes.html#reasons-to-run-a-rippled-server-in-stand-alone-mode). + + +## Request Format + +To request the Health Check information, make the following HTTP request: + +- **Protocol:** https +- **HTTP Method:** GET +- **Host:** (any `rippled` server, by hostname or IP address) +- **Port:** (the port number where the `rippled` server uses the Peer Protocol, typically 51235) +- **Path:** `/health` +- **Security:** Most `rippled` servers use a self-signed certificate to respond to the request. By default, most tools (including web browsers) flag or block such responses for being untrusted. You must ignore the certificate checking (for example, if using cURL, add the `--insecure` flag) to display a response from those servers. + + + +## Example Response + +```json +HTTP/1.1 503 Service Unavailable +Server: rippled-1.6.0 +Content-Type: application/json +Connection: close +Transfer-Encoding: chunked + +{ + "info": { + "load_factor": 256, + "server_state": "connected", + "validated_ledger": 2147483647 + } +} +``` + +## Response Format + +If the server is in a **critical** state, the response has the status code **503 Service Unavailable**. If the server is **healthy** or in a **warning** state, the response has the status code **200 OK**. + +In either case, the response body is a JSON object with a single `info` object at the top level. The info object contains values for each metric that is in a warning or critical range. The response omits metrics that are in a healthy range, so a fully healthy server has an empty object. 
+ +The `info` object may contain the following fields: + +| `Field` | Value | Description | +|:--------------------|:--------|:---------------------------------------------| +| `amendment_blocked` | Boolean | _(May be omitted)_ If `true`, the server is [amemdment blocked](amendments.html#amendment-blocked) and must be upgraded to remain synced with the network; this state is critical. If the server is not amendment blocked, this field is omitted. | +| `load_factor` | Number | _(May be omitted)_ A measure of the overall load the server is under. This reflects I/O, CPU, and memory limitations. This is a warning if the load factor is over 100, or critical if the load factor is 1000 or higher. | +| `peers` | Number | _(May be omitted)_ The number of [peer servers](peer-protocol.html) this server is connected to. This is a warning if connected to 7 or fewer peers, and critical if connected to zero peers. | +| `server_state` | String | _(May be omitted)_ The current [server state](rippled-server-states.html). This is a warning if the server is in the `tracking`, `syncing`, or `connected` states. This is critical if the server is in the `disconnected` state. | +| `validated_ledger` | Number | _(May be omitted)_ The number of seconds since the last time a ledger was validated by consensus. If there is no validated ledger available, this is a very large integer value such as `2147483647` (architecture-dependent). This is a warning if the last validated ledger was at least 7 seconds ago, and critical if the last validated ledger was at least 20 seconds ago. | + +## See Also + +For guidance interpreting the results of the health check, see [Health Check Interventions](health-check-interventions.html). 
+ + + +{% include '_snippets/rippled-api-links.md' %} +{% include '_snippets/tx-type-links.md' %} +{% include '_snippets/rippled_versions.md' %} diff --git a/content/tutorials/manage-the-rippled-server/configuration/connect-your-rippled-to-the-xrp-test-net.md b/content/tutorials/manage-the-rippled-server/configuration/connect-your-rippled-to-the-xrp-test-net.md index 413de1ecee0..fccad80b782 100644 --- a/content/tutorials/manage-the-rippled-server/configuration/connect-your-rippled-to-the-xrp-test-net.md +++ b/content/tutorials/manage-the-rippled-server/configuration/connect-your-rippled-to-the-xrp-test-net.md @@ -1,8 +1,8 @@ -# Connect Your rippled to an XRPL Altnet +# Connect Your rippled to a Parallel Network Ripple has created [alternative test and development networks](parallel-networks.html) for developers to test their apps on the latest non-production version of the XRP Ledger (Testnet) or to test and experiment with features on the latest beta version (Devnet). **The funds used on these networks are not real funds and are intended for testing only.** You can connect your [`rippled` server](the-rippled-server.html) to either the Testnet or Devnet. -**Note:** The XRP Testnet and Devnet ledger and balances are reset on a regular basis. +**Caution:** The Devnet frequently has new and experimental [amendments](amendments.html) enabled, so the latest production release version is likely to be amendment blocked when connecting to Devnet. You should use a pre-release or nightly build when connecting to Devnet. 
To connect your `rippled` server to the XRP Testnet or Devnet, set the following configurations: diff --git a/content/tutorials/manage-the-rippled-server/installation/build-run-rippled-macos.md b/content/tutorials/manage-the-rippled-server/installation/build-run-rippled-macos.md index f2a315e2b6c..ea86e56aa8e 100644 --- a/content/tutorials/manage-the-rippled-server/installation/build-run-rippled-macos.md +++ b/content/tutorials/manage-the-rippled-server/installation/build-run-rippled-macos.md @@ -53,17 +53,21 @@ For development purposes, run `rippled` as a non-admin user, not using `sudo`. $ git clone git@github.com:ripple/rippled.git $ cd rippled -0. By default, cloning puts you on the `develop` branch. Use this branch if you are doing development work and want the latest set of untested features. +0. Switch to the appropriate branch for the software version you want: - If you want the latest stable release, checkout the `master` branch. + For the latest stable release, use the `master` branch. $ git checkout master - If you want to test out the latest release candidate, checkout the `release` branch: + For the latest release candidate, use the `release` branch: $ git checkout release - Or, you can checkout one of the tagged releases listed on [GitHub](https://github.com/ripple/rippled/releases). + For the latest in-progress version, use the `develop` branch: + + $ git checkout develop + + Or, you can checkout one of the tagged releases listed on [GitHub](https://github.com/ripple/rippled/releases). 0. In the `rippled` directory you just cloned, create your build directory and access it. 
For example: diff --git a/content/tutorials/manage-the-rippled-server/installation/capacity-planning.md b/content/tutorials/manage-the-rippled-server/installation/capacity-planning.md index a7684f72509..5252c636ce0 100644 --- a/content/tutorials/manage-the-rippled-server/installation/capacity-planning.md +++ b/content/tutorials/manage-the-rippled-server/installation/capacity-planning.md @@ -177,17 +177,19 @@ Memory requirements are mainly a function of the `node_size` configuration setti #### Network -Any enterprise or carrier-class data center should have substantial network bandwidth to support running `rippled` servers. +Any enterprise or carrier-class data center should have substantial network bandwidth to support running `rippled` servers. The actual bandwidth necessary varies significantly based on the current transaction volume in the network. Server behavior (such as backfilling [ledger history](ledger-history.html)) also affects network use. + +During periods of exceptionally high transaction volume, some operators have reported that their `rippled` servers have completely saturated a 100 Mbit/s network link. Therefore, a gigabit network interface is required for reliable performance. 
Here are examples of observed network bandwidth use for common `rippled` tasks: | Task | Transmit/Receive | |:------------------------------------------------|:---------------------------| -| Process current transaction volumes | 2Mbps transmit, 2 Mbps receive | +| Process average transaction volumes | 2Mbps transmit, 2 Mbps receive | +| Process peak transaction volumes | >100Mbps transmit | | Serve historical ledger and transaction reports | 100Mbps transmit | | Start up `rippled` | 20Mbps receive | - ## See Also - **Concepts:** diff --git a/content/tutorials/manage-the-rippled-server/installation/install-rippled-on-centos-rhel-with-yum.md b/content/tutorials/manage-the-rippled-server/installation/install-rippled-on-centos-rhel-with-yum.md index caa14c11017..dcc33aa187c 100644 --- a/content/tutorials/manage-the-rippled-server/installation/install-rippled-on-centos-rhel-with-yum.md +++ b/content/tutorials/manage-the-rippled-server/installation/install-rippled-on-centos-rhel-with-yum.md @@ -14,6 +14,16 @@ Before you install `rippled`, you must meet the [System Requirements](system-req 1. 
Install the Ripple RPM repository: + Choose the appropriate RPM repository for the stability of releases you want: + + - `stable` for the latest production release (`master` branch) + - `unstable` for pre-release builds (`release` branch) + - `nightly` for experimental/development builds (`develop` branch) + + + + *Stable* + $ cat << REPOFILE | sudo tee /etc/yum.repos.d/ripple.repo [ripple-stable] name=XRP Ledger Packages @@ -24,6 +34,33 @@ Before you install `rippled`, you must meet the [System Requirements](system-req repo_gpgcheck=1 REPOFILE + *Pre-release* + + $ cat << REPOFILE | sudo tee /etc/yum.repos.d/ripple.repo + [ripple-unstable] + name=XRP Ledger Packages + baseurl=https://repos.ripple.com/repos/rippled-rpm/unstable/ + enabled=1 + gpgcheck=0 + gpgkey=https://repos.ripple.com/repos/rippled-rpm/unstable/repodata/repomd.xml.key + repo_gpgcheck=1 + REPOFILE + + *Development* + + $ cat << REPOFILE | sudo tee /etc/yum.repos.d/ripple.repo + [ripple-nightly] + name=XRP Ledger Packages + baseurl=https://repos.ripple.com/repos/rippled-rpm/nightly/ + enabled=1 + gpgcheck=0 + gpgkey=https://repos.ripple.com/repos/rippled-rpm/nightly/repodata/repomd.xml.key + repo_gpgcheck=1 + REPOFILE + + + + 2. Fetch the latest repo updates: $ sudo yum -y update diff --git a/content/tutorials/manage-the-rippled-server/troubleshooting/health-check-interventions.md b/content/tutorials/manage-the-rippled-server/troubleshooting/health-check-interventions.md new file mode 100644 index 00000000000..4eb0aba822f --- /dev/null +++ b/content/tutorials/manage-the-rippled-server/troubleshooting/health-check-interventions.md @@ -0,0 +1,104 @@ +# Health Check Interventions + +The [Health Check method](health-check.html) can be used by automated monitoring to recognize when a `rippled` server is not healthy and prompt interventions such as restarting the server or alerting a human administrator. 
+ +Infrastructure monitoring, and reliability engineering more generally, is an advanced discipline that involves using multiple sources of data to make decisions in context. This document provides some suggestions for how to use the health check most effectively, but these recommendations are only meant as guidelines as part of a larger strategy. + +## Momentary Failures + +Some metrics in the health check can rapidly fluctuate into unhealthy ranges and then recover automatically shortly afterward. It is unnecessary and undesirable to raise alerts every single time the health check reports an unhealthy status. An automated monitoring system should call the health check method frequently, but only escalate to a higher level of intervention based on the severity and frequency of the problem. + +For example, if you check the health of the server once per second, you might raise an alert if the server reports "warning" status three times in a row, or four times in a five-second span. You might also raise an alert if the server reports "critical" status twice in a five-second span. + +## Special Cases + +Certain server configurations may always report a `warning` status even when operating normally. If your server qualifies as a special case, you must configure your automated monitoring to recognize the difference between the normal status and an actual problem. This probably involves parsing the JSON response body for the health check method and comparing the values there with expected normal ranges. + +Some examples of special cases that may occur include: + +- A [private peer](peer-protocol.html#private-peers) typically has a very small number of peer-to-peer connections to known servers only, but the health check reports a warning on the `peers` metric if the server is connected to 7 or fewer peers. You should know the exact number of peers your server is configured to have and check for that value. 
+- On a [parallel or test network](parallel-networks.html) that is not very busy, the network waits up to 20 seconds for new transactions before attempting to validate a new ledger version, but the health check reports a warning on the `validated_ledger` metric if the latest validated ledger is 7 or more seconds old. If you are running `rippled` on a non-production network, you may want to ignore `warning` messages for this metric unless you know that there should be transactions being regularly sent. You may still want to alert on the `critical` level of 20 seconds, because the XRP Ledger protocol is designed to validate new ledger versions at least once every 20 seconds even if there are no new transactions to process. + +## Suggested Interventions + +When a health check fails, and it's not just a [momentary failure](#momentary-failures), the action to take to recover from the outage varies based on the cause. Some failures can be fixed with steps that your infrastructure can take automatically based on specific criteria. Other failures may require the intervention of a human administrator who can investigate and take the necessary steps to resolve more complex or critical failures. How and when you respond is likely to depend on your unique situation and infrastructure, but the metrics reported in the health check result can be a factor in these decisions. + +The following sections suggest some common interventions you may want to attempt and the health check statuses most likely to prompt those interventions. 
Automated systems and human administrators may selectively escalate through these and other interventions: + +- [Redirect traffic](#redirect-traffic) away from the affected server +- [Restart](#restart) the server software or hardware +- [Investigate network](#investigate-network) in case the problem originates elsewhere +- [Replace hardware](#replace-hardware) +- [Upgrade](#upgrade) the `rippled` software + + +### Redirect Traffic + +A common reliability technique is to run a pool of redundant servers through one or more load-balancing proxies. You can do this with `rippled` servers, but should not do this with [validators](rippled-server-modes.html). In some cases, the load balancers can monitor the health of servers in their pools and direct traffic only to the servers that are currently reporting themselves as healthy. This allows servers to recover from being temporarily overloaded and automatically rejoin the pool of active servers. + +Redirecting traffic away from a server that is unhealthy is an appropriate response, especially for servers that report a `health` status of `warning`. Servers in the `critical` range may need more significant interventions. + + +### Restart + +The most straightforward intervention is to restart the server. This can resolve temporary issues with several types of failures, including any of the following metrics: + +- `load_factor` +- `peers` +- `server_state` +- `validated_ledger` + +To restart only the `rippled` service, use `systemctl`: + +``` +$ sudo systemctl restart rippled.service +``` + +A stronger intervention is to restart the entire machine. + +**Caution:** After a server starts, it typically needs up to 15 minutes to sync to the network. During this time, the health check is likely to report a critical or warning status. You should be sure your automated systems give servers enough time to sync before restarting them again. 
+ + +### Investigate Network + +An unreliable or insufficient network connection can cause a server to report outages. Warning or critical values in the following metrics can indicate network problems: + +- `peers` +- `server_state` +- `validated_ledger` + +In this case, the necessary interventions may involve changes to other systems, such as: + +- Adjusting firewall rules to allow necessary traffic to reach a server, or to block harmful traffic from outside +- Restarting or replacing network interfaces, switches, routers, or cabling +- Contacting other network service providers to resolve an issue on their end + + + +### Replace Hardware + +If the outage is caused by a hardware failure or by higher load than the hardware is capable of handling, it may be necessary to replace some components or even the entire server. + +The amount of load on a server in the XRP Ledger depends in part on transaction volume in the network, which varies organically. Load also depends on your usage pattern. See [Capacity Planning](capacity-planning.html) for how to plan the appropriate hardware and settings for your situation. + +Warning or critical values for the following metrics may indicate insufficient hardware: + +- `load_factor` +- `server_state` +- `validated_ledger` + + +### Upgrade + +If the server reports `"amendment_blocked": true` in the health check, this indicates that the XRP Ledger has enabled a [protocol amendment](amendments.html) that your server does not understand. As a precaution against misinterpreting the revised rules of the network in a way that causes you to lose money, such servers become "amendment blocked" instead of operating normally. + +The proper way to resolve being amendment blocked is to [update your server](install-rippled.html) to a newer software version that understands the amendment. 
+ + + + + + +{% include '_snippets/rippled-api-links.md' %} +{% include '_snippets/tx-type-links.md' %} +{% include '_snippets/rippled_versions.md' %} diff --git a/dactyl-config.yml b/dactyl-config.yml index 0011973f0e5..ac158de17d9 100644 --- a/dactyl-config.yml +++ b/dactyl-config.yml @@ -2758,6 +2758,18 @@ pages: targets: - en + # TODO: translate page and blurb + - md: tutorials/manage-the-rippled-server/troubleshooting/health-check-interventions.md + html: health-check-interventions.html + funnel: Docs + doc_type: Tutorials + category: Manage the rippled Server + subcategory: Troubleshooting rippled + blurb: Use the rippled server's health check as part of automated infrastructure monitoring. + targets: + - en + - ja + - md: tutorials/manage-the-rippled-server/troubleshooting/diagnosing-problems.ja.md html: diagnosing-problems.html funnel: Docs From c82994068b01a57704b41d2d04abd8102e721dfa Mon Sep 17 00:00:00 2001 From: mDuo13 Date: Mon, 6 Jul 2020 16:20:50 -0700 Subject: [PATCH 3/5] Health Check: edits/cleanup --- .../references/rippled-api/peer-port-methods/health-check.md | 2 +- .../troubleshooting/health-check-interventions.md | 4 ++-- dactyl-config.yml | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/content/references/rippled-api/peer-port-methods/health-check.md b/content/references/rippled-api/peer-port-methods/health-check.md index 268834aacf5..a7405f8df10 100644 --- a/content/references/rippled-api/peer-port-methods/health-check.md +++ b/content/references/rippled-api/peer-port-methods/health-check.md @@ -49,7 +49,7 @@ The `info` object may contain the following fields: | `Field` | Value | Description | |:--------------------|:--------|:---------------------------------------------| -| `amendment_blocked` | Boolean | _(May be omitted)_ If `true`, the server is [amemdment blocked](amendments.html#amendment-blocked) and must be upgraded to remain synced with the network; this state is critical. 
If the server is not amendment blocked, this field is omitted. | +| `amendment_blocked` | Boolean | _(May be omitted)_ If `true`, the server is [amendment blocked](amendments.html#amendment-blocked) and must be upgraded to remain synced with the network; this state is critical. If the server is not amendment blocked, this field is omitted. | | `load_factor` | Number | _(May be omitted)_ A measure of the overall load the server is under. This reflects I/O, CPU, and memory limitations. This is a warning if the load factor is over 100, or critical if the load factor is 1000 or higher. | | `peers` | Number | _(May be omitted)_ The number of [peer servers](peer-protocol.html) this server is connected to. This is a warning if connected to 7 or fewer peers, and critical if connected to zero peers. | | `server_state` | String | _(May be omitted)_ The current [server state](rippled-server-states.html). This is a warning if the server is in the `tracking`, `syncing`, or `connected` states. This is critical if the server is in the `disconnected` state. | diff --git a/content/tutorials/manage-the-rippled-server/troubleshooting/health-check-interventions.md b/content/tutorials/manage-the-rippled-server/troubleshooting/health-check-interventions.md index 4eb0aba822f..2ea9680a4ca 100644 --- a/content/tutorials/manage-the-rippled-server/troubleshooting/health-check-interventions.md +++ b/content/tutorials/manage-the-rippled-server/troubleshooting/health-check-interventions.md @@ -17,11 +17,11 @@ Certain server configurations may always report a `warning` status even when ope Some examples of special cases that may occur include: - A [private peer](peer-protocol.html#private-peers) typically has a very small number of peer-to-peer connections to known servers only, but the health check reports a warning on the `peers` metric if the server is connected to 7 or fewer peers. You should know the exact number of peers your server is configured to have and check for that value. 
-- On a [parallel or test network](parallel-networks.html) that is not very busy, the network waits up to 20 seconds for new transactions before attempting to validate a new ledger version, but the health check reports a warning on the `validated_ledger` metric if the latest validated ledger is 7 or more seconds old. If you are running `rippled` on a non-production network, you may want to ignore `warning` messages for this metric unless you know that there should be transactions being regularly sent. You may still want to alert on the `critical` level of 20 seconds, because the XRP Ledger protocol is designed to validate new ledger versions at least once every 20 seconds even if there are no new transactions to process. +- On a [parallel or test network](parallel-networks.html) where new transactions are not being sent continuously, the network waits up to 20 seconds for new transactions before attempting to validate a new ledger version, but the health check reports a warning on the `validated_ledger` metric if the latest validated ledger is 7 or more seconds old. If you are running `rippled` on a non-production network, you may want to ignore `warning` messages for this metric unless you know that there should be transactions being regularly sent. You may still want to alert on the `critical` level of 20 seconds, because the XRP Ledger protocol is designed to validate new ledger versions at least once every 20 seconds even if there are no new transactions to process. ## Suggested Interventions -When a health check fails, and it's not just a [momentary failure](#momentary-failures), the action to take to recover from the outage varies based on the cause. Some failures can be fixed with steps that your infrastructure can take automatically based on specific criteria. Other failures may require the intervention of a human administrator who can investigate and take the necessary steps to resolve more complex or critical failures. 
How and when you respond is likely to depend on your unique situation and infrastructure, but the metrics reported in the health check result can be a factor in these decisions. +When a health check fails, and it's not just a [momentary failure](#momentary-failures), the action to take to recover from the outage varies based on the cause. You may be able to configure your infrastructure to fix some types of failures automatically. Other failures require the intervention of a human administrator who can investigate and take the necessary steps to resolve more complex or critical failures; depending on the structure of your organization, you may have different levels of human administrators, so that less-skilled, lower-level administrators can fix certain issues independently but must escalate to higher-level administrators to fix larger or more complex issues. How and when you respond is likely to depend on your unique situation, but the metrics reported in the health check result can be a factor in these decisions. The following sections suggest some common interventions you may want to attempt and the health check statuses most likely to prompt those interventions. 
Automated systems and human administrators may selectively escalate through these and other interventions: diff --git a/dactyl-config.yml b/dactyl-config.yml index ac158de17d9..74424430a32 100644 --- a/dactyl-config.yml +++ b/dactyl-config.yml @@ -129,6 +129,8 @@ targets: "transaction-metadata.html#affectednodes": "transaction-metadata.html" # Fix link from untranslated peer-crawler.html: "peer-protocol.html#private-peers": "peer-protocol.html#プライベートピア" + # Fix link from untranslated health-check.html: + "rippled-server-modes.html#reasons-to-run-a-rippled-server-in-stand-alone-mode": "rippled-server-modes.html#rippledサーバーをスタンドアロンモードで実行する理由" - name: xrp-api-only From fa277b719772b76a8b62a70b2c43c7c2505ad647 Mon Sep 17 00:00:00 2001 From: mDuo13 Date: Wed, 8 Jul 2020 13:39:28 -0700 Subject: [PATCH 4/5] Health Check edits: - revised per reviews - updated for rippled#3491 --- .../peer-port-methods/health-check.md | 63 ++++++++++++++++--- .../health-check-interventions.md | 30 +++++---- 2 files changed, 71 insertions(+), 22 deletions(-) diff --git a/content/references/rippled-api/peer-port-methods/health-check.md b/content/references/rippled-api/peer-port-methods/health-check.md index a7405f8df10..879094e7114 100644 --- a/content/references/rippled-api/peer-port-methods/health-check.md +++ b/content/references/rippled-api/peer-port-methods/health-check.md @@ -3,14 +3,14 @@ The Health Check is a special [peer port method](peer-port-methods.html) for reporting on the health of an individual `rippled` server. This method is intended for use in automated monitoring to recognize outages and prompt automated or manual interventions such as restarting the server. [New in: rippled 1.6.0][] -This method checks several metrics to see if they are in ranges generally considered healthy. If all metrics are in normal ranges, this method reports that the server is healthy. 
If any metric is outside normal ranges, this method reports that the server is unhealthy and reports the metric(s) that were unhealthy. Since some metrics may rapidly fluctuate into and out of unhealthy ranges, it is recommended not to raise alerts unless the health check fails multiple times in a row. +This method checks several metrics to see if they are in ranges generally considered healthy. If all metrics are in normal ranges, this method reports that the server is healthy. If any metric is outside normal ranges, this method reports that the server is unhealthy and reports the metric(s) that are unhealthy. Since some metrics may rapidly fluctuate into and out of unhealthy ranges, you should not raise alerts unless the health check fails multiple times in a row. **Note:** Since the health check is a [peer port method](peer-port-methods.html), it is not available when testing the server in [stand-alone mode](rippled-server-modes.html#reasons-to-run-a-rippled-server-in-stand-alone-mode). 
## Request Format -To request the Peer Crawler information, make the following HTTP request: +To request the Health Check information, make the following HTTP request: - **Protocol:** https - **HTTP Method:** GET @@ -23,6 +23,24 @@ To request the Peer Crawler information, make the following HTTP request: ## Example Response + + +*Healthy* + +```json +HTTP/1.1 200 OK +Server: rippled-1.6.0-b8 +Content-Type: application/json +Connection: close +Transfer-Encoding: chunked + +{ + "info": {} +} +``` + +*Warning* + ```json HTTP/1.1 503 Service Unavailable Server: rippled-1.6.0 @@ -31,19 +49,44 @@ Connection: close Transfer-Encoding: chunked { - "info": { - "load_factor": 256, - "server_state": "connected", - "validated_ledger": 2147483647 - } + "info": { + "server_state": "connected", + "validated_ledger": -1 + } } ``` +*Critical* + +```json +HTTP/1.1 500 Internal Server Error +Server: rippled-1.6.0 +Content-Type: application/json +Connection: close +Transfer-Encoding: chunked + +{ + "info": { + "peers": 0, + "server_state": "disconnected", + "validated_ledger": -1 + } +} +``` + + + ## Response Format -If the server is in a **critical** state, the response has the status code **503 Service Unavailable**. If the server is **healthy** or in a **warning** state, the response has the status code **200 OK**. +The response's HTTP status code indicates the health of the server: + +| Status Code | Health Status | Description | +|:------------------------------|:--------------|:-----------------------------| +| **200 OK** | Healthy | All health metrics are within acceptable ranges. | +| **503 Service Unavailable** | Warning | One or more metrics are in the warning range. Manual intervention may or may not be necessary. | +| **500 Internal Server Error** | Critical | One or more metrics are in the critical range. There is a serious problem that probably needs manual intervention to fix. | -In either case, the response body is a JSON object with a single `info` object at the top level. 
The info object contains values for each metric that is in a warning or critical range. The response omits metrics that are in a healthy range, so a fully healthy server has an empty object. +The response body is a JSON object with a single `info` object at the top level. The `info` object contains values for each metric that is in a warning or critical range. The response omits metrics that are in a healthy range, so a fully healthy server has an empty object. The `info` object may contain the following fields: @@ -53,7 +96,7 @@ The `info` object may contain the following fields: | `load_factor` | Number | _(May be omitted)_ A measure of the overall load the server is under. This reflects I/O, CPU, and memory limitations. This is a warning if the load factor is over 100, or critical if the load factor is 1000 or higher. | | `peers` | Number | _(May be omitted)_ The number of [peer servers](peer-protocol.html) this server is connected to. This is a warning if connected to 7 or fewer peers, and critical if connected to zero peers. | | `server_state` | String | _(May be omitted)_ The current [server state](rippled-server-states.html). This is a warning if the server is in the `tracking`, `syncing`, or `connected` states. This is critical if the server is in the `disconnected` state. | -| `validated_ledger` | Number | _(May be omitted)_ The number of seconds since the last time a ledger was validated by consensus. If there is no validated ledger available, this is a very large integer value such as `2147483647` (architecture-dependent). This is a warning if the last validated ledger was at least 7 seconds ago, and critical if the last validated ledger was at least 20 seconds ago. | +| `validated_ledger` | Number | _(May be omitted)_ The number of seconds since the last time a ledger was validated by [consensus](intro-to-consensus.html). 
If there is no validated ledger available ([as during the initial sync period when starting the server](server-doesnt-sync.html#normal-syncing-behavior)), this is the value `-1` and is considered a warning. This metric is also a warning if the last validated ledger was at least 7 seconds ago, or critical if the last validated ledger was at least 20 seconds ago. | ## See Also diff --git a/content/tutorials/manage-the-rippled-server/troubleshooting/health-check-interventions.md b/content/tutorials/manage-the-rippled-server/troubleshooting/health-check-interventions.md index 2ea9680a4ca..8043c808a42 100644 --- a/content/tutorials/manage-the-rippled-server/troubleshooting/health-check-interventions.md +++ b/content/tutorials/manage-the-rippled-server/troubleshooting/health-check-interventions.md @@ -6,10 +6,12 @@ Infrastructure monitoring, and reliability engineering more generally, is an adv ## Momentary Failures -Some metrics in the health check can rapidly fluctuate into unhealthy ranges and then recover automatically shortly afterward. It is unnecessary and undesirable to raise alerts every single time the health check reports an unhealthy status. An automated monitoring system should call the health check method frequently, but only escalate to a higher level of intervention based on the severity and frequency of the problem. +Some [metrics][] in the health check can rapidly fluctuate into unhealthy ranges and then recover automatically shortly afterward. It is unnecessary and undesirable to raise alerts every single time the health check reports an unhealthy status. An automated monitoring system should call the health check method frequently, but only escalate to a higher level of intervention based on the severity and frequency of the problem. For example, if you check the health of the server once per second, you might raise an alert if the server reports "warning" status three times in a row, or four times in a five-second span. 
You might also raise an alert if the server reports "critical" status twice in a five-second span. +**Tip:** The server normally reports a "critical" status for the first few seconds after startup, switches to a "warning" status after it establishes a connection to the network, and finally reports a "healthy" status when it has fully synced to the network. After a restart, you should give a server 5–15 minutes to sync before taking additional interventions. + ## Special Cases Certain server configurations may always report a `warning` status even when operating normally. If your server qualifies as a special case, you must configure your automated monitoring to recognize the difference between the normal status and an actual problem. This probably involves parsing the JSON response body for the health check method and comparing the values there with expected normal ranges. @@ -27,9 +29,9 @@ The following sections suggest some common interventions you may want to attempt - [Redirect traffic](#redirect-traffic) away from the affected server - [Restart](#restart) the server software or hardware +- [Upgrade](#upgrade) the `rippled` software - [Investigate network](#investigate-network) in case the problem originates elsewhere - [Replace hardware](#replace-hardware) -- [Upgrade](#upgrade) the `rippled` software ### Redirect Traffic @@ -41,7 +43,7 @@ Redirecting traffic away from a server that is unhealthy is an appropriate respo ### Restart -The most straightforward intervention is to restart the server. This can resolve temporary issues with several types of failures, including any of the following metrics: +The most straightforward intervention is to restart the server. This can resolve temporary issues with several types of failures, including any of the following [metrics][]: - `load_factor` - `peers` @@ -59,9 +61,18 @@ A stronger intervention is to restart the entire machine. 
**Caution:** After a server starts, it typically needs up to 15 minutes to sync to the network. During this time, the health check is likely to report a critical or warning status. You should be sure your automated systems give servers enough time to sync before restarting them again. +### Upgrade + +If the server reports `"amendment_blocked": true` in the health check, this indicates that the XRP Ledger has enabled a [protocol amendment](amendments.html) that your server does not understand. As a precaution against misinterpreting the revised rules of the network in a way that causes you to lose money, such servers become "amendment blocked" instead of operating normally. + +To resolve being amendment blocked, [update your server](install-rippled.html) to a newer software version that understands the amendment. + +Also, software bugs can cause a server to get [stuck not syncing](server-doesnt-sync.html). In this case, the `server_state` metric is likely to be in a warning or critical state. If you are not using the latest stable release, you should upgrade to get the latest fixes for any known issues that could cause this. + + ### Investigate Network -An unreliable or insufficient network connection can cause a server to report outages. Warning or critical values in the following metrics can indicate network problems: +An unreliable or insufficient network connection can cause a server to report outages. Warning or critical values in the following [metrics][] can indicate network problems: - `peers` - `server_state` @@ -77,28 +88,23 @@ In this case, the necessary interventions may involve changes to other systems, ### Replace Hardware -If the outage is caused by a hardware failure or by higher load than the hardware is capable of handling, it may be necessary to replace some components or even the entire server. 
+If the outage is caused by a hardware failure or by higher load than the hardware is capable of handling, you may need to replace some components or even the entire server. The amount of load on a server in the XRP Ledger depends in part on transaction volume in the network, which varies organically. Load also depends on your usage pattern. See [Capacity Planning](capacity-planning.html) for how to plan the appropriate hardware and settings for your situation. -Warning or critical values for the following metrics may indicate insufficient hardware: +Warning or critical values for the following [metrics][] may indicate insufficient hardware: - `load_factor` - `server_state` - `validated_ledger` -### Upgrade - -If the server reports `"amendment_blocked": true` in the health check, this indicates that the XRP Ledger has enabled a [protocol amendment](amendments.html) that your server does not understand. As a precaution against misinterpreting the revised rules of the network in a way that causes you to lose money, such servers become "amendment blocked" instead of operating normally. - -The proper way to resolve being amendment blocked is to [update your server](install-rippled.html) to a newer software version that understands the amendment. 
- +[metrics]: health-check.html#response-format {% include '_snippets/rippled-api-links.md' %} {% include '_snippets/tx-type-links.md' %} {% include '_snippets/rippled_versions.md' %} From 08782a3da5085e77588b056f0b1f4997a8f76c36 Mon Sep 17 00:00:00 2001 From: mDuo13 Date: Thu, 9 Jul 2020 23:40:22 -0700 Subject: [PATCH 5/5] Health check: fix broken link from untranslated page --- dactyl-config.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dactyl-config.yml b/dactyl-config.yml index 74424430a32..72976a79184 100644 --- a/dactyl-config.yml +++ b/dactyl-config.yml @@ -129,8 +129,10 @@ targets: "transaction-metadata.html#affectednodes": "transaction-metadata.html" # Fix link from untranslated peer-crawler.html: "peer-protocol.html#private-peers": "peer-protocol.html#プライベートピア" - # Fix link from untranslated health-check.html: + # Fix links from untranslated health-check.html: "rippled-server-modes.html#reasons-to-run-a-rippled-server-in-stand-alone-mode": "rippled-server-modes.html#rippledサーバーをスタンドアロンモードで実行する理由" + "server-doesnt-sync.html#normal-syncing-behavior": "server-doesnt-sync.html#通常の同期動作" + - name: xrp-api-only