Skip to content

Commit

Permalink
health check: structured active healthcheck logging (#3176)
Browse files Browse the repository at this point in the history
Signed-off-by: James Sedgwick <jsedgwick@lyft.com>
  • Loading branch information
James Sedgwick authored and mattklein123 committed Jul 5, 2018
1 parent 3b05bff commit a5da078
Show file tree
Hide file tree
Showing 36 changed files with 532 additions and 163 deletions.
1 change: 1 addition & 0 deletions api/docs/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ proto_library(
"//envoy/config/trace/v2:trace",
"//envoy/config/transport_socket/capture/v2alpha:capture",
"//envoy/data/accesslog/v2:accesslog",
"//envoy/data/core/v2alpha:health_check_event",
"//envoy/data/tap/v2alpha:capture",
"//envoy/service/accesslog/v2:als",
"//envoy/service/auth/v2alpha:attribute_context",
Expand Down
4 changes: 4 additions & 0 deletions api/envoy/api/v2/core/health_check.proto
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,10 @@ message HealthCheck {
//
// The default value for "healthy edge interval" is the same as the default interval.
google.protobuf.Duration healthy_edge_interval = 16;

// Specifies the path to the :ref:`health check event log <arch_overview_health_check_logging>`.
// If empty, no event log will be written.
string event_log_path = 17;
}

// Endpoint health status.
Expand Down
15 changes: 15 additions & 0 deletions api/envoy/data/core/v2alpha/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
load("//bazel:api_build_system.bzl", "api_proto_library")

licenses(["notice"]) # Apache 2

api_proto_library(
name = "health_check_event",
srcs = ["health_check_event.proto"],
visibility = [
"//visibility:public",
],
deps = [
"//envoy/api/v2/core:address",
"//envoy/api/v2/core:base",
],
)
58 changes: 58 additions & 0 deletions api/envoy/data/core/v2alpha/health_check_event.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
syntax = "proto3";

package envoy.data.core.v2alpha;

import "envoy/api/v2/core/address.proto";
import "envoy/api/v2/core/base.proto";

import "google/protobuf/duration.proto";
import "google/protobuf/wrappers.proto";

import "validate/validate.proto";
import "gogoproto/gogo.proto";

option (gogoproto.equal_all) = true;

// [#protodoc-title: Health check logging events]
// :ref:`Health check logging <arch_overview_health_check_logging>`.

message HealthCheckEvent {
HealthCheckerType health_checker_type = 1 [(validate.rules).enum.defined_only = true];
envoy.api.v2.core.Address host = 2;
string cluster_name = 3 [(validate.rules).string.min_bytes = 1];

oneof event {
option (validate.required) = true;

// Host ejection.
HealthCheckEjectUnhealthy eject_unhealthy_event = 4;

// Host addition.
HealthCheckAddHealthy add_healthy_event = 5;
}
}

enum HealthCheckFailureType {
ACTIVE = 0;
PASSIVE = 1;
NETWORK = 2;
}

enum HealthCheckerType {
HTTP = 0;
TCP = 1;
GRPC = 2;
REDIS = 3;
}

message HealthCheckEjectUnhealthy {
// The type of failure that caused this ejection.
HealthCheckFailureType failure_type = 1 [(validate.rules).enum.defined_only = true];
}

message HealthCheckAddHealthy {
// Whether this addition is the result of the first ever health check on a host, in which case
// the configured :ref:`healthy threshold <envoy_api_field_core.HealthCheck.healthy_threshold>`
// is bypassed and the host is immediately added.
bool first_check = 1;
}
1 change: 1 addition & 0 deletions docs/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ PROTO_RST="
/envoy/config/rbac/v2alpha/rbac/envoy/config/rbac/v2alpha/rbac.proto.rst
/envoy/config/transport_socket/capture/v2alpha/capture/envoy/config/transport_socket/capture/v2alpha/capture.proto.rst
/envoy/data/accesslog/v2/accesslog/envoy/data/accesslog/v2/accesslog.proto.rst
/envoy/data/core/v2alpha/health_check_event/envoy/data/core/v2alpha/health_check_event.proto.rst
/envoy/data/tap/v2alpha/capture/envoy/data/tap/v2alpha/capture.proto.rst
/envoy/service/accesslog/v2/als/envoy/service/accesslog/v2/als.proto.rst
/envoy/service/auth/v2alpha/external_auth/envoy/service/auth/v2alpha/attribute_context.proto.rst
Expand Down
8 changes: 8 additions & 0 deletions docs/root/api-v2/data/core/core.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Core data
=========

.. toctree::
:glob:
:maxdepth: 2

v2alpha/health_check_event.proto
1 change: 1 addition & 0 deletions docs/root/api-v2/data/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ Envoy data
:maxdepth: 2

accesslog/accesslog
core/core
tap/tap
9 changes: 9 additions & 0 deletions docs/root/intro/arch_overview/health_checking.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@ unhealthy, successes required before marking a host healthy, etc.):
maintenance by setting the specified key to any value and waiting for traffic to drain. See
:ref:`redis_key <config_cluster_manager_cluster_hc_redis_key>`.

.. _arch_overview_health_check_logging:

Health check event logging
--------------------------

A per-healthchecker log of ejection and addition events can optionally be produced by Envoy by
specifying a log file path in `the HealthCheck config <envoy_api_field_core.HealthCheck.event_log_path>`.
The log is structured as JSON dumps of `HealthCheckEvent messages <envoy_api_msg_core.HealthCheckEvent>`.

Passive health checking
-----------------------

Expand Down
2 changes: 2 additions & 0 deletions docs/root/intro/version_history.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ Version history
to filter based on the presence of Envoy response flags.
* admin: added :http:get:`/hystrix_event_stream` as an endpoint for monitoring envoy's statistics
through `Hystrix dashboard <https://github.com/Netflix-Skunkworks/hystrix-dashboard/wiki>`_.
* health check: added support for :ref:`custom health check <envoy_api_field_core.HealthCheck.custom_health_check>`.
* health_check: added support for :ref:`health check event logging <arch_overview_health_check_logging>`.
* http: response filters not applied to early error paths such as http_parser generated 400s.
* lua: added :ref:`requestInfo() <config_http_filters_lua_request_info_wrapper>` wrapper and *protocol()* API.
* ratelimit: added support for :repo:`api/envoy/service/ratelimit/v2/rls.proto`.
Expand Down
8 changes: 7 additions & 1 deletion include/envoy/server/health_checker_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@ class HealthCheckerFactoryContext {
* for all singleton processing.
*/
virtual Event::Dispatcher& dispatcher() PURE;

/*
* @return Upstream::HealthCheckEventLoggerPtr the health check event logger for the
* created health checkers. This function may not be idempotent.
*/
virtual Upstream::HealthCheckEventLoggerPtr eventLogger() PURE;
};

/**
Expand Down Expand Up @@ -63,4 +69,4 @@ class CustomHealthCheckerFactory {

} // namespace Configuration
} // namespace Server
} // namespace Envoy
} // namespace Envoy
6 changes: 5 additions & 1 deletion include/envoy/upstream/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ envoy_cc_library(
name = "cluster_manager_interface",
hdrs = ["cluster_manager.h"],
deps = [
":health_checker_interface",
":load_balancer_interface",
":thread_local_cluster_interface",
":upstream_interface",
Expand All @@ -32,7 +33,10 @@ envoy_cc_library(
envoy_cc_library(
name = "health_checker_interface",
hdrs = ["health_checker.h"],
deps = [":upstream_interface"],
deps = [
":upstream_interface",
"@envoy_api//envoy/data/core/v2alpha:health_check_event_cc",
],
)

envoy_cc_library(
Expand Down
2 changes: 2 additions & 0 deletions include/envoy/upstream/cluster_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "envoy/runtime/runtime.h"
#include "envoy/secret/secret_manager.h"
#include "envoy/server/admin.h"
#include "envoy/upstream/health_checker.h"
#include "envoy/upstream/load_balancer.h"
#include "envoy/upstream/thread_local_cluster.h"
#include "envoy/upstream/upstream.h"
Expand Down Expand Up @@ -254,6 +255,7 @@ class ClusterManagerFactory {
virtual ClusterSharedPtr clusterFromProto(const envoy::api::v2::Cluster& cluster,
ClusterManager& cm,
Outlier::EventLoggerSharedPtr outlier_event_logger,
AccessLog::AccessLogManager& log_manager,
bool added_via_api) PURE;

/**
Expand Down
32 changes: 32 additions & 0 deletions include/envoy/upstream/health_checker.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <functional>
#include <memory>

#include "envoy/data/core/v2alpha/health_check_event.pb.h"
#include "envoy/upstream/upstream.h"

namespace Envoy {
Expand Down Expand Up @@ -59,5 +60,36 @@ typedef std::shared_ptr<HealthChecker> HealthCheckerSharedPtr;
std::ostream& operator<<(std::ostream& out, HealthState state);
std::ostream& operator<<(std::ostream& out, HealthTransition changed_state);

/**
* Sink for health check event logs.
*/
class HealthCheckEventLogger {
public:
virtual ~HealthCheckEventLogger() {}

/**
* Log an unhealthy host ejection event.
* @param health_checker_type supplies the type of health checker that generated the event.
* @param host supplies the host that generated the event.
* @param failure_type supplies the type of health check failure
*/
virtual void
logEjectUnhealthy(envoy::data::core::v2alpha::HealthCheckerType health_checker_type,
const HostDescriptionConstSharedPtr& host,
envoy::data::core::v2alpha::HealthCheckFailureType failure_type) PURE;

/**
* Log a healthy host addition event.
* @param health_checker_type supplies the type of health checker that generated the event.
* @param host supplies the host that generated the event.
* @param healthy_threshold supplied the configured healthy threshold for this health check
* @param first_check whether this is a fast path success on the first health check for this host
*/
virtual void logAddHealthy(envoy::data::core::v2alpha::HealthCheckerType health_checker_type,
const HostDescriptionConstSharedPtr& host, bool first_check) PURE;
};

typedef std::unique_ptr<HealthCheckEventLogger> HealthCheckEventLoggerPtr;

} // namespace Upstream
} // namespace Envoy
8 changes: 7 additions & 1 deletion source/common/protobuf/utility.cc
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ void MessageUtil::loadFromFile(const std::string& path, Protobuf::Message& messa
}

std::string MessageUtil::getJsonStringFromMessage(const Protobuf::Message& message,
const bool pretty_print) {
const bool pretty_print,
const bool always_print_primitive_fields) {
Protobuf::util::JsonPrintOptions json_options;
// By default, proto field names are converted to camelCase when the message is converted to JSON.
// Setting this option makes debugging easier because it keeps field names consistent in JSON
Expand All @@ -98,6 +99,11 @@ std::string MessageUtil::getJsonStringFromMessage(const Protobuf::Message& messa
if (pretty_print) {
json_options.add_whitespace = true;
}
// Primitive types such as int32s and enums will not be serialized if they have the default value.
// This flag disables that behavior.
if (always_print_primitive_fields) {
json_options.always_print_primitive_fields = true;
}
ProtobufTypes::String json;
const auto status = Protobuf::util::MessageToJsonString(message, &json, json_options);
// This should always succeed unless something crash-worthy such as out-of-memory.
Expand Down
5 changes: 4 additions & 1 deletion source/common/protobuf/utility.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,10 +226,13 @@ class MessageUtil {
* Extract JSON as string from a google.protobuf.Message.
* @param message message of type type.googleapis.com/google.protobuf.Message.
* @param pretty_print whether the returned JSON should be formatted.
* @param always_print_primitive_fields whether to include primitive fields set to their default
* values, e.g. an int32 set to 0 or a bool set to false.
* @return std::string of formatted JSON object.
*/
static std::string getJsonStringFromMessage(const Protobuf::Message& message,
bool pretty_print = false);
bool pretty_print = false,
bool always_print_primitive_fields = false);

/**
* Extract JSON object from a google.protobuf.Message.
Expand Down
1 change: 1 addition & 0 deletions source/common/upstream/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ envoy_cc_library(
"//include/envoy/upstream:health_checker_interface",
"//source/common/router:router_lib",
"@envoy_api//envoy/api/v2/core:health_check_cc",
"@envoy_api//envoy/data/core/v2alpha:health_check_event_cc",
],
)

Expand Down
14 changes: 8 additions & 6 deletions source/common/upstream/cluster_manager_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,9 @@ ClusterManagerImpl::ClusterManagerImpl(const envoy::config::bootstrap::v2::Boots
Server::Admin& admin, SystemTimeSource& system_time_source,
MonotonicTimeSource& monotonic_time_source)
: factory_(factory), runtime_(runtime), stats_(stats), tls_(tls.allocateSlot()),
random_(random), bind_config_(bootstrap.cluster_manager().upstream_bind_config()),
local_info_(local_info), cm_stats_(generateStats(stats)),
random_(random), log_manager_(log_manager),
bind_config_(bootstrap.cluster_manager().upstream_bind_config()), local_info_(local_info),
cm_stats_(generateStats(stats)),
init_helper_([this](Cluster& cluster) { onClusterInit(cluster); }),
config_tracker_entry_(
admin.getConfigTracker().add("clusters", [this] { return dumpClusterConfigs(); })),
Expand Down Expand Up @@ -474,7 +475,7 @@ void ClusterManagerImpl::loadCluster(const envoy::api::v2::Cluster& cluster,
const std::string& version_info, bool added_via_api,
ClusterMap& cluster_map) {
ClusterSharedPtr new_cluster =
factory_.clusterFromProto(cluster, *this, outlier_event_logger_, added_via_api);
factory_.clusterFromProto(cluster, *this, outlier_event_logger_, log_manager_, added_via_api);

if (!added_via_api) {
if (cluster_map.find(new_cluster->info()->name()) != cluster_map.end()) {
Expand Down Expand Up @@ -976,10 +977,11 @@ Http::ConnectionPool::InstancePtr ProdClusterManagerFactory::allocateConnPool(

ClusterSharedPtr ProdClusterManagerFactory::clusterFromProto(
const envoy::api::v2::Cluster& cluster, ClusterManager& cm,
Outlier::EventLoggerSharedPtr outlier_event_logger, bool added_via_api) {
Outlier::EventLoggerSharedPtr outlier_event_logger, AccessLog::AccessLogManager& log_manager,
bool added_via_api) {
return ClusterImplBase::create(cluster, cm, stats_, tls_, dns_resolver_, ssl_context_manager_,
runtime_, random_, main_thread_dispatcher_, local_info_,
outlier_event_logger, added_via_api);
runtime_, random_, main_thread_dispatcher_, log_manager,
local_info_, outlier_event_logger, added_via_api);
}

CdsApiPtr ProdClusterManagerFactory::createCds(
Expand Down
2 changes: 2 additions & 0 deletions source/common/upstream/cluster_manager_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ class ProdClusterManagerFactory : public ClusterManagerFactory {
const Network::ConnectionSocket::OptionsSharedPtr& options) override;
ClusterSharedPtr clusterFromProto(const envoy::api::v2::Cluster& cluster, ClusterManager& cm,
Outlier::EventLoggerSharedPtr outlier_event_logger,
AccessLog::AccessLogManager& log_manager,
bool added_via_api) override;
CdsApiPtr createCds(const envoy::api::v2::core::ConfigSource& cds_config,
const absl::optional<envoy::api::v2::core::ConfigSource>& eds_config,
Expand Down Expand Up @@ -357,6 +358,7 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable<Logger::Id::u
Stats::Store& stats_;
ThreadLocal::SlotPtr tls_;
Runtime::RandomGenerator& random_;
AccessLog::AccessLogManager& log_manager_;
ClusterMap active_clusters_;
ClusterMap warming_clusters_;
absl::optional<envoy::api::v2::core::ConfigSource> eds_config_;
Expand Down
Loading

0 comments on commit a5da078

Please sign in to comment.