diff --git a/api/envoy/config/bootstrap/v3/bootstrap.proto b/api/envoy/config/bootstrap/v3/bootstrap.proto index 2d096a39c73b..56166456f23f 100644 --- a/api/envoy/config/bootstrap/v3/bootstrap.proto +++ b/api/envoy/config/bootstrap/v3/bootstrap.proto @@ -305,10 +305,35 @@ message ClusterManager { // Envoy process watchdog configuration. When configured, this monitors for // nonresponsive threads and kills the process after the configured thresholds. // See the :ref:`watchdog documentation ` for more information. -// [#next-free-field: 7] +// [#next-free-field: 8] message Watchdog { option (udpa.annotations.versioning).previous_message_type = "envoy.config.bootstrap.v2.Watchdog"; + message WatchdogAction { + // The events are fired in this order: KILL, MULTIKILL, MEGAMISS, MISS. + // Within an event type, actions execute in the order they are configured. + // For KILL/MULTIKILL there is a default PANIC that will run after the + // registered actions and kills the process if it wasn't already killed. + // It might be useful to specify several debug actions, and possibly an + // alternate FATAL action. + enum WatchdogEvent { + UNKNOWN = 0; + KILL = 1; + MULTIKILL = 2; + MEGAMISS = 3; + MISS = 4; + } + + // Extension specific configuration for the action. + core.v3.TypedExtensionConfig config = 1; + + WatchdogEvent event = 2 [(validate.rules).enum = {defined_only: true}]; + } + + // Register actions that will fire on given WatchDog events. + // See *WatchDogAction* for priority of events. + repeated WatchdogAction actions = 7; + // The duration after which Envoy counts a nonresponsive thread in the // *watchdog_miss* statistic. If not specified the default is 200ms. 
google.protobuf.Duration miss_timeout = 1; diff --git a/api/envoy/config/bootstrap/v4alpha/bootstrap.proto b/api/envoy/config/bootstrap/v4alpha/bootstrap.proto index ba6107aa8dfe..24faad401e7d 100644 --- a/api/envoy/config/bootstrap/v4alpha/bootstrap.proto +++ b/api/envoy/config/bootstrap/v4alpha/bootstrap.proto @@ -296,10 +296,38 @@ message ClusterManager { // Envoy process watchdog configuration. When configured, this monitors for // nonresponsive threads and kills the process after the configured thresholds. // See the :ref:`watchdog documentation ` for more information. -// [#next-free-field: 7] +// [#next-free-field: 8] message Watchdog { option (udpa.annotations.versioning).previous_message_type = "envoy.config.bootstrap.v3.Watchdog"; + message WatchdogAction { + option (udpa.annotations.versioning).previous_message_type = + "envoy.config.bootstrap.v3.Watchdog.WatchdogAction"; + + // The events are fired in this order: KILL, MULTIKILL, MEGAMISS, MISS. + // Within an event type, actions execute in the order they are configured. + // For KILL/MULTIKILL there is a default PANIC that will run after the + // registered actions and kills the process if it wasn't already killed. + // It might be useful to specify several debug actions, and possibly an + // alternate FATAL action. + enum WatchdogEvent { + UNKNOWN = 0; + KILL = 1; + MULTIKILL = 2; + MEGAMISS = 3; + MISS = 4; + } + + // Extension specific configuration for the action. + core.v4alpha.TypedExtensionConfig config = 1; + + WatchdogEvent event = 2 [(validate.rules).enum = {defined_only: true}]; + } + + // Register actions that will fire on given WatchDog events. + // See *WatchDogAction* for priority of events. + repeated WatchdogAction actions = 7; + // The duration after which Envoy counts a nonresponsive thread in the // *watchdog_miss* statistic. If not specified the default is 200ms. 
google.protobuf.Duration miss_timeout = 1; diff --git a/docs/root/extending/extending.rst b/docs/root/extending/extending.rst index 316b43547835..ddcd3d874718 100644 --- a/docs/root/extending/extending.rst +++ b/docs/root/extending/extending.rst @@ -19,8 +19,9 @@ types including: * :ref:`Stat sinks ` * :ref:`Tracers ` * :ref:`Request ID ` -* Transport sockets +* :ref:`Transport sockets ` * BoringSSL private key methods +* :ref:`Watchdog action ` * :ref:`Internal redirect policy ` As of this writing there is no high level extension developer documentation. The diff --git a/docs/root/operations/performance.rst b/docs/root/operations/performance.rst index 01acce4acc1f..8846275290da 100644 --- a/docs/root/operations/performance.rst +++ b/docs/root/operations/performance.rst @@ -56,10 +56,13 @@ Watchdog -------- In addition to event loop statistics, Envoy also include a configurable -:ref:`watchdog ` system that can increment -statistics when Envoy is not responsive and optionally kill the server. The statistics are useful -for understanding at a high level whether Envoy's event loop is not responsive either because it is -doing too much work, blocking, or not being scheduled by the OS. +:ref:`watchdog ` +system that can increment statistics when Envoy is not responsive and +optionally kill the server. The system also has an extension point allowing for +custom actions to be taken based on watchdog events. The statistics are +useful for understanding at a high level whether Envoy's event loop is not +responsive either because it is doing too much work, blocking, or not being +scheduled by the OS. The watchdog emits statistics in both the *server.* and *server..* trees. ** is equal to *main_thread*, *worker_0*, *worker_1*, etc. 
diff --git a/docs/root/version_history/current.rst b/docs/root/version_history/current.rst index 9ca7c87a0c55..07fec599ce26 100644 --- a/docs/root/version_history/current.rst +++ b/docs/root/version_history/current.rst @@ -75,6 +75,7 @@ New Features * tap: added :ref:`generic body matcher` to scan http requests and responses for text or hex patterns. * tcp: switched the TCP connection pool to the new "shared" connection pool, sharing a common code base with HTTP and HTTP/2. Any unexpected behavioral changes can be temporarily reverted by setting `envoy.reloadable_features.new_tcp_connection_pool` to false. * watchdog: support randomizing the watchdog's kill timeout to prevent synchronized kills via a maximium jitter parameter :ref:`max_kill_timeout_jitter`. +* watchdog: added support for an extension point where actions can be registered to fire on watchdog events such as miss, megamiss, kill and multikill. See :ref:`watchdog actions`. * xds: added :ref:`extension config discovery` support for HTTP filters. Deprecated diff --git a/generated_api_shadow/envoy/config/bootstrap/v3/bootstrap.proto b/generated_api_shadow/envoy/config/bootstrap/v3/bootstrap.proto index d3cf6d6947cf..eadc77a8828a 100644 --- a/generated_api_shadow/envoy/config/bootstrap/v3/bootstrap.proto +++ b/generated_api_shadow/envoy/config/bootstrap/v3/bootstrap.proto @@ -306,10 +306,35 @@ message ClusterManager { // Envoy process watchdog configuration. When configured, this monitors for // nonresponsive threads and kills the process after the configured thresholds. // See the :ref:`watchdog documentation ` for more information. -// [#next-free-field: 7] +// [#next-free-field: 8] message Watchdog { option (udpa.annotations.versioning).previous_message_type = "envoy.config.bootstrap.v2.Watchdog"; + message WatchdogAction { + // The events are fired in this order: KILL, MULTIKILL, MEGAMISS, MISS. + // Within an event type, actions execute in the order they are configured. 
+ // For KILL/MULTIKILL there is a default PANIC that will run after the + // registered actions and kills the process if it wasn't already killed. + // It might be useful to specify several debug actions, and possibly an + // alternate FATAL action. + enum WatchdogEvent { + UNKNOWN = 0; + KILL = 1; + MULTIKILL = 2; + MEGAMISS = 3; + MISS = 4; + } + + // Extension specific configuration for the action. + core.v3.TypedExtensionConfig config = 1; + + WatchdogEvent event = 2 [(validate.rules).enum = {defined_only: true}]; + } + + // Register actions that will fire on given WatchDog events. + // See *WatchDogAction* for priority of events. + repeated WatchdogAction actions = 7; + // The duration after which Envoy counts a nonresponsive thread in the // *watchdog_miss* statistic. If not specified the default is 200ms. google.protobuf.Duration miss_timeout = 1; diff --git a/generated_api_shadow/envoy/config/bootstrap/v4alpha/bootstrap.proto b/generated_api_shadow/envoy/config/bootstrap/v4alpha/bootstrap.proto index 89dd0d7f7d0d..e798d6195f3f 100644 --- a/generated_api_shadow/envoy/config/bootstrap/v4alpha/bootstrap.proto +++ b/generated_api_shadow/envoy/config/bootstrap/v4alpha/bootstrap.proto @@ -304,10 +304,38 @@ message ClusterManager { // Envoy process watchdog configuration. When configured, this monitors for // nonresponsive threads and kills the process after the configured thresholds. // See the :ref:`watchdog documentation ` for more information. -// [#next-free-field: 7] +// [#next-free-field: 8] message Watchdog { option (udpa.annotations.versioning).previous_message_type = "envoy.config.bootstrap.v3.Watchdog"; + message WatchdogAction { + option (udpa.annotations.versioning).previous_message_type = + "envoy.config.bootstrap.v3.Watchdog.WatchdogAction"; + + // The events are fired in this order: KILL, MULTIKILL, MEGAMISS, MISS. + // Within an event type, actions execute in the order they are configured. 
+ // For KILL/MULTIKILL there is a default PANIC that will run after the + // registered actions and kills the process if it wasn't already killed. + // It might be useful to specify several debug actions, and possibly an + // alternate FATAL action. + enum WatchdogEvent { + UNKNOWN = 0; + KILL = 1; + MULTIKILL = 2; + MEGAMISS = 3; + MISS = 4; + } + + // Extension specific configuration for the action. + core.v4alpha.TypedExtensionConfig config = 1; + + WatchdogEvent event = 2 [(validate.rules).enum = {defined_only: true}]; + } + + // Register actions that will fire on given WatchDog events. + // See *WatchDogAction* for priority of events. + repeated WatchdogAction actions = 7; + // The duration after which Envoy counts a nonresponsive thread in the // *watchdog_miss* statistic. If not specified the default is 200ms. google.protobuf.Duration miss_timeout = 1; diff --git a/include/envoy/server/BUILD b/include/envoy/server/BUILD index 534270f24e74..34f42371d6ba 100644 --- a/include/envoy/server/BUILD +++ b/include/envoy/server/BUILD @@ -74,6 +74,17 @@ envoy_cc_library( ], ) +envoy_cc_library( + name = "guarddog_config_interface", + hdrs = ["guarddog_config.h"], + deps = [ + ":guarddog_interface", + "//include/envoy/api:api_interface", + "//include/envoy/protobuf:message_validator_interface", + "@envoy_api//envoy/config/bootstrap/v3:pkg_cc_proto", + ], +) + envoy_cc_library( name = "health_checker_config_interface", hdrs = ["health_checker_config.h"], diff --git a/include/envoy/server/configuration.h b/include/envoy/server/configuration.h index aee4ecf01c04..5fd0a37374f6 100644 --- a/include/envoy/server/configuration.h +++ b/include/envoy/server/configuration.h @@ -73,6 +73,13 @@ class Main { * for at least MultiKillTimeout before we kill the process. */ virtual double wdMultiKillThreshold() const PURE; + + /** + * @return Protobuf::RepeatedPtrField + * the WatchDog Actions that trigger on WatchDog Events. 
+ */ + virtual Protobuf::RepeatedPtrField + wdActions() const PURE; }; /** diff --git a/include/envoy/server/guarddog_config.h b/include/envoy/server/guarddog_config.h new file mode 100644 index 000000000000..3f775d737379 --- /dev/null +++ b/include/envoy/server/guarddog_config.h @@ -0,0 +1,65 @@ +#pragma once + +#include + +#include "envoy/api/api.h" +#include "envoy/common/pure.h" +#include "envoy/config/bootstrap/v3/bootstrap.pb.h" +#include "envoy/config/typed_config.h" +#include "envoy/event/dispatcher.h" +#include "envoy/protobuf/message_validator.h" +#include "envoy/server/guarddog.h" + +#include "common/protobuf/protobuf.h" + +namespace Envoy { +namespace Server { +namespace Configuration { + +struct GuardDogActionFactoryContext { + Api::Api& api_; + Event::Dispatcher& dispatcher_; // not owned (this is the guard dog's dispatcher) +}; + +class GuardDogAction { +public: + virtual ~GuardDogAction() = default; + /** + * Callback function for when the GuardDog observes an event. + * @param event the event the GuardDog observes. + * @param thread_ltt_pairs pairs of the relevant thread to the event, and the + * last time touched (LTT) of those threads with their watchdog. + * @param now the current time. + */ + virtual void run(envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent event, + std::vector> thread_ltt_pairs, + MonotonicTime now) PURE; +}; + +using GuardDogActionPtr = std::unique_ptr; + +/** + * Implemented by each custom GuardDogAction and registered via Registry::registerFactory() + * or the convenience class RegisterFactory. + */ +class GuardDogActionFactory : public Config::TypedFactory { +public: + ~GuardDogActionFactory() override = default; + + /** + * Creates a particular GuardDogAction implementation. + * + * @param config supplies the configuration for the action. + * @param context supplies the GuardDog Action's context. + * @return GuardDogActionPtr the GuardDogAction object. 
+ */ + virtual GuardDogActionPtr createGuardDogActionFromProto( + const envoy::config::bootstrap::v3::Watchdog::WatchdogAction& config, + GuardDogActionFactoryContext& context) PURE; + + std::string category() const override { return "envoy.guarddog_actions"; } +}; + +} // namespace Configuration +} // namespace Server +} // namespace Envoy diff --git a/source/server/BUILD b/source/server/BUILD index 7bfcd7699576..c9351c86921e 100644 --- a/source/server/BUILD +++ b/source/server/BUILD @@ -114,6 +114,7 @@ envoy_cc_library( "//include/envoy/common:time_interface", "//include/envoy/event:timer_interface", "//include/envoy/server:configuration_interface", + "//include/envoy/server:guarddog_config_interface", "//include/envoy/server:guarddog_interface", "//include/envoy/server:watchdog_interface", "//include/envoy/stats:stats_interface", @@ -121,8 +122,11 @@ envoy_cc_library( "//source/common/common:assert_lib", "//source/common/common:minimal_logger_lib", "//source/common/common:thread_lib", + "//source/common/config:utility_lib", "//source/common/event:libevent_lib", + "//source/common/protobuf:utility_lib", "//source/common/stats:symbol_table_lib", + "@envoy_api//envoy/config/bootstrap/v3:pkg_cc_proto", ], ) diff --git a/source/server/configuration_impl.cc b/source/server/configuration_impl.cc index 7510f068f7ee..44f2a968221a 100644 --- a/source/server/configuration_impl.cc +++ b/source/server/configuration_impl.cc @@ -108,6 +108,7 @@ void MainImpl::initialize(const envoy::config::bootstrap::v3::Bootstrap& bootstr std::chrono::milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(watchdog, multikill_timeout, 0)); watchdog_multikill_threshold_ = PROTOBUF_PERCENT_TO_DOUBLE_OR_DEFAULT(watchdog, multikill_threshold, 0.0); + watchdog_actions_ = bootstrap.watchdog().actions(); initializeStatsSinks(bootstrap, server); } diff --git a/source/server/configuration_impl.h b/source/server/configuration_impl.h index d1c88000c1d1..2de17cff4d47 100644 --- a/source/server/configuration_impl.h +++ 
b/source/server/configuration_impl.h @@ -111,6 +111,10 @@ class MainImpl : Logger::Loggable, public Main { } double wdMultiKillThreshold() const override { return watchdog_multikill_threshold_; } + Protobuf::RepeatedPtrField + wdActions() const override { + return watchdog_actions_; + } private: /** @@ -129,6 +133,8 @@ class MainImpl : Logger::Loggable, public Main { std::chrono::milliseconds watchdog_kill_timeout_; std::chrono::milliseconds watchdog_multikill_timeout_; double watchdog_multikill_threshold_; + Protobuf::RepeatedPtrField + watchdog_actions_; }; /** diff --git a/source/server/guarddog_impl.cc b/source/server/guarddog_impl.cc index add9ca270d51..cce74e493d9e 100644 --- a/source/server/guarddog_impl.cc +++ b/source/server/guarddog_impl.cc @@ -4,12 +4,21 @@ #include #include +#include +#include +#include "envoy/common/time.h" +#include "envoy/config/bootstrap/v3/bootstrap.pb.h" +#include "envoy/server/guarddog.h" +#include "envoy/server/guarddog_config.h" #include "envoy/stats/scope.h" #include "common/common/assert.h" #include "common/common/fmt.h" #include "common/common/lock_guard.h" +#include "common/common/logger.h" +#include "common/config/utility.h" +#include "common/protobuf/utility.h" #include "common/stats/symbol_table_impl.h" #include "server/watchdog_impl.h" @@ -42,7 +51,25 @@ GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuratio Stats::StatNameManagedStorage("server.watchdog_mega_miss", stats_scope.symbolTable()) .statName())), dispatcher_(api.allocateDispatcher("guarddog_thread")), - loop_timer_(dispatcher_->createTimer([this]() { step(); })), run_thread_(true) { + loop_timer_(dispatcher_->createTimer([this]() { step(); })), + events_to_actions_([&](const Server::Configuration::Main& config) -> EventToActionsMap { + EventToActionsMap map; + + // We should be able to share the dispatcher since guard dog's lifetime + // should eclipse those of actions. 
+ Configuration::GuardDogActionFactoryContext context = {api, *dispatcher_}; + + const auto& actions = config.wdActions(); + for (const auto& action : actions) { + // Get factory and add the created cb + auto& factory = Config::Utility::getAndCheckFactory( + action.config()); + map[action.event()].push_back(factory.createGuardDogActionFromProto(action, context)); + } + + return map; + }(config)), + run_thread_(true) { start(api); } @@ -61,9 +88,11 @@ void GuardDogImpl::step() { } const auto now = time_source_.monotonicTime(); + std::vector> miss_threads; + std::vector> mega_miss_threads; { - size_t multi_kill_count = 0; + std::vector> multi_kill_threads; Thread::LockGuard guard(wd_lock_); // Compute the multikill threshold @@ -73,6 +102,7 @@ void GuardDogImpl::step() { for (auto& watched_dog : watched_dogs_) { const auto ltt = watched_dog->dog_->lastTouchTime(); + const auto tid = watched_dog->dog_->threadId(); const auto delta = now - ltt; if (watched_dog->last_alert_time_ && watched_dog->last_alert_time_.value() < ltt) { watched_dog->miss_alerted_ = false; @@ -84,6 +114,7 @@ void GuardDogImpl::step() { watched_dog->miss_counter_.inc(); watched_dog->last_alert_time_ = ltt; watched_dog->miss_alerted_ = true; + miss_threads.emplace_back(tid, ltt); } } if (delta > megamiss_timeout_) { @@ -92,22 +123,38 @@ void GuardDogImpl::step() { watched_dog->megamiss_counter_.inc(); watched_dog->last_alert_time_ = ltt; watched_dog->megamiss_alerted_ = true; + mega_miss_threads.emplace_back(tid, ltt); } } if (killEnabled() && delta > kill_timeout_) { + invokeGuardDogActions(WatchDogAction::KILL, {{tid, ltt}}, now); + PANIC(fmt::format("GuardDog: one thread ({}) stuck for more than watchdog_kill_timeout", watched_dog->dog_->threadId().debugString())); } if (multikillEnabled() && delta > multi_kill_timeout_) { - if (++multi_kill_count >= required_for_multi_kill) { + multi_kill_threads.emplace_back(tid, ltt); + + if (multi_kill_threads.size() >= required_for_multi_kill) { + 
invokeGuardDogActions(WatchDogAction::MULTIKILL, multi_kill_threads, now); + PANIC(fmt::format("GuardDog: At least {} threads ({},...) stuck for more than " "watchdog_multikill_timeout", - multi_kill_count, watched_dog->dog_->threadId().debugString())); + multi_kill_threads.size(), tid.debugString())); } } } } + // Run megamiss and miss handlers + if (!mega_miss_threads.empty()) { + invokeGuardDogActions(WatchDogAction::MEGAMISS, mega_miss_threads, now); + } + + if (!miss_threads.empty()) { + invokeGuardDogActions(WatchDogAction::MISS, miss_threads, now); + } + { Thread::LockGuard guard(mutex_); test_interlock_hook_->signalFromImpl(now); @@ -167,6 +214,17 @@ void GuardDogImpl::stop() { } } +void GuardDogImpl::invokeGuardDogActions( + WatchDogAction::WatchdogEvent event, + std::vector> thread_ltt_pairs, MonotonicTime now) { + const auto& registered_actions = events_to_actions_.find(event); + if (registered_actions != events_to_actions_.end()) { + for (auto& action : registered_actions->second) { + action->run(event, thread_ltt_pairs, now); + } + } +} + GuardDogImpl::WatchedDog::WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name, const WatchDogSharedPtr& watch_dog) : dog_(watch_dog), diff --git a/source/server/guarddog_impl.h b/source/server/guarddog_impl.h index 2fba7f0edcbb..b854215bbbd9 100644 --- a/source/server/guarddog_impl.h +++ b/source/server/guarddog_impl.h @@ -4,9 +4,11 @@ #include #include "envoy/api/api.h" +#include "envoy/config/bootstrap/v3/bootstrap.pb.h" #include "envoy/event/timer.h" #include "envoy/server/configuration.h" #include "envoy/server/guarddog.h" +#include "envoy/server/guarddog_config.h" #include "envoy/server/watchdog.h" #include "envoy/stats/scope.h" #include "envoy/stats/stats.h" @@ -100,6 +102,13 @@ class GuardDogImpl : public GuardDog { bool killEnabled() const { return kill_timeout_ > std::chrono::milliseconds(0); } bool multikillEnabled() const { return multi_kill_timeout_ > std::chrono::milliseconds(0); } + 
using WatchDogAction = envoy::config::bootstrap::v3::Watchdog::WatchdogAction; + // Helper function to invoke all the GuardDogActions registered for an Event. + void + invokeGuardDogActions(WatchDogAction::WatchdogEvent event, + std::vector> thread_ltt_pairs, + MonotonicTime now); + struct WatchedDog { WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name, const WatchDogSharedPtr& watch_dog); @@ -129,6 +138,9 @@ class GuardDogImpl : public GuardDog { Thread::ThreadPtr thread_; Event::DispatcherPtr dispatcher_; Event::TimerPtr loop_timer_; + using EventToActionsMap = absl::flat_hash_map>; + EventToActionsMap events_to_actions_; Thread::MutexBasicLockable mutex_; bool run_thread_ ABSL_GUARDED_BY(mutex_); }; diff --git a/test/mocks/server/BUILD b/test/mocks/server/BUILD index 35a1ac14b80a..cee8e3a58fb5 100644 --- a/test/mocks/server/BUILD +++ b/test/mocks/server/BUILD @@ -197,6 +197,7 @@ envoy_cc_mock( deps = [ "//include/envoy/server:configuration_interface", "//include/envoy/server:overload_manager_interface", + "//test/test_common:utility_lib", ], ) diff --git a/test/mocks/server/main.cc b/test/mocks/server/main.cc index 26bde5941bed..0b6c739feaad 100644 --- a/test/mocks/server/main.cc +++ b/test/mocks/server/main.cc @@ -1,5 +1,7 @@ #include "main.h" +#include "test/test_common/utility.h" + #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -10,14 +12,25 @@ namespace Configuration { using ::testing::Return; MockMain::MockMain(int wd_miss, int wd_megamiss, int wd_kill, int wd_multikill, - double wd_multikill_threshold) + double wd_multikill_threshold, const std::vector wd_action_protos) : wd_miss_(wd_miss), wd_megamiss_(wd_megamiss), wd_kill_(wd_kill), wd_multikill_(wd_multikill), - wd_multikill_threshold_(wd_multikill_threshold) { + wd_multikill_threshold_(wd_multikill_threshold), wd_actions_([&]() { + Protobuf::RepeatedPtrField actions; + + for (const auto& action_proto_str : wd_action_protos) { + 
envoy::config::bootstrap::v3::Watchdog::WatchdogAction action; + TestUtility::loadFromJson(action_proto_str, action); + actions.Add()->CopyFrom(action); + } + + return actions; + }()) { ON_CALL(*this, wdMissTimeout()).WillByDefault(Return(wd_miss_)); ON_CALL(*this, wdMegaMissTimeout()).WillByDefault(Return(wd_megamiss_)); ON_CALL(*this, wdKillTimeout()).WillByDefault(Return(wd_kill_)); ON_CALL(*this, wdMultiKillTimeout()).WillByDefault(Return(wd_multikill_)); ON_CALL(*this, wdMultiKillThreshold()).WillByDefault(Return(wd_multikill_threshold_)); + ON_CALL(*this, wdActions).WillByDefault(Return(wd_actions_)); } MockMain::~MockMain() = default; diff --git a/test/mocks/server/main.h b/test/mocks/server/main.h index c89b637e669a..91d2ee19b38e 100644 --- a/test/mocks/server/main.h +++ b/test/mocks/server/main.h @@ -15,9 +15,9 @@ namespace Server { namespace Configuration { class MockMain : public Main { public: - MockMain() : MockMain(0, 0, 0, 0, 0.0) {} + MockMain() : MockMain(0, 0, 0, 0, 0.0, {}) {} MockMain(int wd_miss, int wd_megamiss, int wd_kill, int wd_multikill, - double wd_multikill_threshold); + double wd_multikill_threshold, const std::vector wd_action_protos); ~MockMain() override; MOCK_METHOD(Upstream::ClusterManager*, clusterManager, ()); @@ -28,12 +28,15 @@ class MockMain : public Main { MOCK_METHOD(std::chrono::milliseconds, wdKillTimeout, (), (const)); MOCK_METHOD(std::chrono::milliseconds, wdMultiKillTimeout, (), (const)); MOCK_METHOD(double, wdMultiKillThreshold, (), (const)); + MOCK_METHOD(Protobuf::RepeatedPtrField, + wdActions, (), (const)); std::chrono::milliseconds wd_miss_; std::chrono::milliseconds wd_megamiss_; std::chrono::milliseconds wd_kill_; std::chrono::milliseconds wd_multikill_; double wd_multikill_threshold_; + Protobuf::RepeatedPtrField wd_actions_; }; } // namespace Configuration } // namespace Server diff --git a/test/server/BUILD b/test/server/BUILD index 4718ce6669fb..eb11d33ce7c3 100644 --- a/test/server/BUILD +++ 
b/test/server/BUILD @@ -136,6 +136,7 @@ envoy_cc_test( "//test/mocks:common_lib", "//test/mocks/server:main_mocks", "//test/mocks/stats:stats_mocks", + "//test/test_common:registry_lib", "//test/test_common:simulated_time_system_lib", "//test/test_common:utility_lib", ], diff --git a/test/server/guarddog_impl_test.cc b/test/server/guarddog_impl_test.cc index e26856e011db..c7c090d3faf9 100644 --- a/test/server/guarddog_impl_test.cc +++ b/test/server/guarddog_impl_test.cc @@ -1,19 +1,24 @@ #include #include +#include #include +#include #include "envoy/common/time.h" +#include "envoy/server/guarddog_config.h" #include "envoy/server/watchdog.h" #include "common/api/api_impl.h" #include "common/common/macros.h" #include "common/common/utility.h" +#include "common/protobuf/utility.h" #include "server/guarddog_impl.h" #include "test/mocks/common.h" #include "test/mocks/server/main.h" #include "test/mocks/stats/mocks.h" +#include "test/test_common/registry.h" #include "test/test_common/simulated_time_system.h" #include "test/test_common/test_time.h" #include "test/test_common/utility.h" @@ -21,6 +26,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" +using testing::ElementsAre; using testing::InSequence; using testing::NiceMock; @@ -28,6 +34,15 @@ namespace Envoy { namespace Server { namespace { +// Kill has an explicit value that disables the feature. +const int DISABLE_KILL = 0; +const int DISABLE_MULTIKILL = 0; + +// Miss / Megamiss don't have an explicit value that disables them, +// so set a timeout larger than those used in tests to 'disable' them. 
+const int DISABLE_MISS = 1000000; +const int DISABLE_MEGAMISS = 1000000; + class DebugTestInterlock : public GuardDogImpl::TestInterlockHook { public: // GuardDogImpl::TestInterlockHook @@ -90,8 +105,9 @@ INSTANTIATE_TEST_SUITE_P(TimeSystemType, GuardDogTestBase, class GuardDogDeathTest : public GuardDogTestBase { protected: GuardDogDeathTest() - : config_kill_(1000, 1000, 100, 1000, 0), config_multikill_(1000, 1000, 1000, 500, 0), - config_multikill_threshold_(1000, 1000, 1000, 500, 60) {} + : config_kill_(1000, 1000, 100, 1000, 0, std::vector{}), + config_multikill_(1000, 1000, 1000, 500, 0, std::vector{}), + config_multikill_threshold_(1000, 1000, 1000, 500, 60, std::vector{}) {} /** * This does everything but the final forceCheckForTest() that should cause @@ -255,7 +271,9 @@ TEST_P(GuardDogAlmostDeadTest, NearDeathTest) { class GuardDogMissTest : public GuardDogTestBase { protected: - GuardDogMissTest() : config_miss_(500, 1000, 0, 0, 0), config_mega_(1000, 500, 0, 0, 0) {} + GuardDogMissTest() + : config_miss_(500, 1000, 0, 0, 0, std::vector{}), + config_mega_(1000, 500, 0, 0, 0, std::vector{}) {} void checkMiss(uint64_t count, const std::string& descriptor) { EXPECT_EQ(count, TestUtility::findCounter(stats_store_, "server.watchdog_miss")->value()) @@ -375,27 +393,27 @@ TEST_P(GuardDogMissTest, MissCountTest) { TEST_P(GuardDogTestBase, StartStopTest) { NiceMock stats; - NiceMock config(0, 0, 0, 0, 0); + NiceMock config(0, 0, 0, 0, 0, std::vector{}); initGuardDog(stats, config); } TEST_P(GuardDogTestBase, LoopIntervalNoKillTest) { NiceMock stats; - NiceMock config(40, 50, 0, 0, 0); + NiceMock config(40, 50, 0, 0, 0, std::vector{}); initGuardDog(stats, config); EXPECT_EQ(guard_dog_->loopIntervalForTest(), std::chrono::milliseconds(40)); } TEST_P(GuardDogTestBase, LoopIntervalTest) { NiceMock stats; - NiceMock config(100, 90, 1000, 500, 0); + NiceMock config(100, 90, 1000, 500, 0, std::vector{}); initGuardDog(stats, config); 
EXPECT_EQ(guard_dog_->loopIntervalForTest(), std::chrono::milliseconds(90)); } TEST_P(GuardDogTestBase, WatchDogThreadIdTest) { NiceMock stats; - NiceMock config(100, 90, 1000, 500, 0); + NiceMock config(100, 90, 1000, 500, 0, std::vector{}); initGuardDog(stats, config); auto watched_dog = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); @@ -415,6 +433,363 @@ TEST_P(GuardDogTestBase, AtomicIsAtomicTest) { ASSERT_EQ(atomic_time.is_lock_free(), true); } +// A GuardDogAction used for testing the GuardDog. +// Its primary use is appending a string of the format EVENT_TYPE : tid1,..,tidN to +// the events vector passed to it. +// Instances of this class will be registered for watchdog events through +// RecordGuardDogActionFactory. +class RecordGuardDogAction : public Configuration::GuardDogAction { +public: + RecordGuardDogAction(std::vector& events) : events_(events) {} + + void run(envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent event, + std::vector> thread_ltt_pairs, + MonotonicTime /*now*/) override { + std::string event_string = + envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent_Name(event); + absl::StrAppend(&event_string, " : "); + std::vector output_string_parts; + output_string_parts.reserve(thread_ltt_pairs.size()); + + for (const auto& thread_ltt_pair : thread_ltt_pairs) { + output_string_parts.push_back(thread_ltt_pair.first.debugString()); + } + + absl::StrAppend(&event_string, absl::StrJoin(output_string_parts, ",")); + events_.push_back(event_string); + } + +protected: + std::vector& events_; // not owned +}; + +// A GuardDogAction that fails a RELEASE_ASSERT whenever it is run. 
+class AssertGuardDogAction : public Configuration::GuardDogAction { +public: + AssertGuardDogAction() = default; + + void run(envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent /*event*/, + std::vector> /*thread_ltt_pairs*/, + MonotonicTime /*now*/) override { + RELEASE_ASSERT(false, "ASSERT_GUARDDOG_ACTION"); + } +}; + +// Test factory for consuming Watchdog configs and creating GuardDogActions. +template +class RecordGuardDogActionFactory : public Configuration::GuardDogActionFactory { +public: + RecordGuardDogActionFactory(const std::string& name, std::vector& events) + : name_(name), events_(events) {} + + Configuration::GuardDogActionPtr createGuardDogActionFromProto( + const envoy::config::bootstrap::v3::Watchdog::WatchdogAction& /*config*/, + Configuration::GuardDogActionFactoryContext& /*context*/) override { + // The config and context are ignored; the action is always built from events_. + return std::make_unique(events_); + } + + ProtobufTypes::MessagePtr createEmptyConfigProto() override { + return ProtobufTypes::MessagePtr{new ConfigType()}; + } + + std::string name() const override { return name_; } + + const std::string name_; + std::vector& events_; // not owned +}; + +// Test factory for consuming Watchdog configs and creating GuardDogActions. +template +class AssertGuardDogActionFactory : public Configuration::GuardDogActionFactory { +public: + AssertGuardDogActionFactory(const std::string& name) : name_(name) {} + + Configuration::GuardDogActionPtr createGuardDogActionFromProto( + const envoy::config::bootstrap::v3::Watchdog::WatchdogAction& /*config*/, + Configuration::GuardDogActionFactoryContext& /*context*/) override { + // The config and context are ignored; the same action is always returned. 
+ return std::make_unique(); + } + + ProtobufTypes::MessagePtr createEmptyConfigProto() override { + return ProtobufTypes::MessagePtr{new ConfigType()}; + } + + std::string name() const override { return name_; } + + const std::string name_; +}; + +/** + * Tests that various actions registered for the guard dog get called upon. + */ +class GuardDogActionsTest : public GuardDogTestBase { +protected: + GuardDogActionsTest() + : log_factory_("LogFactory", events_), register_log_factory_(log_factory_), + assert_factory_("AssertFactory"), register_assert_factory_(assert_factory_) {} + + std::vector getActionsConfig() { + return { + R"EOF( + { + "config": { + "name": "AssertFactory", + "typed_config": { + "@type": "type.googleapis.com/google.protobuf.Empty" + } + }, + "event": "MULTIKILL" + } + )EOF", + R"EOF( + { + "config": { + "name": "AssertFactory", + "typed_config": { + "@type": "type.googleapis.com/google.protobuf.Empty" + } + }, + "event": "KILL" + } + )EOF", + R"EOF( + { + "config": { + "name": "LogFactory", + "typed_config": { + "@type": "type.googleapis.com/google.protobuf.Empty" + } + }, + "event": "MEGAMISS" + } + )EOF", + R"EOF( + { + "config": { + "name": "LogFactory", + "typed_config": { + "@type": "type.googleapis.com/google.protobuf.Empty" + } + }, + "event": "MISS" + } + )EOF"}; + } + + void setupFirstDog(const NiceMock& config) { + initGuardDog(fake_stats_, config); + first_dog_ = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + guard_dog_->forceCheckForTest(); + } + + std::vector actions_; + std::vector events_; + RecordGuardDogActionFactory log_factory_; + Registry::InjectFactory register_log_factory_; + AssertGuardDogActionFactory assert_factory_; + Registry::InjectFactory register_assert_factory_; + NiceMock fake_stats_; + WatchDogSharedPtr first_dog_; + WatchDogSharedPtr second_dog_; +}; + +INSTANTIATE_TEST_SUITE_P(TimeSystemType, GuardDogActionsTest, + testing::ValuesIn({TimeSystemType::Real, 
TimeSystemType::Simulated})); + +TEST_P(GuardDogActionsTest, MissShouldOnlyReportRelevantThreads) { + const NiceMock config(100, DISABLE_MEGAMISS, DISABLE_KILL, + DISABLE_MULTIKILL, 0, getActionsConfig()); + setupFirstDog(config); + second_dog_ = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + time_system_->advanceTimeWait(std::chrono::milliseconds(99)); + second_dog_->touch(); + + time_system_->advanceTimeWait(std::chrono::milliseconds(2)); + guard_dog_->forceCheckForTest(); + + EXPECT_THAT(events_, ElementsAre(absl::StrCat( + "MISS : ", api_->threadFactory().currentThreadId().debugString()))); +} + +TEST_P(GuardDogActionsTest, MissShouldBeAbleToReportMultipleThreads) { + const NiceMock config(100, DISABLE_MEGAMISS, DISABLE_KILL, + DISABLE_MULTIKILL, 0, getActionsConfig()); + initGuardDog(fake_stats_, config); + first_dog_ = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + second_dog_ = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + + first_dog_->touch(); + second_dog_->touch(); + + time_system_->advanceTimeWait(std::chrono::milliseconds(101)); + guard_dog_->forceCheckForTest(); + EXPECT_THAT(events_, ElementsAre(absl::StrCat( + "MISS : ", api_->threadFactory().currentThreadId().debugString(), ",", + api_->threadFactory().currentThreadId().debugString()))); +} + +TEST_P(GuardDogActionsTest, MissShouldSaturateOnMissEvent) { + const NiceMock config(100, DISABLE_MISS, DISABLE_KILL, DISABLE_MULTIKILL, + 0, getActionsConfig()); + setupFirstDog(config); + + time_system_->advanceTimeWait(std::chrono::milliseconds(101)); + guard_dog_->forceCheckForTest(); + EXPECT_THAT(events_, ElementsAre(absl::StrCat( + "MISS : ", api_->threadFactory().currentThreadId().debugString()))); + + // Should saturate and not add an additional "event_" + time_system_->advanceTimeWait(std::chrono::milliseconds(101)); + guard_dog_->forceCheckForTest(); + EXPECT_THAT(events_, 
ElementsAre(absl::StrCat( + "MISS : ", api_->threadFactory().currentThreadId().debugString()))); + + // Touch the watchdog, which should allow the event to trigger again. + first_dog_->touch(); + + time_system_->advanceTimeWait(std::chrono::milliseconds(101)); + guard_dog_->forceCheckForTest(); + EXPECT_THAT( + events_, + ElementsAre(absl::StrCat("MISS : ", api_->threadFactory().currentThreadId().debugString()), + absl::StrCat("MISS : ", api_->threadFactory().currentThreadId().debugString()))); +} + +TEST_P(GuardDogActionsTest, MegaMissShouldOnlyReportRelevantThreads) { + const NiceMock config(DISABLE_MISS, 100, DISABLE_KILL, DISABLE_MULTIKILL, + 0, getActionsConfig()); + setupFirstDog(config); + second_dog_ = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + time_system_->advanceTimeWait(std::chrono::milliseconds(99)); + second_dog_->touch(); + + time_system_->advanceTimeWait(std::chrono::milliseconds(2)); + guard_dog_->forceCheckForTest(); + + EXPECT_THAT(events_, ElementsAre(absl::StrCat( + "MEGAMISS : ", api_->threadFactory().currentThreadId().debugString()))); +} + +TEST_P(GuardDogActionsTest, MegaMissShouldBeAbleToReportMultipleThreads) { + const NiceMock config(DISABLE_MISS, 100, DISABLE_KILL, DISABLE_MULTIKILL, + 0, getActionsConfig()); + initGuardDog(fake_stats_, config); + first_dog_ = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + second_dog_ = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + + first_dog_->touch(); + second_dog_->touch(); + + time_system_->advanceTimeWait(std::chrono::milliseconds(101)); + guard_dog_->forceCheckForTest(); + EXPECT_THAT(events_, ElementsAre(absl::StrCat( + "MEGAMISS : ", api_->threadFactory().currentThreadId().debugString(), + ",", api_->threadFactory().currentThreadId().debugString()))); +} + +TEST_P(GuardDogActionsTest, MegaMissShouldSaturateOnMegaMissEvent) { + const NiceMock config(DISABLE_MISS, 100, 
DISABLE_KILL, DISABLE_MULTIKILL, + 0, getActionsConfig()); + setupFirstDog(config); + + time_system_->advanceTimeWait(std::chrono::milliseconds(101)); + guard_dog_->forceCheckForTest(); + EXPECT_THAT(events_, ElementsAre(absl::StrCat( + "MEGAMISS : ", api_->threadFactory().currentThreadId().debugString()))); + + // Should saturate and not add an additional "event_" + time_system_->advanceTimeWait(std::chrono::milliseconds(101)); + guard_dog_->forceCheckForTest(); + EXPECT_THAT(events_, ElementsAre(absl::StrCat( + "MEGAMISS : ", api_->threadFactory().currentThreadId().debugString()))); + + // Touch the watchdog, which should allow the event to trigger again. + first_dog_->touch(); + + time_system_->advanceTimeWait(std::chrono::milliseconds(101)); + guard_dog_->forceCheckForTest(); + EXPECT_THAT( + events_, + ElementsAre( + absl::StrCat("MEGAMISS : ", api_->threadFactory().currentThreadId().debugString()), + absl::StrCat("MEGAMISS : ", api_->threadFactory().currentThreadId().debugString()))); +} + +TEST_P(GuardDogActionsTest, ShouldRespectEventPriority) { + // Priority of events are KILL, MULTIKILL, MEGAMISS and MISS + + // Kill event should fire before the others + auto kill_function = [&]() -> void { + const NiceMock config(100, 100, 100, 100, 0, getActionsConfig()); + initGuardDog(fake_stats_, config); + auto first_dog = + guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + auto second_dog = + guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + time_system_->advanceTimeWait(std::chrono::milliseconds(101)); + guard_dog_->forceCheckForTest(); + }; + + // We expect only the kill action to have fired + EXPECT_DEATH(kill_function(), "ASSERT_GUARDDOG_ACTION"); + + // Multikill event should fire before the others + auto multikill_function = [&]() -> void { + const NiceMock config(100, 100, DISABLE_KILL, 100, 0, + getActionsConfig()); + initGuardDog(fake_stats_, config); + auto first_dog = + 
guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + auto second_dog = + guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + time_system_->advanceTimeWait(std::chrono::milliseconds(101)); + guard_dog_->forceCheckForTest(); + }; + + EXPECT_DEATH(multikill_function(), "ASSERT_GUARDDOG_ACTION"); + + // We expect megamiss to fire before miss + const NiceMock config(100, 100, DISABLE_KILL, DISABLE_MULTIKILL, 0, + getActionsConfig()); + setupFirstDog(config); + time_system_->advanceTimeWait(std::chrono::milliseconds(101)); + guard_dog_->forceCheckForTest(); + EXPECT_THAT( + events_, + ElementsAre( + absl::StrCat("MEGAMISS : ", api_->threadFactory().currentThreadId().debugString()), + absl::StrCat("MISS : ", api_->threadFactory().currentThreadId().debugString()))); +} + +TEST_P(GuardDogActionsTest, KillShouldTriggerGuardDogActions) { + auto die_function = [&]() -> void { + const NiceMock config(DISABLE_MISS, DISABLE_MEGAMISS, 100, 0, 0, + getActionsConfig()); + setupFirstDog(config); + time_system_->advanceTimeWait(std::chrono::milliseconds(101)); + guard_dog_->forceCheckForTest(); + }; + + EXPECT_DEATH(die_function(), "ASSERT_GUARDDOG_ACTION"); +} + +TEST_P(GuardDogActionsTest, MultikillShouldTriggerGuardDogActions) { + auto die_function = [&]() -> void { + const NiceMock config(DISABLE_MISS, DISABLE_MEGAMISS, DISABLE_KILL, + 100, 0, getActionsConfig()); + setupFirstDog(config); + second_dog_ = + guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + guard_dog_->forceCheckForTest(); + time_system_->advanceTimeWait(std::chrono::milliseconds(101)); + guard_dog_->forceCheckForTest(); + }; + + EXPECT_DEATH(die_function(), "ASSERT_GUARDDOG_ACTION"); +} + } // namespace } // namespace Server } // namespace Envoy diff --git a/tools/spelling/spelling_dictionary.txt b/tools/spelling/spelling_dictionary.txt index 1d74fa10a296..1caf69b4da80 100644 --- 
a/tools/spelling/spelling_dictionary.txt +++ b/tools/spelling/spelling_dictionary.txt @@ -26,6 +26,7 @@ CAS CB CDS CEL +LTT ceil CHACHA CHLO