From 437d222385d1d31d8bb19a9e8e2f5989bd404695 Mon Sep 17 00:00:00 2001 From: Stephan Zuercher Date: Wed, 14 Aug 2019 17:38:09 -0700 Subject: [PATCH 1/2] access_log: minimal log file error handling Rather than ASSERT for a reasonably common error condition (e.g. disk full) record a stat that indicates log file writing failed. Also fixes a test race condition. Risk Level: low Testing: added stats checks Docs Changes: documented new stat Release Notes: updated Signed-off-by: Stephan Zuercher --- docs/root/configuration/statistics.rst | 7 +- docs/root/intro/version_history.rst | 1 + .../access_log/access_log_manager_impl.cc | 8 +- .../access_log/access_log_manager_impl.h | 7 +- .../access_log_manager_impl_test.cc | 96 +++++++++++++++++-- test/exe/main_common_test.cc | 8 +- 6 files changed, 107 insertions(+), 20 deletions(-) diff --git a/docs/root/configuration/statistics.rst b/docs/root/configuration/statistics.rst index 1e79f39a0bf9..1ff0f8223fd8 100644 --- a/docs/root/configuration/statistics.rst +++ b/docs/root/configuration/statistics.rst @@ -14,7 +14,7 @@ Server related statistics are rooted at *server.* with following statistics: uptime, Gauge, Current server uptime in seconds concurrency, Gauge, Number of worker threads - memory_allocated, Gauge, Current amount of allocated memory in bytes. Total of both new and old Envoy processes on hot restart. + memory_allocated, Gauge, Current amount of allocated memory in bytes. Total of both new and old Envoy processes on hot restart. memory_heap_size, Gauge, Current reserved heap size in bytes. New Envoy process heap size on hot restart. live, Gauge, "1 if the server is not currently draining, 0 otherwise" state, Gauge, Current :ref:`State ` of the Server. @@ -26,6 +26,8 @@ Server related statistics are rooted at *server.* with following statistics: initialization_time_ms, Histogram, Total time taken for Envoy initialization in milliseconds. This is the time from server start-up until the worker threads are ready to accept new connections debug_assertion_failures, Counter, Number of debug assertion failures detected in a release build if compiled with `--define log_debug_assert_in_release=enabled` or zero otherwise +.. _filesystem_stats: + File system ----------- @@ -36,7 +38,8 @@ Statistics related to file system are emitted in the *filesystem.* namespace. :widths: 1, 1, 2 write_buffered, Counter, Total number of times file data is moved to Envoy's internal flush buffer - write_completed, Counter, Total number of times a file was written + write_completed, Counter, Total number of times a file was successfully written + write_failed, Counter, Total number of times an error occurred during a file write operation flushed_by_timer, Counter, Total number of times internal flush buffers are written to a file due to flush timeout reopen_failed, Counter, Total number of times a file was failed to be opened write_total_buffered, Gauge, Current total size of internal flush buffer in bytes diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index b7ac02962784..ed41eb8feb51 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -4,6 +4,7 @@ Version history 1.12.0 (pending) ================ * access log: added :ref:`buffering ` and :ref:`periodical flushing ` support to gRPC access logger. Defaults to 16KB buffer and flushing every 1 second. +* access log: reintroduce :ref:`filesystem ` stats and added the `write_failed` counter to track failed log writes * admin: added ability to configure listener :ref:`socket options `. * admin: added config dump support for Secret Discovery Service :ref:`SecretConfigDump `. * api: added ::ref:`set_node_on_first_message_only ` option to omit the node identifier from the subsequent discovery requests on the same stream. diff --git a/source/common/access_log/access_log_manager_impl.cc b/source/common/access_log/access_log_manager_impl.cc index dd2615421db1..8a5b1b8ac2de 100644 --- a/source/common/access_log/access_log_manager_impl.cc +++ b/source/common/access_log/access_log_manager_impl.cc @@ -100,8 +100,12 @@ void AccessLogFileImpl::doWrite(Buffer::Instance& buffer) { for (const Buffer::RawSlice& slice : slices) { absl::string_view data(static_cast(slice.mem_), slice.len_); const Api::IoCallSizeResult result = file_->write(data); - ASSERT(result.rc_ == static_cast(slice.len_)); - stats_.write_completed_.inc(); + if (result.ok() && result.rc_ == static_cast(slice.len_)) { + stats_.write_completed_.inc(); + } else { + // Probably disk full. + stats_.write_failed_.inc(); + } } } diff --git a/source/common/access_log/access_log_manager_impl.h b/source/common/access_log/access_log_manager_impl.h index 03efbf2d030b..64c9c2594b49 100644 --- a/source/common/access_log/access_log_manager_impl.h +++ b/source/common/access_log/access_log_manager_impl.h @@ -20,6 +20,7 @@ namespace Envoy { COUNTER(reopen_failed) \ COUNTER(write_buffered) \ COUNTER(write_completed) \ + COUNTER(write_failed) \ GAUGE(write_total_buffered, Accumulate) struct AccessLogFileStats { @@ -34,9 +35,9 @@ class AccessLogManagerImpl : public AccessLogManager { Event::Dispatcher& dispatcher, Thread::BasicLockable& lock, Stats::Store& stats_store) : file_flush_interval_msec_(file_flush_interval_msec), api_(api), dispatcher_(dispatcher), - lock_(lock), file_stats_{ACCESS_LOG_FILE_STATS( - POOL_COUNTER_PREFIX(stats_store, "access_log_file."), - POOL_GAUGE_PREFIX(stats_store, "access_log_file."))} {} + lock_(lock), file_stats_{ + ACCESS_LOG_FILE_STATS(POOL_COUNTER_PREFIX(stats_store, "filesystem."), + POOL_GAUGE_PREFIX(stats_store, "filesystem."))} {} // AccessLog::AccessLogManager void reopen() override; diff --git a/test/common/access_log/access_log_manager_impl_test.cc b/test/common/access_log/access_log_manager_impl_test.cc index 4681e9453a7c..aa9300d3690a 100644 --- a/test/common/access_log/access_log_manager_impl_test.cc +++ b/test/common/access_log/access_log_manager_impl_test.cc @@ -8,6 +8,8 @@ #include "test/mocks/api/mocks.h" #include "test/mocks/event/mocks.h" #include "test/mocks/filesystem/mocks.h" +#include "test/test_common/test_time.h" +#include "test/test_common/utility.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -36,6 +38,14 @@ class AccessLogManagerImplTest : public testing::Test { EXPECT_CALL(api_, threadFactory()).WillRepeatedly(ReturnRef(thread_factory_)); } + void waitForCounterEq(const std::string& name, uint64_t value) { + TestUtility::waitForCounterEq(store_, name, value, time_system_); + } + + void waitForGaugeEq(const std::string& name, uint64_t value) { + TestUtility::waitForGaugeEq(store_, name, value, time_system_); + } + NiceMock api_; NiceMock file_system_; NiceMock* file_; @@ -45,6 +55,7 @@ class AccessLogManagerImplTest : public testing::Test { NiceMock dispatcher_; Thread::MutexBasicLockable lock_; AccessLogManagerImpl access_log_manager_; + Event::TestRealTimeSystem time_system_; }; TEST_F(AccessLogManagerImplTest, BadFile) { @@ -59,9 +70,18 @@ TEST_F(AccessLogManagerImplTest, flushToLogFilePeriodically) { EXPECT_CALL(*file_, open_()).WillOnce(Return(ByMove(Filesystem::resultSuccess(true)))); AccessLogFileSharedPtr log_file = access_log_manager_.createAccessLog("foo"); + EXPECT_EQ(0UL, store_.counter("filesystem.write_failed").value()); + EXPECT_EQ(0UL, store_.counter("filesystem.write_completed").value()); + EXPECT_EQ(0UL, store_.counter("filesystem.flushed_by_timer").value()); + EXPECT_EQ(0UL, store_.counter("filesystem.write_buffered").value()); + EXPECT_CALL(*timer, enableTimer(timeout_40ms_)); EXPECT_CALL(*file_, write_(_)) - .WillOnce(Invoke([](absl::string_view data) -> Api::IoCallSizeResult { + .WillOnce(Invoke([&](absl::string_view data) -> Api::IoCallSizeResult { + EXPECT_EQ( + 4UL, + store_.gauge("filesystem.write_total_buffered", Stats::Gauge::ImportMode::Accumulate) + .value()); EXPECT_EQ(0, data.compare("test")); return Filesystem::resultSuccess(static_cast(data.length())); })); @@ -75,14 +95,25 @@ TEST_F(AccessLogManagerImplTest, flushToLogFilePeriodically) { } } + waitForCounterEq("filesystem.write_completed", 1); + EXPECT_EQ(1UL, store_.counter("filesystem.write_buffered").value()); + EXPECT_EQ(0UL, store_.counter("filesystem.flushed_by_timer").value()); + waitForGaugeEq("filesystem.write_total_buffered", 0); + EXPECT_CALL(*file_, write_(_)) - .WillOnce(Invoke([](absl::string_view data) -> Api::IoCallSizeResult { + .WillOnce(Invoke([&](absl::string_view data) -> Api::IoCallSizeResult { + EXPECT_EQ( + 5UL, + store_.gauge("filesystem.write_total_buffered", Stats::Gauge::ImportMode::Accumulate) + .value()); EXPECT_EQ(0, data.compare("test2")); return Filesystem::resultSuccess(static_cast(data.length())); })); - // make sure timer is re-enabled on callback call log_file->write("test2"); + EXPECT_EQ(2UL, store_.counter("filesystem.write_buffered").value()); + + // make sure timer is re-enabled on callback call EXPECT_CALL(*timer, enableTimer(timeout_40ms_)); timer->callback_(); @@ -92,6 +123,13 @@ TEST_F(AccessLogManagerImplTest, flushToLogFilePeriodically) { file_->write_event_.wait(file_->write_mutex_); } } + + waitForCounterEq("filesystem.write_completed", 2); + EXPECT_EQ(0UL, store_.counter("filesystem.write_failed").value()); + EXPECT_EQ(1UL, store_.counter("filesystem.flushed_by_timer").value()); + EXPECT_EQ(2UL, store_.counter("filesystem.write_buffered").value()); + waitForGaugeEq("filesystem.write_total_buffered", 0); + EXPECT_CALL(*file_, close_()).WillOnce(Return(ByMove(Filesystem::resultSuccess(true)))); } @@ -101,21 +139,24 @@ TEST_F(AccessLogManagerImplTest, flushToLogFileOnDemand) { EXPECT_CALL(*file_, open_()).WillOnce(Return(ByMove(Filesystem::resultSuccess(true)))); AccessLogFileSharedPtr log_file = access_log_manager_.createAccessLog("foo"); + EXPECT_EQ(0UL, store_.counter("filesystem.flushed_by_timer").value()); + EXPECT_CALL(*timer, enableTimer(timeout_40ms_)); - // The first write to a given file will start the flush thread, which can flush - // immediately (race on whether it will or not). So do a write and flush to - // get that state out of the way, then test that small writes don't trigger a flush. + // The first write to a given file will start the flush thread. Because AccessManagerImpl::write + // holds the write_lock_ when the thread is started, the thread will flush on its first loop, once + // it obtains the write_lock_. Perform a write to get all that out of the way. EXPECT_CALL(*file_, write_(_)) .WillOnce(Invoke([](absl::string_view data) -> Api::IoCallSizeResult { return Filesystem::resultSuccess(static_cast(data.length())); })); log_file->write("prime-it"); - log_file->flush(); uint32_t expected_writes = 1; { Thread::LockGuard lock(file_->write_mutex_); - EXPECT_EQ(expected_writes, file_->num_writes_); + while (file_->num_writes_ != expected_writes) { + file_->write_event_.wait(file_->write_mutex_); + } } EXPECT_CALL(*file_, write_(_)) @@ -135,9 +176,14 @@ TEST_F(AccessLogManagerImplTest, flushToLogFileOnDemand) { expected_writes++; { Thread::LockGuard lock(file_->write_mutex_); - EXPECT_EQ(expected_writes, file_->num_writes_); + while (file_->num_writes_ != expected_writes) { + file_->write_event_.wait(file_->write_mutex_); + } } + waitForCounterEq("filesystem.write_completed", 2); + EXPECT_EQ(0UL, store_.counter("filesystem.flushed_by_timer").value()); + EXPECT_CALL(*file_, write_(_)) .WillOnce(Invoke([](absl::string_view data) -> Api::IoCallSizeResult { EXPECT_EQ(0, data.compare("test2")); @@ -159,6 +205,36 @@ TEST_F(AccessLogManagerImplTest, flushToLogFileOnDemand) { EXPECT_CALL(*file_, close_()).WillOnce(Return(ByMove(Filesystem::resultSuccess(true)))); } +TEST_F(AccessLogManagerImplTest, flushCountsIOErrors) { + NiceMock* timer = new NiceMock(&dispatcher_); + + EXPECT_CALL(*file_, open_()).WillOnce(Return(ByMove(Filesystem::resultSuccess(true)))); + AccessLogFileSharedPtr log_file = access_log_manager_.createAccessLog("foo"); + + EXPECT_EQ(0UL, store_.counter("filesystem.write_failed").value()); + + EXPECT_CALL(*timer, enableTimer(timeout_40ms_)); + EXPECT_CALL(*file_, write_(_)) + .WillOnce(Invoke([](absl::string_view data) -> Api::IoCallSizeResult { + EXPECT_EQ(0, data.compare("test")); + return Filesystem::resultFailure(2UL, ENOSPC); + })); + + log_file->write("test"); + + { + Thread::LockGuard lock(file_->write_mutex_); + while (file_->num_writes_ != 1) { + file_->write_event_.wait(file_->write_mutex_); + } + } + + waitForCounterEq("filesystem.write_failed", 1); + EXPECT_EQ(0UL, store_.counter("filesystem.write_completed").value()); + + EXPECT_CALL(*file_, close_()).WillOnce(Return(ByMove(Filesystem::resultSuccess(true)))); +} + TEST_F(AccessLogManagerImplTest, reopenFile) { NiceMock* timer = new NiceMock(&dispatcher_); @@ -259,6 +335,8 @@ TEST_F(AccessLogManagerImplTest, reopenThrows) { // write call should not cause any exceptions log_file->write("random data"); timer->callback_(); + + waitForCounterEq("filesystem.reopen_failed", 1); } TEST_F(AccessLogManagerImplTest, bigDataChunkShouldBeFlushedWithoutTimer) { diff --git a/test/exe/main_common_test.cc b/test/exe/main_common_test.cc index 4dadcb31875f..7af5aee32c6a 100644 --- a/test/exe/main_common_test.cc +++ b/test/exe/main_common_test.cc @@ -240,7 +240,7 @@ class AdminRequestTest : public MainCommonTest { TEST_P(AdminRequestTest, AdminRequestGetStatsAndQuit) { startEnvoy(); started_.WaitForNotification(); - EXPECT_THAT(adminRequest("/stats", "GET"), HasSubstr("access_log_file.reopen_failed")); + EXPECT_THAT(adminRequest("/stats", "GET"), HasSubstr("filesystem.reopen_failed")); adminRequest("/quitquitquit", "POST"); EXPECT_TRUE(waitForEnvoyToExit()); } @@ -253,7 +253,7 @@ TEST_P(AdminRequestTest, AdminRequestGetStatsAndKill) { // TODO(htuch): Remove when https://github.com/libevent/libevent/issues/779 is // fixed, started_ will then become our real synchronization point. waitForEnvoyRun(); - EXPECT_THAT(adminRequest("/stats", "GET"), HasSubstr("access_log_file.reopen_failed")); + EXPECT_THAT(adminRequest("/stats", "GET"), HasSubstr("filesystem.reopen_failed")); kill(getpid(), SIGTERM); EXPECT_TRUE(waitForEnvoyToExit()); } @@ -266,7 +266,7 @@ TEST_P(AdminRequestTest, AdminRequestGetStatsAndCtrlC) { // TODO(htuch): Remove when https://github.com/libevent/libevent/issues/779 is // fixed, started_ will then become our real synchronization point. waitForEnvoyRun(); - EXPECT_THAT(adminRequest("/stats", "GET"), HasSubstr("access_log_file.reopen_failed")); + EXPECT_THAT(adminRequest("/stats", "GET"), HasSubstr("filesystem.reopen_failed")); kill(getpid(), SIGINT); EXPECT_TRUE(waitForEnvoyToExit()); } @@ -335,7 +335,7 @@ TEST_P(AdminRequestTest, AdminRequestBeforeRun) { EXPECT_TRUE(admin_handler_was_called); // This just checks that some stat output was reported. We could pick any stat. - EXPECT_THAT(out, HasSubstr("access_log_file.reopen_failed")); + EXPECT_THAT(out, HasSubstr("filesystem.reopen_failed")); } // Class to track whether an object has been destroyed, which it does by bumping an atomic. From 76c432618fe08cbb3c27dfdbcc78f67b92b7ae1f Mon Sep 17 00:00:00 2001 From: Stephan Zuercher Date: Mon, 26 Aug 2019 19:44:27 -0700 Subject: [PATCH 2/2] fix sort order Signed-off-by: Stephan Zuercher --- docs/root/intro/version_history.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index c743b2e55951..5152e89fdc65 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -4,8 +4,8 @@ Version history 1.12.0 (pending) ================ * access log: added :ref:`buffering ` and :ref:`periodical flushing ` support to gRPC access logger. Defaults to 16KB buffer and flushing every 1 second. -* access log: reintroduce :ref:`filesystem ` stats and added the `write_failed` counter to track failed log writes * access log: gRPC Access Log Service (ALS) support added for :ref:`TCP access logs `. +* access log: reintroduce :ref:`filesystem ` stats and added the `write_failed` counter to track failed log writes * admin: added ability to configure listener :ref:`socket options `. * admin: added config dump support for Secret Discovery Service :ref:`SecretConfigDump `. * api: added ::ref:`set_node_on_first_message_only ` option to omit the node identifier from the subsequent discovery requests on the same stream.