Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a CPU utilization resource monitor for overload manager #34713

Merged
merged 22 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ extensions/filters/common/original_src @klarose @mattklein123
/*/extensions/resource_monitors/common @eziskind @yanavlasov @nezdolik
/*/extensions/resource_monitors/fixed_heap @eziskind @yanavlasov @nezdolik
/*/extensions/resource_monitors/downstream_connections @nezdolik @mattklein123
/*/extensions/resource_monitors/cpu_utilization @cancecen @kbaichoo
/*/extensions/retry/priority @alyssawilk @mattklein123
/*/extensions/retry/priority/previous_priorities @alyssawilk @mattklein123
/*/extensions/retry/host @alyssawilk @mattklein123
Expand Down
1 change: 1 addition & 0 deletions api/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,7 @@ proto_library(
"//envoy/extensions/rbac/matchers/upstream_ip_port/v3:pkg",
"//envoy/extensions/regex_engines/v3:pkg",
"//envoy/extensions/request_id/uuid/v3:pkg",
"//envoy/extensions/resource_monitors/cpu_utilization/v3:pkg",
"//envoy/extensions/resource_monitors/downstream_connections/v3:pkg",
"//envoy/extensions/resource_monitors/fixed_heap/v3:pkg",
"//envoy/extensions/resource_monitors/injected_resource/v3:pkg",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# DO NOT EDIT. This file is generated by tools/proto_format/proto_sync.py.

load("@envoy_api//bazel:api_build_system.bzl", "api_proto_package")

licenses(["notice"]) # Apache 2

api_proto_package(
deps = ["@com_github_cncf_xds//udpa/annotations:pkg"],
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
syntax = "proto3";

package envoy.extensions.resource_monitors.cpu_utilization.v3;

import "udpa/annotations/status.proto";

option java_package = "io.envoyproxy.envoy.extensions.resource_monitors.cpu_utilization.v3";
option java_outer_classname = "CpuUtilizationProto";
option java_multiple_files = true;
option go_package = "github.com/envoyproxy/go-control-plane/envoy/extensions/resource_monitors/cpu_utilization/v3;cpu_utilizationv3";
option (udpa.annotations.file_status).package_version_status = ACTIVE;

// [#protodoc-title: CPU utilization]
// [#extension: envoy.resource_monitors.cpu_utilization]

// The CPU utilization resource monitor reports the CPU utilization of the entire host to the Envoy process.
// Today, this only works on Linux and is calculated using the stats in the /proc/stat file.
message CpuUtilizationConfig {
}
1 change: 1 addition & 0 deletions api/versioning/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ proto_library(
"//envoy/extensions/rbac/matchers/upstream_ip_port/v3:pkg",
"//envoy/extensions/regex_engines/v3:pkg",
"//envoy/extensions/request_id/uuid/v3:pkg",
"//envoy/extensions/resource_monitors/cpu_utilization/v3:pkg",
"//envoy/extensions/resource_monitors/downstream_connections/v3:pkg",
"//envoy/extensions/resource_monitors/fixed_heap/v3:pkg",
"//envoy/extensions/resource_monitors/injected_resource/v3:pkg",
Expand Down
4 changes: 4 additions & 0 deletions changelogs/current.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -312,5 +312,9 @@ new_features:
change: |
The :ref:`xff <envoy_v3_api_msg_extensions.http.original_ip_detection.xff.v3.XffConfig>`
original IP detection method now supports using a list of trusted CIDRs when parsing ``x-forwarded-for``.
- area: resource_monitors
change: |
Added the ability to monitor CPU utilization on Linux-based systems via the :ref:`cpu utilization monitor
<envoy_v3_api_msg_extensions.resource_monitors.cpu_utilization.v3.CpuUtilizationConfig>` in the overload manager.

deprecated:
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
static_resources:
listeners:
- address:
socket_address:
address: 0.0.0.0
port_value: 8000
filter_chains:
- filters:
- name: envoy.filters.network.http_connection_manager
typed_config:
'@type': type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
stat_prefix: ingress_http
http_filters:
- name: envoy.filters.http.router
typed_config:
'@type': type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
route_config:
name: local_route
virtual_hosts:
- domains:
- '*'
name: local_service
routes:
- match: {prefix: "/"}
route: {cluster: default_service}
clusters:
- name: default_service
load_assignment:
cluster_name: default_service
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: 127.0.0.1
port_value: 10001
admin:
address:
socket_address:
address: 0.0.0.0
port_value: 9901

overload_manager:
refresh_interval: 0.25s
resource_monitors:
- name: "envoy.resource_monitors.cpu_utilization"
typed_config:
"@type": type.googleapis.com/envoy.extensions.resource_monitors.cpu_utilization.v3.CpuUtilizationConfig
actions:
- name: "envoy.overload_actions.stop_accepting_requests"
triggers:
- name: "envoy.resource_monitors.cpu_utilization"
scaled:
scaling_threshold: 0.80
saturation_threshold: 0.95
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,28 @@ It's expected that the first few gradations shouldn't trigger anything, unless
there's something seriously wrong e.g. in this example streams using ``>=
128MiB`` in buffers.

CPU Intensive Workload Brownout Protection
------------------------------------------

The ``envoy.overload_actions.stop_accepting_requests`` overload action can be used
to protect workloads from browning out when an unexpected spike in the number of
requests the workload receives causes the CPU to become saturated. This overload
action when used in conjunction with the ``envoy.resource_monitors.cpu_utilization``
resource monitor can reduce the pressure on the CPU by cheaply rejecting new requests.
While the real mitigation for such request spikes is horizontally scaling the workload,
this overload action can be used to ensure the fleet does not get into a cascading failure
mode.
Some platform owners may choose to install this overload action by default to protect the fleet,
since it is easier to configure a target CPU utilization percentage than to configure a request rate per
workload.

.. literalinclude:: _include/cpu_utilization_monitor_overload.yaml
:language: yaml
:lines: 43-55
:emphasize-lines: 3-13
:linenos:
:caption: :download:`cpu_utilization_monitor_overload.yaml <_include/cpu_utilization_monitor_overload.yaml>`


Statistics
----------
Expand Down Expand Up @@ -388,4 +410,3 @@ with the following statistics:

scale_percent, Gauge, "Scaled value of the action as a percent (0-99=scaling, 100=saturated)"
shed_load_count, Counter, "Total count the load is shed"

1 change: 1 addition & 0 deletions source/extensions/extensions_build_config.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ EXTENSIONS = {
"envoy.resource_monitors.fixed_heap": "//source/extensions/resource_monitors/fixed_heap:config",
"envoy.resource_monitors.injected_resource": "//source/extensions/resource_monitors/injected_resource:config",
"envoy.resource_monitors.global_downstream_max_connections": "//source/extensions/resource_monitors/downstream_connections:config",
"envoy.resource_monitors.cpu_utilization": "//source/extensions/resource_monitors/cpu_utilization:config",

#
# Stat sinks
Expand Down
7 changes: 7 additions & 0 deletions source/extensions/extensions_metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1077,6 +1077,13 @@ envoy.request_id.uuid:
status: stable
type_urls:
- envoy.extensions.request_id.uuid.v3.UuidRequestIdConfig
envoy.resource_monitors.cpu_utilization:
categories:
- envoy.resource_monitors
security_posture: data_plane_agnostic
status: alpha
type_urls:
- envoy.extensions.resource_monitors.cpu_utilization.v3.CpuUtilizationConfig
envoy.resource_monitors.global_downstream_max_connections:
categories:
- envoy.resource_monitors
Expand Down
59 changes: 59 additions & 0 deletions source/extensions/resource_monitors/cpu_utilization/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
load(
"//bazel:envoy_build_system.bzl",
"envoy_cc_extension",
"envoy_cc_library",
"envoy_extension_package",
)

licenses(["notice"]) # Apache 2

envoy_extension_package()

envoy_cc_library(
name = "cpu_utilization_monitor",
srcs = ["cpu_utilization_monitor.cc"],
hdrs = [
"cpu_stats_reader.h",
"cpu_utilization_monitor.h",
],
tags = ["skip_on_windows"],
deps = [
"//envoy/common:exception_lib",
"//envoy/server:resource_monitor_config_interface",
"//source/common/runtime:runtime_features_lib",
"@envoy_api//envoy/extensions/resource_monitors/cpu_utilization/v3:pkg_cc_proto",
],
)

envoy_cc_library(
name = "linux_cpu_stats_reader",
srcs = ["linux_cpu_stats_reader.cc"],
hdrs = [
"cpu_stats_reader.h",
"cpu_utilization_monitor.h",
"linux_cpu_stats_reader.h",
],
tags = ["skip_on_windows"],
deps = [
"//source/common/common:logger_lib",
"@com_google_absl//absl/strings",
"@envoy_api//envoy/extensions/resource_monitors/cpu_utilization/v3:pkg_cc_proto",
],
)

envoy_cc_extension(
name = "config",
srcs = ["config.cc"],
hdrs = ["config.h"],
tags = ["skip_on_windows"],
deps = [
":cpu_utilization_monitor",
":linux_cpu_stats_reader",
"//envoy/registry",
"//envoy/server:resource_monitor_config_interface",
"//source/common/common:logger_lib",
"//source/extensions/resource_monitors/common:factory_base_lib",
"//source/server:configuration_lib",
"@envoy_api//envoy/extensions/resource_monitors/cpu_utilization/v3:pkg_cc_proto",
],
)
32 changes: 32 additions & 0 deletions source/extensions/resource_monitors/cpu_utilization/config.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#include "source/extensions/resource_monitors/cpu_utilization/config.h"

#include "envoy/extensions/resource_monitors/cpu_utilization/v3/cpu_utilization.pb.h"
#include "envoy/extensions/resource_monitors/cpu_utilization/v3/cpu_utilization.pb.validate.h"
#include "envoy/registry/registry.h"

#include "source/common/protobuf/utility.h"
#include "source/extensions/resource_monitors/cpu_utilization/cpu_utilization_monitor.h"
#include "source/extensions/resource_monitors/cpu_utilization/linux_cpu_stats_reader.h"

namespace Envoy {
namespace Extensions {
namespace ResourceMonitors {
namespace CpuUtilizationMonitor {

/**
 * Creates the CPU utilization resource monitor from its typed configuration.
 *
 * A LinuxCpuStatsReader is constructed and handed to the monitor together with the config.
 * The factory context is not used.
 *
 * @param config the CpuUtilizationConfig proto, forwarded to the CpuUtilizationMonitor.
 * @return the newly created monitor, owning the stats reader.
 */
Server::ResourceMonitorPtr CpuUtilizationMonitorFactory::createResourceMonitorFromProtoTyped(
    const envoy::extensions::resource_monitors::cpu_utilization::v3::CpuUtilizationConfig& config,
    Server::Configuration::ResourceMonitorFactoryContext& /*unused_context*/) {
  // In the future, the below can be configurable based on the operating system.
  auto cpu_stats_reader = std::make_unique<LinuxCpuStatsReader>();
  return std::make_unique<CpuUtilizationMonitor>(config, std::move(cpu_stats_reader));
}

/**
* Static registration for the cpu resource monitor factory. @see RegistryFactory.
*/
REGISTER_FACTORY(CpuUtilizationMonitorFactory, Server::Configuration::ResourceMonitorFactory);

} // namespace CpuUtilizationMonitor
} // namespace ResourceMonitors
} // namespace Extensions
} // namespace Envoy
29 changes: 29 additions & 0 deletions source/extensions/resource_monitors/cpu_utilization/config.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma once

#include "envoy/extensions/resource_monitors/cpu_utilization/v3/cpu_utilization.pb.h"
#include "envoy/extensions/resource_monitors/cpu_utilization/v3/cpu_utilization.pb.validate.h"
#include "envoy/server/resource_monitor_config.h"

#include "source/extensions/resource_monitors/common/factory_base.h"

namespace Envoy {
namespace Extensions {
namespace ResourceMonitors {
namespace CpuUtilizationMonitor {

/**
 * Factory for the CPU utilization resource monitor. It passes the name
 * "envoy.resource_monitors.cpu_utilization" to the common FactoryBase and overrides the typed
 * creation hook for CpuUtilizationConfig.
 */
class CpuUtilizationMonitorFactory
    : public Common::FactoryBase<
          envoy::extensions::resource_monitors::cpu_utilization::v3::CpuUtilizationConfig> {
  // Shorthand for the long, fully qualified types used in the override below.
  using MonitorConfig =
      envoy::extensions::resource_monitors::cpu_utilization::v3::CpuUtilizationConfig;
  using MonitorFactoryContext = Server::Configuration::ResourceMonitorFactoryContext;

public:
  CpuUtilizationMonitorFactory() : FactoryBase("envoy.resource_monitors.cpu_utilization") {}

private:
  // Builds the monitor from the typed proto config; defined out of line.
  Server::ResourceMonitorPtr
  createResourceMonitorFromProtoTyped(const MonitorConfig& config,
                                      MonitorFactoryContext& context) override;
};

} // namespace CpuUtilizationMonitor
} // namespace ResourceMonitors
} // namespace Extensions
} // namespace Envoy
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#pragma once

#include <dirent.h>
#include <unistd.h>

#include <algorithm>
#include <filesystem>
#include <fstream>

#include "source/common/common/logger.h"

#include "absl/strings/str_split.h"

namespace Envoy {
namespace Extensions {
namespace ResourceMonitors {
namespace CpuUtilizationMonitor {

/**
 * CPU time reading returned by CpuStatsReader::getCpuTimes().
 *
 * Every member has a default initializer so that a default-constructed CpuTimes is a
 * well-defined "invalid, zero" reading rather than holding indeterminate values. Aggregate
 * initialization such as CpuTimes{true, work, total} keeps working.
 */
struct CpuTimes {
  // NOTE(review): presumably false when the reading could not be obtained; confirm against
  // the reader implementations.
  bool is_valid{false};
  // NOTE(review): presumably the non-idle portion of total_time; units are defined by the
  // reader implementation.
  uint64_t work_time{0};
  uint64_t total_time{0};
};

/**
 * Abstract interface for obtaining CPU time readings. Concrete readers override
 * getCpuTimes().
 */
class CpuStatsReader {
public:
  // Virtual so that derived readers are destroyed correctly through a base pointer.
  virtual ~CpuStatsReader() = default;
  CpuStatsReader() = default;

  /**
   * @return a CpuTimes reading produced by the concrete reader.
   */
  virtual CpuTimes getCpuTimes() = 0;
};

} // namespace CpuUtilizationMonitor
} // namespace ResourceMonitors
} // namespace Extensions
} // namespace Envoy
Loading