Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Add experimental configuration option to allow disabling legacy Prometheus metric names. #13540

Merged
merged 16 commits into from
Aug 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/13540.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add experimental configuration option to allow disabling legacy Prometheus metric names.
39 changes: 36 additions & 3 deletions synapse/app/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,15 +266,48 @@ async def wrapper() -> None:
reactor.callWhenRunning(lambda: defer.ensureDeferred(wrapper()))


def listen_metrics(bind_addresses: Iterable[str], port: int) -> None:
def listen_metrics(
bind_addresses: Iterable[str], port: int, enable_legacy_metric_names: bool
) -> None:
"""
Start Prometheus metrics server.
"""
from synapse.metrics import RegistryProxy, start_http_server
from prometheus_client import start_http_server as start_http_server_prometheus

from synapse.metrics import (
RegistryProxy,
start_http_server as start_http_server_legacy,
)

for host in bind_addresses:
logger.info("Starting metrics listener on %s:%d", host, port)
start_http_server(port, addr=host, registry=RegistryProxy)
if enable_legacy_metric_names:
start_http_server_legacy(port, addr=host, registry=RegistryProxy)
else:
_set_prometheus_client_use_created_metrics(False)
start_http_server_prometheus(port, addr=host, registry=RegistryProxy)


def _set_prometheus_client_use_created_metrics(new_value: bool) -> None:
"""
Sets whether prometheus_client should expose `_created`-suffixed metrics for
all gauges, histograms and summaries.
There is no programmatic way to disable this without poking at internals;
the proper way is to use an environment variable which prometheus_client
loads at import time.

The motivation for disabling these `_created` metrics is that they're
a waste of space as they're not useful but they take up space in Prometheus.
"""

import prometheus_client.metrics

if hasattr(prometheus_client.metrics, "_use_created"):
prometheus_client.metrics._use_created = new_value
else:
logger.error(
"Can't disable `_created` metrics in prometheus_client (brittle hack broken?)"
)


def listen_manhole(
Expand Down
6 changes: 5 additions & 1 deletion synapse/app/generic_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,11 @@ def start_listening(self) -> None:
"enable_metrics is not True!"
)
else:
_base.listen_metrics(listener.bind_addresses, listener.port)
_base.listen_metrics(
listener.bind_addresses,
listener.port,
enable_legacy_metric_names=self.config.metrics.enable_legacy_metrics,
)
else:
logger.warning("Unsupported listener type: %s", listener.type)

Expand Down
6 changes: 5 additions & 1 deletion synapse/app/homeserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,11 @@ def start_listening(self) -> None:
"enable_metrics is not True!"
)
else:
_base.listen_metrics(listener.bind_addresses, listener.port)
_base.listen_metrics(
listener.bind_addresses,
listener.port,
enable_legacy_metric_names=self.config.metrics.enable_legacy_metrics,
)
else:
# this shouldn't happen, as the listener type should have been checked
# during parsing
Expand Down
29 changes: 29 additions & 0 deletions synapse/config/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,35 @@ class MetricsConfig(Config):

def read_config(self, config: JsonDict, **kwargs: Any) -> None:
self.enable_metrics = config.get("enable_metrics", False)

"""
### `enable_legacy_metrics` (experimental)

**Experimental: this option may be removed or have its behaviour
changed at any time, with no notice.**

Set to `true` to publish both legacy and non-legacy Prometheus metric names,
or to `false` to only publish non-legacy Prometheus metric names.
Defaults to `true`. Has no effect if `enable_metrics` is `false`.

Legacy metric names include:
- metrics containing colons in the name, such as `synapse_util_caches_response_cache:hits`, because colons are supposed to be reserved for user-defined recording rules;
- counters that don't end with the `_total` suffix, such as `synapse_federation_client_sent_edus`, therefore not adhering to the OpenMetrics standard.

These legacy metric names are unconventional and not compliant with OpenMetrics standards.
They are included for backwards compatibility.

Example configuration:
```yaml
enable_legacy_metrics: false
```

See https://github.com/matrix-org/synapse/issues/11106 for context.

*Since v1.67.0.*
"""
self.enable_legacy_metrics = config.get("enable_legacy_metrics", True)

self.report_stats = config.get("report_stats", None)
self.report_stats_endpoint = config.get(
"report_stats_endpoint", "https://matrix.org/report-usage-stats/push"
Expand Down
4 changes: 2 additions & 2 deletions synapse/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,12 @@

# This module is imported for its side effects; flake8 needn't warn that it's unused.
import synapse.metrics._reactor_metrics # noqa: F401
from synapse.metrics._exposition import (
from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager
from synapse.metrics._legacy_exposition import (
MetricsResource,
generate_latest,
start_http_server,
)
from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager
from synapse.metrics._types import Collector
from synapse.util import SYNAPSE_VERSION

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,27 @@ def sample_line(line: Sample, name: str) -> str:
return "{}{} {}{}\n".format(name, labelstr, floatToGoString(line.value), timestamp)


# Mapping from new metric names to legacy metric names.
# We translate these back to their old names when exposing them through our
# legacy vendored exporter.
# Only this legacy exposition module applies these name changes.
LEGACY_METRIC_NAMES = {
"synapse_util_caches_cache_hits": "synapse_util_caches_cache:hits",
"synapse_util_caches_cache_size": "synapse_util_caches_cache:size",
"synapse_util_caches_cache_evicted_size": "synapse_util_caches_cache:evicted_size",
"synapse_util_caches_cache_total": "synapse_util_caches_cache:total",
"synapse_util_caches_response_cache_size": "synapse_util_caches_response_cache:size",
"synapse_util_caches_response_cache_hits": "synapse_util_caches_response_cache:hits",
"synapse_util_caches_response_cache_evicted_size": "synapse_util_caches_response_cache:evicted_size",
"synapse_util_caches_response_cache_total": "synapse_util_caches_response_cache:total",
}


def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> bytes:
"""
Generate metrics in legacy format. Modern metrics are generated directly
by prometheus-client.
"""

# Trigger the cache metrics to be rescraped, which updates the common
# metrics but do not produce metrics themselves
Expand All @@ -94,7 +114,8 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
# No samples, don't bother.
continue

mname = metric.name
# Translate to legacy metric name if it has one.
mname = LEGACY_METRIC_NAMES.get(metric.name, metric.name)
mnewname = metric.name
mtype = metric.type

Expand Down Expand Up @@ -124,7 +145,7 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
om_samples: Dict[str, List[str]] = {}
for s in metric.samples:
for suffix in ["_created", "_gsum", "_gcount"]:
if s.name == metric.name + suffix:
if s.name == mname + suffix:
# OpenMetrics specific sample, put in a gauge at the end.
# (these come from gaugehistograms which don't get renamed,
# so no need to faff with mnewname)
Expand All @@ -140,12 +161,12 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
if emit_help:
output.append(
"# HELP {}{} {}\n".format(
metric.name,
mname,
suffix,
metric.documentation.replace("\\", r"\\").replace("\n", r"\n"),
)
)
output.append(f"# TYPE {metric.name}{suffix} gauge\n")
output.append(f"# TYPE {mname}{suffix} gauge\n")
output.extend(lines)

# Get rid of the weird colon things while we're at it
Expand All @@ -170,11 +191,12 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
# Get rid of the OpenMetrics specific samples (we should already have
# dealt with them above anyway.)
for suffix in ["_created", "_gsum", "_gcount"]:
if s.name == metric.name + suffix:
if s.name == mname + suffix:
break
else:
sample_name = LEGACY_METRIC_NAMES.get(s.name, s.name)
output.append(
sample_line(s, s.name.replace(":total", "").replace(":", "_"))
sample_line(s, sample_name.replace(":total", "").replace(":", "_"))
)

return "".join(output).encode("utf-8")
Expand Down
16 changes: 8 additions & 8 deletions synapse/util/caches/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,23 +34,23 @@
caches_by_name: Dict[str, Sized] = {}
collectors_by_name: Dict[str, "CacheMetric"] = {}

cache_size = Gauge("synapse_util_caches_cache:size", "", ["name"])
cache_hits = Gauge("synapse_util_caches_cache:hits", "", ["name"])
cache_evicted = Gauge("synapse_util_caches_cache:evicted_size", "", ["name", "reason"])
cache_total = Gauge("synapse_util_caches_cache:total", "", ["name"])
cache_size = Gauge("synapse_util_caches_cache_size", "", ["name"])
cache_hits = Gauge("synapse_util_caches_cache_hits", "", ["name"])
cache_evicted = Gauge("synapse_util_caches_cache_evicted_size", "", ["name", "reason"])
cache_total = Gauge("synapse_util_caches_cache_total", "", ["name"])
cache_max_size = Gauge("synapse_util_caches_cache_max_size", "", ["name"])
cache_memory_usage = Gauge(
"synapse_util_caches_cache_size_bytes",
"Estimated memory usage of the caches",
["name"],
)

response_cache_size = Gauge("synapse_util_caches_response_cache:size", "", ["name"])
response_cache_hits = Gauge("synapse_util_caches_response_cache:hits", "", ["name"])
response_cache_size = Gauge("synapse_util_caches_response_cache_size", "", ["name"])
response_cache_hits = Gauge("synapse_util_caches_response_cache_hits", "", ["name"])
response_cache_evicted = Gauge(
"synapse_util_caches_response_cache:evicted_size", "", ["name", "reason"]
"synapse_util_caches_response_cache_evicted_size", "", ["name", "reason"]
)
response_cache_total = Gauge("synapse_util_caches_response_cache:total", "", ["name"])
response_cache_total = Gauge("synapse_util_caches_response_cache_total", "", ["name"])


class EvictionReason(Enum):
Expand Down
36 changes: 36 additions & 0 deletions tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
try:
from importlib import metadata
except ImportError:
import importlib_metadata as metadata # type: ignore[no-redef]

from unittest.mock import patch

from pkg_resources import parse_version

from synapse.app._base import _set_prometheus_client_use_created_metrics
from synapse.metrics import REGISTRY, InFlightGauge, generate_latest
from synapse.util.caches.deferred_cache import DeferredCache

Expand Down Expand Up @@ -162,3 +171,30 @@ def test_cache_metric(self):

self.assertEqual(items["synapse_util_caches_cache_size"], "1.0")
self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0")


class PrometheusMetricsHackTestCase(unittest.HomeserverTestCase):
if parse_version(metadata.version("prometheus_client")) < parse_version("0.14.0"):
skip = "prometheus-client too old"

def test_created_metrics_disabled(self) -> None:
"""
Tests that a brittle hack, to disable `_created` metrics, works.
This involves poking at the internals of prometheus-client.
It's not the end of the world if this doesn't work.

This test gives us a way to notice if prometheus-client changes
their internals.
"""
import prometheus_client.metrics

PRIVATE_FLAG_NAME = "_use_created"

# By default, the pesky `_created` metrics are enabled.
# Check this assumption is still valid.
self.assertTrue(getattr(prometheus_client.metrics, PRIVATE_FLAG_NAME))

with patch("prometheus_client.metrics") as mock:
setattr(mock, PRIVATE_FLAG_NAME, True)
_set_prometheus_client_use_created_metrics(False)
self.assertFalse(getattr(mock, PRIVATE_FLAG_NAME, False))