Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial Caffeine CacheStats recorder #1897

Merged
merged 16 commits into from
Mar 7, 2024
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,21 @@ Service instrumentedService = Tritium.instrument(Service.class,
interestingService, environment.metrics());
```

## Instrumenting a [Caffeine cache](https://github.com/ben-manes/caffeine/)

```java
import com.palantir.tritium.metrics.caffeine.CacheStats;

TaggedMetricRegistry taggedMetricRegistry = ...
Cache<Integer, String> cache = cacheStats.register(Caffeine.newBuilder()
.recordStats(CacheStats.of(taggedMetricRegistry, "unique-cache-name"))
.build());

LoadingCache<String, Integer> loadingCache = loadingCacheStats.register(Caffeine.newBuilder()
.recordStats(CacheStats.of(taggedMetricRegistry, "unique-loading-cache-name"))
.build(key::length));
```

## Creating a metric registry with reservoirs backed by [HDR Histograms](https://hdrhistogram.github.io/HdrHistogram/).

HDR histograms are more useful if the service is long running, so the stats represents the lifetime of the server rather than using default exponential decay which can lead to some mis-interpretations of timings (especially higher percentiles and things like max dropping over time) if the consumer isn't aware of these assumptions.
Expand Down
5 changes: 5 additions & 0 deletions changelog/@unreleased/pr-1897.v2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
type: improvement
improvement:
description: Initial Caffeine CacheStats recorder
links:
- https://github.com/palantir/tritium/pull/1897
2 changes: 2 additions & 0 deletions tritium-caffeine/build.gradle
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
apply plugin: 'com.palantir.external-publish-jar'
apply plugin: 'com.palantir.metric-schema'

dependencies {
api 'com.github.ben-manes.caffeine:caffeine'
Expand All @@ -11,6 +12,7 @@ dependencies {
implementation 'com.palantir.safe-logging:preconditions'
implementation 'com.palantir.safe-logging:safe-logging'
implementation 'io.dropwizard.metrics:metrics-core'
implementation 'org.checkerframework:checker-qual'

testImplementation 'org.assertj:assertj-core'
testImplementation 'org.awaitility:awaitility'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/*
* (c) Copyright 2024 Palantir Technologies Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.palantir.tritium.metrics.caffeine;

import com.codahale.metrics.Counter;
import com.github.benmanes.caffeine.cache.RemovalCause;
import com.github.benmanes.caffeine.cache.stats.StatsCounter;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import com.palantir.logsafe.Safe;
import com.palantir.tritium.metrics.registry.TaggedMetricRegistry;
import java.util.Arrays;
import java.util.concurrent.atomic.LongAdder;
import java.util.function.Supplier;
import org.checkerframework.checker.index.qual.NonNegative;

public final class CacheStats implements StatsCounter, Supplier<StatsCounter> {
private final String name;
private final Counter hitCounter;
private final Counter missCounter;
private final Counter loadSuccessCounter;
private final Counter loadFailureCounter;
private final Counter evictionsTotalCounter;
private final ImmutableMap<RemovalCause, Counter> evictionCounters;
private final LongAdder totalLoadNanos = new LongAdder();

/**
* Creates a {@link CacheStats} instance that registers metrics for Caffeine cache statistics.
* <p>
* Example usage for a {@link com.github.benmanes.caffeine.cache.Cache} or
* {@link com.github.benmanes.caffeine.cache.LoadingCache}:
* <pre>
* LoadingCache&lt;Integer, String> cache = Caffeine.newBuilder()
* .recordStats(CacheStats.of(taggedMetricRegistry, "your-cache-name"))
* .build(key -> computeSomethingExpensive(key));
* </pre>
* @param taggedMetricRegistry tagged metric registry to add cache metrics
* @param name cache name
* @return Caffeine stats instance to register via
* {@link com.github.benmanes.caffeine.cache.Caffeine#recordStats(Supplier)}.
*/
public static CacheStats of(TaggedMetricRegistry taggedMetricRegistry, @Safe String name) {
return new CacheStats(CacheMetrics.of(taggedMetricRegistry), name);
}

private CacheStats(CacheMetrics metrics, @Safe String name) {
this.name = name;
this.hitCounter = metrics.hitCount(name);
this.missCounter = metrics.missCount(name);
this.loadSuccessCounter = metrics.loadSuccessCount(name);
this.loadFailureCounter = metrics.loadFailureCount(name);
this.evictionsTotalCounter = metrics.evictionCount(name);
this.evictionCounters = Arrays.stream(RemovalCause.values())
.collect(Maps.toImmutableEnumMap(cause -> cause, cause -> metrics.evictions()
.cache(name)
.cause(cause.toString())
.build()));
}

@Override
public StatsCounter get() {
return this;
}

@Override
public void recordHits(@NonNegative int count) {
hitCounter.inc(count);
}

@Override
public void recordMisses(@NonNegative int count) {
missCounter.inc(count);
}

@Override
public void recordLoadSuccess(@NonNegative long loadTime) {
loadSuccessCounter.inc();
totalLoadNanos.add(loadTime);
}

@Override
public void recordLoadFailure(@NonNegative long loadTime) {
loadFailureCounter.inc();
totalLoadNanos.add(loadTime);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose we could make the meter into a timer, which also provides a “count” value in addition to percentiles, more timeseries though

}

@Override
public void recordEviction(@NonNegative int weight, RemovalCause cause) {
Counter counter = evictionCounters.get(cause);
if (counter != null) {
counter.inc(weight);
}
evictionsTotalCounter.inc(weight);
}

@Override
public com.github.benmanes.caffeine.cache.stats.CacheStats snapshot() {
return com.github.benmanes.caffeine.cache.stats.CacheStats.of(
hitCounter.getCount(),
missCounter.getCount(),
loadSuccessCounter.getCount(),
loadFailureCounter.getCount(),
totalLoadNanos.sum(),
evictionsTotalCounter.getCount(),
evictionCounters.values().stream().mapToLong(Counter::getCount).sum());
}

@Override
public String toString() {
return name + ": " + snapshot();
}
}
38 changes: 38 additions & 0 deletions tritium-caffeine/src/main/metrics/cache-metrics.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
options:
javaPackage: com.palantir.tritium.metrics.caffeine
javaVisibility: packagePrivate
namespaces:
cache:
docs: Cache statistic metrics
metrics:
hit.count:
type: counter
tags: [cache]
docs: Count of cache hits
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I missed this in the previous round of review, I think all the counters should be updated to type: meter. Counter is interpreted as a gauge in most backends because it doesn't guarantee monotonic increasing values, where our Meter, confusingly, uses the Prometheus Counter type for slightly easier indexing.

We may want to remove the .count name suffixes as well (otherwise we'll end up with queries for cache_hit_count_count), since those are described by both the counter and meter metric types.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, I'm trying to keep metric name compatibility with the existing metrics produced by com.palantir.tritium.metrics.caffeine.CaffeineCacheStats#registerCache(com.palantir.tritium.metrics.registry.TaggedMetricRegistry, com.github.benmanes.caffeine.cache.Cache<?,?>, java.lang.String) for now as I'm not sure how complex the migration will be across the fleet.

miss.count:
type: counter
tags: [cache]
docs: Count of cache misses
load.success.count:
type: counter
tags: [cache]
docs: Count of successful cache loads
load.failure.count:
type: counter
tags: [cache]
docs: Count of failed cache loads
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thinking out loud here, entirely up to you either way:

We could declare a single metric along these lines, using a tag to distinguish success and failure:

load:
  type: meter
  tags:
    - name: cache
    - name: result
       values: [success, failure]

Sometimes this makes it easier to track total interactions

evictions:
type: counter
tags: [cache, cause]
docs: Count of evicted entries by cause
eviction.count:
type: counter
tags: [cache]
docs: Total count of evicted entries
stats.disabled:
type: counter
tags: [cache]
docs: |
Registered cache does not have stats recording enabled, stats will always be zero.
To enable cache metrics, stats recording must be enabled when constructing the cache:
Caffeine.newBuilder().recordStats()
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package com.palantir.tritium.metrics.caffeine;

import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.InstanceOfAssertFactories.collection;
import static org.assertj.core.api.InstanceOfAssertFactories.type;
import static org.awaitility.Awaitility.await;

Expand All @@ -27,13 +28,23 @@
import com.codahale.metrics.MetricRegistry;
import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Caffeine;
import com.github.benmanes.caffeine.cache.LoadingCache;
import com.google.common.collect.ImmutableSetMultimap;
import com.google.common.collect.Multimaps;
import com.palantir.tritium.metrics.registry.DefaultTaggedMetricRegistry;
import com.palantir.tritium.metrics.registry.MetricName;
import com.palantir.tritium.metrics.registry.TaggedMetricRegistry;
import java.time.Duration;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function;
import org.assertj.core.api.AbstractLongAssert;
import org.assertj.core.api.AbstractObjectAssert;
import org.assertj.core.api.InstanceOfAssertFactories;
import org.assertj.core.api.ObjectAssert;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;

Expand Down Expand Up @@ -202,21 +213,145 @@ void registerCacheWithoutRecordingStatsTagged() {
.isEqualTo(1L);
}

static AbstractObjectAssert<?, ?> assertGauge(TaggedMetricRegistry taggedMetricRegistry, String name) {
Gauge<?> metric = getMetric(taggedMetricRegistry, Gauge.class, name);
@Test
void registerTaggedMetrics() {
Cache<Integer, String> cache = Caffeine.newBuilder()
.recordStats(CacheStats.of(taggedMetricRegistry, "test"))
.maximumSize(2)
.build();
assertThat(taggedMetricRegistry.getMetrics().keySet())
.extracting(MetricName::safeName)
.containsExactlyInAnyOrder(
"cache.hit.count",
"cache.miss.count",
"cache.eviction.count",
"cache.evictions", // RemovalCause.EXPLICIT
"cache.evictions", // RemovalCause.REPLACED
"cache.evictions", // RemovalCause.COLLECTED
"cache.evictions", // RemovalCause.EXPIRED
"cache.evictions", // RemovalCause.SIZE
"cache.load.success.count",
"cache.load.failure.count");

CacheMetrics cacheMetrics = CacheMetrics.of(taggedMetricRegistry);
assertThat(cacheMetrics.hitCount("test").getCount()).isZero();
assertThat(cacheMetrics.evictions().cache("test").cause("SIZE").build().getCount())
.asInstanceOf(InstanceOfAssertFactories.LONG)
.isZero();
assertCounter(taggedMetricRegistry, "cache.hit.count").isZero();
assertCounter(taggedMetricRegistry, "cache.miss.count").isZero();

assertThat(cache.get(0, mapping)).isEqualTo("0");
assertThat(cache.get(1, mapping)).isEqualTo("1");
assertThat(cache.get(2, mapping)).isEqualTo("2");
assertThat(cache.get(1, mapping)).isEqualTo("1");

assertCounter(taggedMetricRegistry, "cache.hit.count").isOne();
assertCounter(taggedMetricRegistry, "cache.miss.count").isEqualTo(3);
assertThat(cacheMetrics.hitCount("test").getCount()).isOne();
assertThat(cacheMetrics.missCount("test").getCount()).isEqualTo(3);
assertThat(cacheMetrics.evictions().cache("test").cause("SIZE").build().getCount())
.asInstanceOf(InstanceOfAssertFactories.LONG)
.isOne();

assertThat(taggedMetricRegistry.getMetrics())
.extractingByKey(MetricName.builder()
.safeName("cache.stats.disabled")
.putSafeTags("cache", "test")
.build())
.isNull();
}

@Test
void registerLoadingTaggedMetrics() {
LoadingCache<Integer, String> cache = Caffeine.newBuilder()
.recordStats(CacheStats.of(taggedMetricRegistry, "test"))
.maximumSize(2)
.build(mapping::apply);
assertThat(taggedMetricRegistry.getMetrics().keySet())
.extracting(MetricName::safeName)
.containsExactlyInAnyOrder(
"cache.hit.count",
"cache.miss.count",
"cache.eviction.count",
"cache.evictions", // RemovalCause.EXPLICIT
"cache.evictions", // RemovalCause.REPLACED
"cache.evictions", // RemovalCause.COLLECTED
"cache.evictions", // RemovalCause.EXPIRED
"cache.evictions", // RemovalCause.SIZE
"cache.load.success.count",
"cache.load.failure.count");

assertCounter(taggedMetricRegistry, "cache.hit.count").isZero();
assertCounter(taggedMetricRegistry, "cache.miss.count").isZero();

assertThat(cache.get(0)).isEqualTo("0");
assertThat(cache.get(1)).isEqualTo("1");
assertThat(cache.get(2)).isEqualTo("2");
assertThat(cache.get(1)).isEqualTo("1");

assertCounter(taggedMetricRegistry, "cache.hit.count").isOne();
assertCounter(taggedMetricRegistry, "cache.miss.count").isEqualTo(3);
CacheMetrics cacheMetrics = CacheMetrics.of(taggedMetricRegistry);
assertThat(cacheMetrics.hitCount("test").getCount()).isOne();
assertThat(cacheMetrics.missCount("test").getCount()).isEqualTo(3);
assertThat(cacheMetrics.evictions().cache("test").cause("SIZE").build().getCount())
.asInstanceOf(InstanceOfAssertFactories.LONG)
.isOne();

assertThat(taggedMetricRegistry.getMetrics())
.extractingByKey(MetricName.builder()
.safeName("cache.stats.disabled")
.putSafeTags("cache", "test")
.build())
.isNull();
}

static AbstractObjectAssert<?, Object> assertGauge(TaggedMetricRegistry taggedMetricRegistry, String name) {
return assertMetric(taggedMetricRegistry, Gauge.class, name).extracting(Gauge::getValue);
}

static AbstractLongAssert<?> assertCounter(TaggedMetricRegistry taggedMetricRegistry, String name) {
return assertMetric(taggedMetricRegistry, Counter.class, name)
.extracting(Counter::getCount)
.asInstanceOf(InstanceOfAssertFactories.LONG);
}

static <T extends Metric> ObjectAssert<T> assertMetric(
TaggedMetricRegistry taggedMetricRegistry, Class<T> clazz, String name) {
T metric = getMetric(taggedMetricRegistry, clazz, name);
return assertThat(metric)
.as("metric '%s': '%s'", name, metric)
.isNotNull()
.extracting(Gauge::getValue);
.asInstanceOf(type(clazz));
}

private static <T extends Metric> T getMetric(TaggedMetricRegistry metrics, Class<T> clazz, String name) {
return clazz.cast(metrics.getMetrics().entrySet().stream()
Optional<Entry<MetricName, Metric>> metric = metrics.getMetrics().entrySet().stream()
.filter(e -> name.equals(e.getKey().safeName()))
.filter(e -> clazz.isInstance(e.getValue()))
.findFirst()
.orElseThrow(() -> new IllegalArgumentException(
"No such metric '" + name + "' of type " + clazz.getCanonicalName()))
.getValue());
.findFirst();
if (metric.isEmpty()) {
Map<String, Set<String>> metricNameToType = Multimaps.asMap(metrics.getMetrics().entrySet().stream()
.filter(e -> Objects.nonNull(e.getKey()))
.filter(e -> Objects.nonNull(e.getValue()))
.collect(ImmutableSetMultimap.toImmutableSetMultimap(
e -> e.getKey().safeName(), e -> Optional.ofNullable(e.getValue())
.map(x -> x.getClass().getCanonicalName())
.orElse(""))));

assertThat(metricNameToType)
.containsKey(name)
.extractingByKey(name)
.asInstanceOf(collection(String.class))
.contains(clazz.getCanonicalName());

assertThat(metric)
.as(
"Metric named '%s' of type '%s' should exist but was not found in [%s]",
name, clazz.getCanonicalName(), metricNameToType.keySet())
.isPresent();
}
return clazz.cast(metric.orElseThrow().getValue());
}
}