feat(bedrock): support metrics for bedrock (#1957)

Co-authored-by: Nir Gazit <nirga@users.noreply.github.com>
traceloop · Oct 2, 2024 · a30bb8c · a30bb8c
1 parent 346d752
commit a30bb8c
Show file tree

Hide file tree

Showing 18 changed files with 392 additions and 77 deletions.
diff --git a/...s/opentelemetry-instrumentation-bedrock/opentelemetry/instrumentation/bedrock/__init__.py b/...s/opentelemetry-instrumentation-bedrock/opentelemetry/instrumentation/bedrock/__init__.py
diff --git a/packages/opentelemetry-instrumentation-bedrock/tests/conftest.py b/packages/opentelemetry-instrumentation-bedrock/tests/conftest.py
@@ -3,39 +3,20 @@
 import os
 import pytest
 import boto3
+
 from opentelemetry import trace
-from opentelemetry.instrumentation.bedrock import BedrockInstrumentor
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import SimpleSpanProcessor
 from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
 
-pytest_plugins = []
-
-
-@pytest.fixture(scope="session")
-def exporter():
-    exporter = InMemorySpanExporter()
-    processor = SimpleSpanProcessor(exporter)
-
-    provider = TracerProvider()
-    provider.add_span_processor(processor)
-    trace.set_tracer_provider(provider)
-
-    return exporter
-
-
-@pytest.fixture(scope="session", autouse=True)
-def instrument(exporter):
-    BedrockInstrumentor(enrich_token_usage=True).instrument()
-
-    yield
-
-    exporter.shutdown()
+from opentelemetry import metrics
+from opentelemetry.sdk.resources import Resource
+from opentelemetry.sdk.metrics import MeterProvider
+from opentelemetry.sdk.metrics.export import InMemoryMetricReader
 
+from opentelemetry.instrumentation.bedrock import BedrockInstrumentor
 
-@pytest.fixture(autouse=True)
-def clear_exporter(exporter):
-    exporter.clear()
+pytest_plugins = []
 
 
 @pytest.fixture(autouse=True)
@@ -55,6 +36,40 @@ def brt():
     )
 
 
+@pytest.fixture(scope="session")
+def test_context():
+    """Install in-memory metrics and tracing providers for the test session.
+
+    Returns:
+        A ``(span_exporter, meter_provider, metric_reader)`` tuple that tests
+        unpack to inspect finished spans or collected metrics.
+    """
+    # Metrics side: a meter provider wired to an in-memory reader.
+    default_resource = Resource.create()
+    metric_reader = InMemoryMetricReader()
+    meter_provider = MeterProvider(
+        metric_readers=[metric_reader],
+        resource=default_resource,
+    )
+    metrics.set_meter_provider(meter_provider)
+
+    # Tracing side: a tracer provider feeding an in-memory span exporter.
+    span_exporter = InMemorySpanExporter()
+    tracer_provider = TracerProvider()
+    tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter))
+    trace.set_tracer_provider(tracer_provider)
+
+    return span_exporter, meter_provider, metric_reader
+
+
+@pytest.fixture(scope="session", autouse=True)
+def instrument(test_context):
+    """Instrument Bedrock for the whole session and shut telemetry down at teardown.
+
+    Depends on ``test_context`` so the in-memory providers are registered
+    before the instrumentor is enabled.
+    """
+    span_exporter, meter_provider, metric_reader = test_context
+    BedrockInstrumentor(enrich_token_usage=True).instrument()
+
+    yield
+
+    # Teardown order: span exporter, then metric reader, then meter provider.
+    for component in (span_exporter, metric_reader, meter_provider):
+        component.shutdown()
+
+
+@pytest.fixture(autouse=True)
+def clear_test_context(test_context):
+    """Empty the shared in-memory span exporter as part of each test's setup.
+
+    Only the span exporter is cleared; the metric reader and meter provider
+    in ``test_context`` are left untouched.
+    """
+    span_exporter = test_context[0]
+    span_exporter.clear()
+
+
 @pytest.fixture(scope="module")
 def vcr_config():
     return {"filter_headers": ["authorization"]}
diff --git a/packages/opentelemetry-instrumentation-bedrock/tests/metrics/__init__.py b/packages/opentelemetry-instrumentation-bedrock/tests/metrics/__init__.py
@@ -0,0 +1 @@
+"""unit tests."""
diff --git a/...ation-bedrock/tests/metrics/cassettes/test_bedrock_metrics/test_invoke_model_metrics.yaml b/...ation-bedrock/tests/metrics/cassettes/test_bedrock_metrics/test_invoke_model_metrics.yaml
@@ -0,0 +1,54 @@
+interactions:
+- request:
+    body: '{"inputText": "Tell me a joke about opentelemetry", "textGenerationConfig":
+      {"maxTokenCount": 200, "temperature": 0.5, "topP": 0.5}}'
+    headers:
+      Accept:
+      - !!binary |
+        YXBwbGljYXRpb24vanNvbg==
+      Content-Length:
+      - '132'
+      Content-Type:
+      - !!binary |
+        YXBwbGljYXRpb24vanNvbg==
+      User-Agent:
+      - !!binary |
+        Qm90bzMvMS4zNC4xNjIgbWQvQm90b2NvcmUjMS4zNC4xNjIgdWEvMi4wIG9zL21hY29zIzIzLjYu
+        MCBtZC9hcmNoI2FybTY0IGxhbmcvcHl0aG9uIzMuMTEuNSBtZC9weWltcGwjQ1B5dGhvbiBjZmcv
+        cmV0cnktbW9kZSNsZWdhY3kgQm90b2NvcmUvMS4zNC4xNjI=
+      X-Amz-Date:
+      - !!binary |
+        MjAyNDA5MTlUMjE0NjE5Wg==
+      amz-sdk-invocation-id:
+      - !!binary |
+        MGVmMmNlZWUtNzA1OS00M2Y2LTk4OTUtZWUzMDdjNDFmNWI2
+      amz-sdk-request:
+      - !!binary |
+        YXR0ZW1wdD0x
+    method: POST
+    uri: https://bedrock-runtime.us-east-1.amazonaws.com/model/amazon.titan-text-express-v1/invoke
+  response:
+    body:
+      string: '{"inputTextTokenCount":9,"results":[{"tokenCount":17,"outputText":"\nWhat
+        do you call a bear with no teeth?\nA gummy bear.","completionReason":"FINISH"}]}'
+    headers:
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '154'
+      Content-Type:
+      - application/json
+      Date:
+      - Thu, 19 Sep 2024 21:46:20 GMT
+      X-Amzn-Bedrock-Input-Token-Count:
+      - '9'
+      X-Amzn-Bedrock-Invocation-Latency:
+      - '1155'
+      X-Amzn-Bedrock-Output-Token-Count:
+      - '17'
+      x-amzn-RequestId:
+      - 58c863f2-7a84-4bf8-8b93-1d51ca8aa150
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/packages/opentelemetry-instrumentation-bedrock/tests/metrics/test_bedrock_metrics.py b/packages/opentelemetry-instrumentation-bedrock/tests/metrics/test_bedrock_metrics.py
@@ -0,0 +1,68 @@
+import json
+
+import pytest
+from opentelemetry.semconv_ai import Meters, SpanAttributes
+
+
+@pytest.mark.vcr
+def test_invoke_model_metrics(test_context, brt):
+    """Invoke a Titan text model and verify the recorded metrics.
+
+    Checks that both the token-usage and the operation-duration metrics are
+    present in the in-memory reader, that their data points carry positive
+    values, and that every metric is attributed to the "bedrock" system.
+    """
+    if brt is None:
+        # Report the test as skipped rather than letting it pass silently.
+        pytest.skip("test_invoke_model_metrics test skipped.")
+
+    _, _, reader = test_context
+
+    body = json.dumps(
+        {
+            "inputText": "Tell me a joke about opentelemetry",
+            "textGenerationConfig": {
+                "maxTokenCount": 200,
+                "temperature": 0.5,
+                "topP": 0.5,
+            },
+        }
+    )
+
+    brt.invoke_model(
+        body=body,
+        modelId='amazon.titan-text-express-v1',
+        accept='application/json',
+        contentType='application/json'
+    )
+
+    metrics_data = reader.get_metrics_data()
+    resource_metrics = metrics_data.resource_metrics
+    assert len(resource_metrics) > 0
+
+    found_token_metric = False
+    found_duration_metric = False
+
+    for rm in resource_metrics:
+        for sm in rm.scope_metrics:
+            for metric in sm.metrics:
+                data_points = metric.data.data_points
+
+                if metric.name == Meters.LLM_TOKEN_USAGE:
+                    found_token_metric = True
+                    # Every token data point must be typed and non-empty.
+                    for data_point in data_points:
+                        assert data_point.attributes[SpanAttributes.LLM_TOKEN_TYPE] in [
+                            "output",
+                            "input",
+                        ]
+                        assert data_point.sum > 0
+
+                if metric.name == Meters.LLM_OPERATION_DURATION:
+                    found_duration_metric = True
+                    # At least one duration data point must have been recorded.
+                    assert any(data_point.count > 0 for data_point in data_points)
+                    assert any(data_point.sum > 0 for data_point in data_points)
+
+                # Every collected metric is expected to be tagged as bedrock.
+                assert (
+                    data_points[0].attributes[SpanAttributes.LLM_SYSTEM]
+                    == "bedrock"
+                )
+
+    assert found_token_metric
+    assert found_duration_metric
diff --git a/packages/opentelemetry-instrumentation-bedrock/tests/traces/__init__.py b/packages/opentelemetry-instrumentation-bedrock/tests/traces/__init__.py
@@ -0,0 +1 @@
+"""unit tests."""
diff --git a/...st_ai21_j2_completion_string_content.yaml → ...st_ai21_j2_completion_string_content.yaml b/...st_ai21_j2_completion_string_content.yaml → ...st_ai21_j2_completion_string_content.yaml
diff --git a/...nthropic/test_anthropic_2_completion.yaml → ...nthropic/test_anthropic_2_completion.yaml b/...nthropic/test_anthropic_2_completion.yaml → ...nthropic/test_anthropic_2_completion.yaml
diff --git a/...thropic_3_completion_complex_content.yaml → ...thropic_3_completion_complex_content.yaml b/...thropic_3_completion_complex_content.yaml → ...thropic_3_completion_complex_content.yaml
diff --git a/...est_anthropic_3_completion_streaming.yaml → ...est_anthropic_3_completion_streaming.yaml b/...est_anthropic_3_completion_streaming.yaml → ...est_anthropic_3_completion_streaming.yaml
diff --git a/...nthropic_3_completion_string_content.yaml → ...nthropic_3_completion_string_content.yaml b/...nthropic_3_completion_string_content.yaml → ...nthropic_3_completion_string_content.yaml
diff --git a/...eta_llama2_completion_string_content.yaml → ...eta_llama2_completion_string_content.yaml b/...eta_llama2_completion_string_content.yaml → ...eta_llama2_completion_string_content.yaml
diff --git a/...est_meta/test_meta_llama3_completion.yaml → ...est_meta/test_meta_llama3_completion.yaml b/...est_meta/test_meta_llama3_completion.yaml → ...est_meta/test_meta_llama3_completion.yaml
diff --git a/...tes/test_titan/test_titan_completion.yaml → ...tes/test_titan/test_titan_completion.yaml b/...tes/test_titan/test_titan_completion.yaml → ...tes/test_titan/test_titan_completion.yaml
diff --git a/...nstrumentation-bedrock/tests/test_ai21.py → ...ntation-bedrock/tests/traces/test_ai21.py b/...nstrumentation-bedrock/tests/test_ai21.py → ...ntation-bedrock/tests/traces/test_ai21.py
@@ -4,8 +4,8 @@
 import json
 
 
-@pytest.mark.vcr()
-def test_ai21_j2_completion_string_content(exporter, brt):
+@pytest.mark.vcr
+def test_ai21_j2_completion_string_content(test_context, brt):
     body = json.dumps(
         {
             "prompt": "Translate to spanish: 'Amazon Bedrock is the easiest way to build and"
@@ -26,6 +26,7 @@ def test_ai21_j2_completion_string_content(exporter, brt):
 
     response_body = json.loads(response.get("body").read())
 
+    exporter, _, _ = test_context
     spans = exporter.get_finished_spans()
     assert all(span.name == "bedrock.completion" for span in spans)
 

diff --git a/...mentation-bedrock/tests/test_anthropic.py → ...on-bedrock/tests/traces/test_anthropic.py b/...mentation-bedrock/tests/test_anthropic.py → ...on-bedrock/tests/traces/test_anthropic.py
@@ -5,7 +5,7 @@
 
 
 @pytest.mark.vcr
-def test_anthropic_2_completion(exporter, brt):
+def test_anthropic_2_completion(test_context, brt):
     body = json.dumps(
         {
             "prompt": "Human: Tell me a joke about opentelemetry Assistant:",
@@ -24,6 +24,7 @@ def test_anthropic_2_completion(exporter, brt):
     response_body = json.loads(response.get("body").read())
     completion = response_body.get("completion")
 
+    exporter, _, _ = test_context
     spans = exporter.get_finished_spans()
     assert all(span.name == "bedrock.completion" for span in spans)
 
@@ -48,7 +49,7 @@ def test_anthropic_2_completion(exporter, brt):
 
 
 @pytest.mark.vcr
-def test_anthropic_3_completion_complex_content(exporter, brt):
+def test_anthropic_3_completion_complex_content(test_context, brt):
     body = json.dumps(
         {
             "messages": [
@@ -75,6 +76,7 @@ def test_anthropic_3_completion_complex_content(exporter, brt):
     response_body = json.loads(response.get("body").read())
     completion = response_body.get("content")
 
+    exporter, _, _ = test_context
     spans = exporter.get_finished_spans()
     assert all(span.name == "bedrock.completion" for span in spans)
 
@@ -103,7 +105,7 @@ def test_anthropic_3_completion_complex_content(exporter, brt):
 
 
 @pytest.mark.vcr
-def test_anthropic_3_completion_streaming(exporter, brt):
+def test_anthropic_3_completion_streaming(test_context, brt):
     body = json.dumps(
         {
             "messages": [
@@ -135,6 +137,7 @@ def test_anthropic_3_completion_streaming(exporter, brt):
             if "delta" in decoded_chunk:
                 completion += decoded_chunk.get("delta").get("text") or ""
 
+    exporter, _, _ = test_context
     spans = exporter.get_finished_spans()
     assert all(span.name == "bedrock.completion" for span in spans)
 
@@ -165,7 +168,7 @@ def test_anthropic_3_completion_streaming(exporter, brt):
 
 
 @pytest.mark.vcr
-def test_anthropic_3_completion_string_content(exporter, brt):
+def test_anthropic_3_completion_string_content(test_context, brt):
     body = json.dumps(
         {
             "messages": [
@@ -190,6 +193,7 @@ def test_anthropic_3_completion_string_content(exporter, brt):
     response_body = json.loads(response.get("body").read())
     completion = response_body.get("content")
 
+    exporter, _, _ = test_context
     spans = exporter.get_finished_spans()
     assert all(span.name == "bedrock.completion" for span in spans)
 

diff --git a/...nstrumentation-bedrock/tests/test_meta.py → ...ntation-bedrock/tests/traces/test_meta.py b/...nstrumentation-bedrock/tests/test_meta.py → ...ntation-bedrock/tests/traces/test_meta.py
@@ -4,8 +4,8 @@
 import json
 
 
-@pytest.mark.vcr()
-def test_meta_llama2_completion_string_content(exporter, brt):
+@pytest.mark.vcr
+def test_meta_llama2_completion_string_content(test_context, brt):
     model_id = "meta.llama2-13b-chat-v1"
     prompt = """<s>[INST] <<SYS>>
 You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your
@@ -26,6 +26,7 @@ def test_meta_llama2_completion_string_content(exporter, brt):
 
     response_body = json.loads(response.get("body").read())
 
+    exporter, _, _ = test_context
     spans = exporter.get_finished_spans()
     assert all(span.name == "bedrock.completion" for span in spans)
 
@@ -44,8 +45,8 @@ def test_meta_llama2_completion_string_content(exporter, brt):
     )
 
 
-@pytest.mark.vcr()
-def test_meta_llama3_completion(exporter, brt):
+@pytest.mark.vcr
+def test_meta_llama3_completion(test_context, brt):
     model_id = "meta.llama3-70b-instruct-v1:0"
     prompt = "Tell me a joke about opentelemetry"
     # Create request body.
@@ -57,6 +58,7 @@ def test_meta_llama3_completion(exporter, brt):
 
     response_body = json.loads(response.get("body").read())
 
+    exporter, _, _ = test_context
     spans = exporter.get_finished_spans()
     assert all(span.name == "bedrock.completion" for span in spans)
 

diff --git a/...strumentation-bedrock/tests/test_titan.py → ...tation-bedrock/tests/traces/test_titan.py b/...strumentation-bedrock/tests/test_titan.py → ...tation-bedrock/tests/traces/test_titan.py
@@ -4,8 +4,8 @@
 import json
 
 
-@pytest.mark.vcr()
-def test_titan_completion(exporter, brt):
+@pytest.mark.vcr
+def test_titan_completion(test_context, brt):
     body = json.dumps(
         {
             "inputText": "Translate to spanish: 'Amazon Bedrock is the easiest way to build and"
@@ -28,6 +28,7 @@ def test_titan_completion(exporter, brt):
 
     response_body = json.loads(response.get("body").read())
 
+    exporter, _, _ = test_context
     spans = exporter.get_finished_spans()
     assert all(span.name == "bedrock.completion" for span in spans)