Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open Telemetry : Adds implementation for network level Metrics #4872

Merged
Merged
Show file tree
Hide file tree
Changes from 78 commits
Commits
Show all changes
82 commits
Select commit Hold shift + click to select a range
3d1d6ba
Added request level metrics
sourabh1007 Sep 13, 2024
60c411b
add IsClientMetricsEnabled option
sourabh1007 Sep 13, 2024
55b6505
added contract file
sourabh1007 Sep 13, 2024
6fb216c
wip
sourabh1007 Sep 16, 2024
841c7bc
adding test
sourabh1007 Sep 18, 2024
ee32e43
added documentation
sourabh1007 Oct 14, 2024
1ddbcbe
emit metrics
sourabh1007 Oct 16, 2024
7e9a344
fixed dimensions
sourabh1007 Oct 17, 2024
bc6f273
nonworking changes
sourabh1007 Oct 17, 2024
278c5c5
final commit
sourabh1007 Oct 18, 2024
8bae2a9
remove unnecessary dependencies
sourabh1007 Oct 18, 2024
cebace3
contract update
sourabh1007 Oct 18, 2024
6c32e52
fix merges
sourabh1007 Oct 18, 2024
f6974ea
remove console
sourabh1007 Oct 18, 2024
161abe9
add noops if disables
sourabh1007 Oct 18, 2024
2569ed9
added null check
sourabh1007 Oct 18, 2024
a3ee34d
[INTERNAL] CI: Fixes emulator set-up to leverage central SDK teams sc…
kirankumarkolli Oct 18, 2024
de96acc
VectorIndexDefinition: Adds Support for Partitioned DiskANN (#4792)
kundadebdatta Oct 18, 2024
2af0b05
Azurecore: Fixes upgrading azure core dependency to latest (#4819)
kirankumarkolli Oct 18, 2024
8d80c1c
DeleteAllItemsByPartitionKeyStreamAsync: Adds DeleteAllItemsByPartiti…
kirankumarkolli Oct 18, 2024
316b3d8
Merge branch 'master' into users/sourabhjain/otelmetriccpu
sourabh1007 Oct 22, 2024
c6a33b0
rename file
sourabh1007 Oct 22, 2024
4fd1193
refactor code
sourabh1007 Oct 23, 2024
033fda4
refactor code
sourabh1007 Oct 23, 2024
e92477a
perf tests
sourabh1007 Oct 23, 2024
cf5bb03
updated contracts
sourabh1007 Oct 24, 2024
6338908
code refactor
sourabh1007 Oct 24, 2024
321520c
refactored code
sourabh1007 Oct 25, 2024
51d485c
Merge branch 'master' into users/sourabhjain/otelmetriccpu
sourabh1007 Oct 25, 2024
7d03b8f
Merge branch 'master' into users/sourabhjain/otelmetriccpu
sourabh1007 Oct 26, 2024
1614b75
Merge branch 'master' into users/sourabhjain/otelmetriccpu
sourabh1007 Nov 1, 2024
9081e1a
added region contacted as dimension
sourabh1007 Nov 1, 2024
66efcf9
Merge branch 'master' into users/sourabhjain/otelmetriccpu
sourabh1007 Nov 6, 2024
ba63724
Merge branch 'master' into users/sourabhjain/otelmetriccpu
sourabh1007 Nov 12, 2024
a451d49
Merge branch 'master' into users/sourabhjain/otelmetriccpu
sourabh1007 Nov 14, 2024
736c292
perf fix
sourabh1007 Nov 14, 2024
6d6957b
inc perf test
sourabh1007 Nov 14, 2024
36d0ee1
perf results
sourabh1007 Nov 14, 2024
2509f3a
Merge branch 'master' into users/sourabhjain/otelmetriccpu
sourabh1007 Nov 14, 2024
36079d4
refactor according to versioning
sourabh1007 Nov 18, 2024
b7ae941
Merge branch 'master' into users/sourabhjain/otelmetriccpu
sourabh1007 Nov 19, 2024
d01a3c4
Merge branch 'master' into users/sourabhjain/otelmetriccpu
sourabh1007 Nov 20, 2024
e0efe5b
fix test
sourabh1007 Nov 20, 2024
369160b
refactor code
sourabh1007 Nov 20, 2024
1f64899
added try catch
sourabh1007 Nov 20, 2024
79b7307
added console
sourabh1007 Nov 20, 2024
f184813
updated contract
sourabh1007 Nov 20, 2024
ef2f2a5
code refactoring
sourabh1007 Nov 20, 2024
731951a
fix tests
sourabh1007 Nov 20, 2024
a9eeba6
refactor code
sourabh1007 Nov 20, 2024
963824c
first draft
sourabh1007 Nov 8, 2024
35c3141
updated
sourabh1007 Nov 18, 2024
eb289e5
fix test
sourabh1007 Nov 20, 2024
1280e13
updated sln
sourabh1007 Nov 21, 2024
675c49f
refactor code
sourabh1007 Nov 21, 2024
b4e2045
test fix
sourabh1007 Nov 21, 2024
3f30e87
Merge branch 'master' into users/sourabhjain/otelnetworkmetrics
sourabh1007 Nov 24, 2024
e4a7459
cleanup
sourabh1007 Nov 24, 2024
164d615
compile fix
sourabh1007 Nov 25, 2024
b03d026
Merge branch 'master' into users/sourabhjain/otelnetworkmetrics
sourabh1007 Nov 25, 2024
e618478
fix direct contract changes
sourabh1007 Nov 25, 2024
f576bc4
exception handling
sourabh1007 Nov 25, 2024
b4a0113
fix tests
sourabh1007 Nov 26, 2024
ff10923
Merge branch 'master' into users/sourabhjain/otelnetworkmetrics
sourabh1007 Nov 26, 2024
8ed1645
refactor code
sourabh1007 Nov 26, 2024
852ebf9
updated contract
sourabh1007 Nov 26, 2024
b71a164
fix bug
sourabh1007 Nov 26, 2024
d10cab2
Merge branch 'master' into users/sourabhjain/otelnetworkmetrics
sourabh1007 Nov 27, 2024
2c9dfcf
Merge branch 'master' into users/sourabhjain/otelnetworkmetrics
sourabh1007 Nov 27, 2024
f3679bc
fix test
sourabh1007 Nov 27, 2024
5e00973
add logs
sourabh1007 Nov 27, 2024
3b7fcb7
fix bug
sourabh1007 Nov 28, 2024
100273b
Merge branch 'master' into users/sourabhjain/otelnetworkmetrics
sourabh1007 Nov 28, 2024
22be226
update pipeline
sourabh1007 Nov 28, 2024
cf4951b
exception handling
sourabh1007 Nov 29, 2024
dc3f71a
fix tests
sourabh1007 Dec 2, 2024
f05a443
fix test
sourabh1007 Dec 2, 2024
66a30fc
Merge branch 'master' into users/sourabhjain/otelnetworkmetrics
sourabh1007 Dec 3, 2024
e5c8e0d
review comments
sourabh1007 Dec 9, 2024
65609f4
Merge branch 'master' into users/sourabhjain/otelnetworkmetrics
sourabh1007 Dec 13, 2024
f91c17b
fix conflicts
sourabh1007 Dec 13, 2024
83c0a5d
update contract
sourabh1007 Dec 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<ClientOfficialVersion>3.46.0</ClientOfficialVersion>
<ClientPreviewVersion>3.47.0</ClientPreviewVersion>
<ClientPreviewSuffixVersion>preview.0</ClientPreviewSuffixVersion>
<DirectVersion>3.37.1</DirectVersion>
<DirectVersion>3.37.2</DirectVersion>
<FaultInjectionVersion>1.0.0</FaultInjectionVersion>
<FaultInjectionSuffixVersion>beta.0</FaultInjectionSuffixVersion>
<EncryptionOfficialVersion>2.0.4</EncryptionOfficialVersion>
Expand Down
3 changes: 2 additions & 1 deletion Microsoft.Azure.Cosmos/src/DocumentClient.cs
Original file line number Diff line number Diff line change
Expand Up @@ -955,7 +955,8 @@ internal virtual void Initialize(Uri serviceEndpoint,
if (this.cosmosClientTelemetryOptions.IsClientMetricsEnabled)
{
CosmosDbOperationMeter.Initialize();

CosmosDbNetworkMeter.Initialize();

CosmosDbOperationMeter.AddInstanceCount(this.ServiceEndpoint);
}

Expand Down
142 changes: 90 additions & 52 deletions Microsoft.Azure.Cosmos/src/Resource/ClientContextCore.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ namespace Microsoft.Azure.Cosmos
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using global::Azure;
using Microsoft.Azure.Cosmos.Handlers;
using Microsoft.Azure.Cosmos.Resource.CosmosExceptions;
using Microsoft.Azure.Cosmos.Routing;
Expand Down Expand Up @@ -498,10 +497,13 @@ private async Task<TResult> RunWithDiagnosticsHelperAsync<TResult>(
RequestOptions requestOptions,
ResourceType? resourceType = null)
{
bool isOtelCompatibleOperation = openTelemetry != null && this.ShouldRecordTelemetry();
Uri gatewayEndpoint = this.client.Endpoint;

Func<string> getOperationName = () =>
{
// If opentelemetry is not enabled then return null operation name, so that no activity is created.
if (openTelemetry == null)
if (!isOtelCompatibleOperation)
{
return null;
}
Expand All @@ -513,81 +515,117 @@ private async Task<TResult> RunWithDiagnosticsHelperAsync<TResult>(
return openTelemetry.Item1;
};

using (OpenTelemetryCoreRecorder recorder =
using (OpenTelemetryCoreRecorder recorder = isOtelCompatibleOperation ?
OpenTelemetryRecorderFactory.CreateRecorder(
getOperationName: getOperationName,
containerName: containerName,
databaseName: databaseName,
operationType: operationType,
requestOptions: requestOptions,
trace: trace,
clientContext: this.isDisposed ? null : this))
clientContext: this.isDisposed ? null : this) : default)
using (new ActivityScope(Guid.NewGuid()))
{
try
{
TResult result = await task(trace).ConfigureAwait(false);
// Checks if OpenTelemetry is configured for this operation and either Trace or Metrics are enabled by customer
if (openTelemetry != null
&& (!this.ClientOptions.CosmosClientTelemetryOptions.DisableDistributedTracing || this.ClientOptions.CosmosClientTelemetryOptions.IsClientMetricsEnabled))
if (isOtelCompatibleOperation)
{
// Extracts and records telemetry data from the result of the operation.
OpenTelemetryAttributes response = openTelemetry?.Item2(result);

// Records the telemetry attributes for Distributed Tracing (if enabled)
recorder.Record(response);

// Records metrics such as request units, latency, and item count for the operation.
CosmosDbOperationMeter.RecordTelemetry(getOperationName: getOperationName,
accountName: this.client.Endpoint,
containerName: containerName,
databaseName: databaseName,
attributes: response);
OpenTelemetryAttributes otelAttributes = openTelemetry?.Item2(result);
sourabh1007 marked this conversation as resolved.
Show resolved Hide resolved

// Records the telemetry attributes for Distributed Tracing (if enabled) and Metrics
recorder.Record(otelAttributes);
RecordMetrics(getOperationName,
this.client.Endpoint,
containerName,
databaseName,
attributes: otelAttributes);
}
return result;
}
catch (OperationCanceledException oe) when (!(oe is CosmosOperationCanceledException))
{
CosmosOperationCanceledException operationCancelledException = new CosmosOperationCanceledException(oe, trace);
recorder.MarkFailed(operationCancelledException);

throw operationCancelledException;
}
catch (ObjectDisposedException objectDisposed) when (!(objectDisposed is CosmosObjectDisposedException))
{
CosmosObjectDisposedException objectDisposedException = new CosmosObjectDisposedException(
objectDisposed,
this.client,
trace);
recorder.MarkFailed(objectDisposedException);

throw objectDisposedException;
return result;
}
catch (NullReferenceException nullRefException) when (!(nullRefException is CosmosNullReferenceException))
catch (Exception ex) when (TryTransformException(ex, trace, this.client, out Exception cosmosException))
{
CosmosNullReferenceException nullException = new CosmosNullReferenceException(
nullRefException,
trace);
recorder.MarkFailed(nullException);
if (isOtelCompatibleOperation)
{
recorder.MarkFailed(cosmosException);
RecordMetrics(getOperationName,
gatewayEndpoint,
containerName,
databaseName,
cosmosException: cosmosException);
}

throw nullException;
throw cosmosException; // Rethrow after recording telemetry
}
catch (Exception ex)
{
recorder.MarkFailed(ex);
if (openTelemetry != null && ex is CosmosException cosmosException)
{
// Records telemetry data related to the exception.
CosmosDbOperationMeter.RecordTelemetry(getOperationName: getOperationName,
accountName: this.client.Endpoint,
containerName: containerName,
databaseName: databaseName,
ex: cosmosException);
}

throw;
// Fallback handling for exceptions not covered by the 'when' filter
recorder.MarkFailed(ex); // Record the exception using the telemetry recorder

// Optionally rethrow or handle the exception gracefully
throw; // Re-throwing to ensure the caller is aware of the unhandled exception
}

}
}

/// <summary>
/// Reports whether at least one telemetry signal is switched on in the client's
/// <see cref="CosmosClientTelemetryOptions"/>.
/// </summary>
/// <returns>
/// <c>true</c> when distributed tracing has not been disabled, or when client metrics are enabled;
/// otherwise <c>false</c>.
/// </returns>
private bool ShouldRecordTelemetry()
{
    CosmosClientTelemetryOptions options = this.clientOptions.CosmosClientTelemetryOptions;

    // Tracing counts as "on" unless it was explicitly disabled.
    if (!options.DisableDistributedTracing)
    {
        return true;
    }

    // Tracing is off, so the answer depends solely on the metrics switch.
    return options.IsClientMetricsEnabled;
}

/// <summary>
/// Maps an exception thrown by an operation to the exception that should be reported for it.
/// This method only performs the mapping; it does not record any telemetry itself.
/// </summary>
/// <param name="ex">The exception that was thrown.</param>
/// <param name="trace">Trace passed to the wrapper exceptions that get created.</param>
/// <param name="cosmosClient">Client passed to <see cref="CosmosObjectDisposedException"/>.</param>
/// <param name="cosmosException">
/// On return: a new wrapper when <paramref name="ex"/> is a not-yet-wrapped
/// <see cref="OperationCanceledException"/>, <see cref="ObjectDisposedException"/> or
/// <see cref="NullReferenceException"/>; <paramref name="ex"/> itself when it is a
/// <see cref="CosmosException"/>; otherwise <c>null</c>.
/// </param>
/// <returns><c>true</c> when <paramref name="cosmosException"/> is non-null; otherwise <c>false</c>.</returns>
private static bool TryTransformException(
    Exception ex,
    ITrace trace,
    CosmosClient cosmosClient,
    out Exception cosmosException)
{
    cosmosException = ex switch
    {
        // Already-wrapped exceptions are excluded by the guards and fall through to the arms below.
        OperationCanceledException oe when oe is not CosmosOperationCanceledException =>
            new CosmosOperationCanceledException(oe, trace),
        ObjectDisposedException od when od is not CosmosObjectDisposedException =>
            new CosmosObjectDisposedException(od, cosmosClient, trace),
        NullReferenceException nr when nr is not CosmosNullReferenceException =>
            new CosmosNullReferenceException(nr, trace),

        // A CosmosException is handed back unchanged (same instance).
        CosmosException => ex,

        // Anything else is not handled here.
        _ => null
    };

    return cosmosException is not null;
}

/// <summary>
/// Hands the outcome of one operation to both <c>CosmosDbOperationMeter</c> and
/// <c>CosmosDbNetworkMeter</c>, passing the same arguments to each.
/// </summary>
/// <param name="getOperationName">Delegate that yields the operation name.</param>
/// <param name="accountName">Account endpoint the operation was issued against.</param>
/// <param name="containerName">Name of the container involved.</param>
/// <param name="databaseName">Name of the database involved.</param>
/// <param name="attributes">Telemetry attributes of the response; defaults to <c>null</c>.</param>
/// <param name="cosmosException">Exception to report; defaults to <c>null</c>.</param>
private static void RecordMetrics(
    Func<string> getOperationName,
    Uri accountName,
    string containerName,
    string databaseName,
    OpenTelemetryAttributes attributes = null,
    Exception cosmosException = null)
{
    // First the operation meter ...
    CosmosDbOperationMeter.RecordTelemetry(
        getOperationName: getOperationName,
        accountName: accountName,
        containerName: containerName,
        databaseName: databaseName,
        attributes: attributes,
        ex: cosmosException);

    // ... then the network meter, with identical inputs.
    CosmosDbNetworkMeter.RecordTelemetry(
        getOperationName: getOperationName,
        accountName: accountName,
        containerName: containerName,
        databaseName: databaseName,
        attributes: attributes,
        ex: cosmosException);
}

private async Task<ResponseMessage> ProcessResourceOperationAsBulkStreamAsync(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
//------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
//------------------------------------------------------------

namespace Microsoft.Azure.Cosmos.Telemetry.Models
{
/// <summary>
/// Read-only holder for the measurements of a single network request.
/// Every value is supplied through a constructor and exposed via a get-only property.
/// </summary>
internal class NetworkMetricData
{
    /// <summary>
    /// Creates an instance carrying only latency and body sizes.
    /// <see cref="BackendLatency"/> is left at 0 and the remaining latencies at <c>null</c>.
    /// </summary>
    public NetworkMetricData(
        double latency,
        long? requestBodySize,
        long? responseBodySize)
        : this(
            latency,
            requestBodySize,
            responseBodySize,
            backendLatency: default,
            channelAcquisitionLatency: null,
            transitTimeLatency: null,
            receivedLatency: null)
    {
    }

    /// <summary>
    /// Creates an instance with every measurement populated.
    /// </summary>
    public NetworkMetricData(
        double latency,
        long? requestBodySize,
        long? responseBodySize,
        double backendLatency,
        double? channelAcquisitionLatency,
        double? transitTimeLatency,
        double? receivedLatency)
    {
        this.Latency = latency;
        this.RequestBodySize = requestBodySize;
        this.ResponseBodySize = responseBodySize;
        this.BackendLatency = backendLatency;
        this.ChannelAcquisitionLatency = channelAcquisitionLatency;
        this.TransitTimeLatency = transitTimeLatency;
        this.ReceivedLatency = receivedLatency;
    }

    /// <summary>Latency of the request.</summary>
    public double Latency { get; }

    /// <summary>Size of the request body, when known.</summary>
    public long? RequestBodySize { get; }

    /// <summary>Size of the response body, when known.</summary>
    public long? ResponseBodySize { get; }

    /// <summary>Backend latency; 0 when the three-argument constructor was used.</summary>
    public double BackendLatency { get; }

    /// <summary>Channel acquisition latency, when known.</summary>
    public double? ChannelAcquisitionLatency { get; }

    /// <summary>Transit time latency, when known.</summary>
    public double? TransitTimeLatency { get; }

    /// <summary>Received latency, when known.</summary>
    public double? ReceivedLatency { get; }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
//------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
//------------------------------------------------------------

namespace Microsoft.Azure.Cosmos.Telemetry.Models
{
/// <summary>
/// Read-only holder for the measurements of a single operation.
/// </summary>
internal class OperationMetricData
{
    /// <summary>
    /// Creates an instance from the given item count and request charge.
    /// </summary>
    /// <param name="itemCount">Item count, carried as a string exactly as supplied.</param>
    /// <param name="requestCharge">Request charge of the operation, or <c>null</c> when not available.</param>
    public OperationMetricData(string itemCount, double? requestCharge)
    {
        this.ItemCount = itemCount;
        this.RequestCharge = requestCharge;
    }

    /// <summary>Item count, stored as the string passed to the constructor.</summary>
    public string ItemCount { get; }

    /// <summary>Request charge, or <c>null</c> when none was supplied.</summary>
    public double? RequestCharge { get; }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ namespace Microsoft.Azure.Cosmos.Telemetry
using System;
using System.Collections.Generic;
using global::Azure.Core;
using Microsoft.Azure.Cosmos.Tracing.TraceData;

internal sealed class AppInsightClassicAttributeKeys : IActivityAttributePopulator
{
Expand Down Expand Up @@ -142,7 +143,10 @@ public void PopulateAttributes(DiagnosticScope scope, Exception exception)
}
}

public void PopulateAttributes(DiagnosticScope scope, QueryTextMode? queryTextMode, string operationType, OpenTelemetryAttributes response)
public void PopulateAttributes(DiagnosticScope scope,
QueryTextMode? queryTextMode,
string operationType,
OpenTelemetryAttributes response)
{
scope.AddAttribute(AppInsightClassicAttributeKeys.OperationType, operationType);
if (response != null)
Expand All @@ -162,17 +166,50 @@ public void PopulateAttributes(DiagnosticScope scope, QueryTextMode? queryTextMo
}
}

public KeyValuePair<string, object>[] PopulateOperationMeterDimensions(string operationName, string containerName, string databaseName, Uri accountName, OpenTelemetryAttributes attributes, CosmosException ex)
public KeyValuePair<string, object>[] PopulateNetworkMeterDimensions(string operationName,
Uri accountName,
string containerName,
string databaseName,
OpenTelemetryAttributes attributes,
Exception ex,
ClientSideRequestStatisticsTraceDatum.StoreResponseStatistics tcpStats = null,
ClientSideRequestStatisticsTraceDatum.HttpResponseStatistics? httpStats = null)
{
return new KeyValuePair<string, object>[]
{
new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.ContainerName, containerName),
new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.DbName, databaseName),
new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.ServerAddress, accountName.Host),
new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.ServerAddress, accountName?.Host),
new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.DbOperation, operationName),
new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.StatusCode, (int)(attributes?.StatusCode ?? ex?.StatusCode)),
new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.SubStatusCode, attributes?.SubStatusCode ?? ex?.SubStatusCode),
new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.Region, string.Join(",", attributes.Diagnostics.GetContactedRegions()))
new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.StatusCode, CosmosDbMeterUtil.GetStatusCode(attributes, ex)),
new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.SubStatusCode, CosmosDbMeterUtil.GetSubStatusCode(attributes, ex))
};
}

/// <summary>
/// Builds the dimension set attached to operation-level metrics.
/// </summary>
/// <param name="operationName">Value for the <c>DbOperation</c> dimension.</param>
/// <param name="containerName">Value for the <c>ContainerName</c> dimension.</param>
/// <param name="databaseName">Value for the <c>DbName</c> dimension.</param>
/// <param name="accountName">Account endpoint; its host becomes the <c>ServerAddress</c> dimension (null when the endpoint is null).</param>
/// <param name="attributes">Response attributes handed to <c>CosmosDbMeterUtil</c> for status codes and regions.</param>
/// <param name="ex">Exception handed to <c>CosmosDbMeterUtil</c> alongside <paramref name="attributes"/>.</param>
/// <returns>Seven key/value pairs, in the order container, database, server address, operation, status code, sub status code, region.</returns>
public KeyValuePair<string, object>[] PopulateOperationMeterDimensions(
    string operationName,
    string containerName,
    string databaseName,
    Uri accountName,
    OpenTelemetryAttributes attributes,
    Exception ex)
{
    // Computed up front, in the same order the values appear in the resulting array.
    object serverAddress = accountName?.Host;
    object statusCode = CosmosDbMeterUtil.GetStatusCode(attributes, ex);
    object subStatusCode = CosmosDbMeterUtil.GetSubStatusCode(attributes, ex);
    object regions = CosmosDbMeterUtil.GetRegions(attributes?.Diagnostics);

    KeyValuePair<string, object>[] dimensions =
    {
        new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.ContainerName, containerName),
        new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.DbName, databaseName),
        new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.ServerAddress, serverAddress),
        new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.DbOperation, operationName),
        new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.StatusCode, statusCode),
        new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.SubStatusCode, subStatusCode),
        new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.Region, regions)
    };

    return dimensions;
}

/// <summary>
/// Builds the dimension set attached to the instance-count metric.
/// </summary>
/// <param name="accountEndpoint">Account endpoint whose host becomes the <c>ServerAddress</c> dimension.</param>
/// <returns>
/// A single-element array holding the <c>ServerAddress</c> dimension; its value is <c>null</c>
/// when <paramref name="accountEndpoint"/> is <c>null</c>.
/// </returns>
public KeyValuePair<string, object>[] PopulateInstanceCountDimensions(Uri accountEndpoint)
{
    return new[]
    {
        // Null-conditional access, matching the other dimension builders in this class,
        // so a missing endpoint yields a null dimension instead of a NullReferenceException.
        new KeyValuePair<string, object>(AppInsightClassicAttributeKeys.ServerAddress, accountEndpoint?.Host)
    };
}
}
Expand Down
Loading
Loading