Skip to content

Commit

Permalink
Dedupe auxiliary data strings in Azure Search Service (#634)
Browse files Browse the repository at this point in the history
  • Loading branch information
joelverhagen committed Aug 28, 2019
1 parent 035f754 commit b1f928e
Show file tree
Hide file tree
Showing 18 changed files with 396 additions and 69 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public class Auxiliary2AzureSearchCommand
private readonly IOptionsSnapshot<Auxiliary2AzureSearchConfiguration> _options;
private readonly IAzureSearchTelemetryService _telemetryService;
private readonly ILogger<Auxiliary2AzureSearchCommand> _logger;
private readonly StringCache _stringCache;

public Auxiliary2AzureSearchCommand(
IAuxiliaryFileClient auxiliaryFileClient,
Expand All @@ -63,6 +64,7 @@ public Auxiliary2AzureSearchCommand(
_options = options ?? throw new ArgumentNullException(nameof(options));
_telemetryService = telemetryService ?? throw new ArgumentNullException(nameof(telemetryService));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_stringCache = new StringCache();

if (_options.Value.MaxConcurrentBatches <= 0)
{
Expand Down Expand Up @@ -100,7 +102,9 @@ private async Task<bool> CopyVerifiedPackagesAsync()
{
// The "old" data in this case is the latest file that was copied to the region's storage container by this
// job (or initialized by Db2AzureSearch).
var oldResult = await _verifiedPackagesDataClient.ReadLatestAsync(AccessConditionWrapper.GenerateEmptyCondition());
var oldResult = await _verifiedPackagesDataClient.ReadLatestAsync(
AccessConditionWrapper.GenerateEmptyCondition(),
_stringCache);

// The "new" data in this case is from the auxiliary data container that is updated by the
// Search.GenerateAuxiliaryData job.
Expand All @@ -126,7 +130,9 @@ private async Task<bool> PushIndexChangesAsync()
// The "old" data in this case is the download count data that was last indexed by this job (or
// initialized by Db2AzureSearch).
_logger.LogInformation("Fetching old download count data from blob storage.");
var oldResult = await _downloadDataClient.ReadLatestIndexedAsync(AccessConditionWrapper.GenerateEmptyCondition());
var oldResult = await _downloadDataClient.ReadLatestIndexedAsync(
AccessConditionWrapper.GenerateEmptyCondition(),
_stringCache);

// The "new" data in this case is from the statistics pipeline.
_logger.LogInformation("Fetching new download count data from blob storage.");
Expand Down
17 changes: 1 addition & 16 deletions src/NuGet.Services.AzureSearch/AuxiliaryFiles/DownloadData.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,6 @@ namespace NuGet.Services.AzureSearch.AuxiliaryFiles
{
public class DownloadData : IReadOnlyDictionary<string, DownloadByVersionData>
{
/// <summary>
/// Maintain a lookup of version strings for de-duping. We maintain the original case for de-duping purposes
/// by using the default string comparer. As of July of 2019 in PROD, maintaining original case adds less than
/// 0.3% extra strings. De-duping version strings in general however removes 87.0% of the string allocations.
/// Intuitively this means most people use the same case of a given version string and a lot of people use
/// the same versions strings (common ones are 1.0.0, 1.0.1, 1.0.2, 1.1.0, etc).
/// </summary>
private readonly Dictionary<string, string> _uniqueVersions = new Dictionary<string, string>();

private readonly Dictionary<string, DownloadByVersionData> _ids
= new Dictionary<string, DownloadByVersionData>(StringComparer.OrdinalIgnoreCase);

Expand Down Expand Up @@ -59,13 +50,7 @@ public void SetDownloadCount(string id, string version, long downloads)
versions = new DownloadByVersionData();
}

if (!_uniqueVersions.TryGetValue(version, out var dedupedVersion))
{
_uniqueVersions.Add(version, version);
dedupedVersion = version;
}

versions.SetDownloadCount(dedupedVersion, downloads);
versions.SetDownloadCount(version, downloads);

// Only store the download count if the value is not zero.
if (versions.Total != 0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ public DownloadDataClient(

private ICloudBlobContainer Container => _lazyContainer.Value;

public async Task<AuxiliaryFileResult<DownloadData>> ReadLatestIndexedAsync(IAccessCondition accessCondition)
public async Task<AuxiliaryFileResult<DownloadData>> ReadLatestIndexedAsync(
IAccessCondition accessCondition,
StringCache stringCache)
{
var stopwatch = Stopwatch.StartNew();
var blobName = GetLatestIndexedBlobName();
Expand All @@ -56,7 +58,14 @@ public async Task<AuxiliaryFileResult<DownloadData>> ReadLatestIndexedAsync(IAcc
{
using (var stream = await blobReference.OpenReadAsync(accessCondition))
{
ReadStream(stream, downloads.SetDownloadCount);
ReadStream(
stream,
(id, version, downloadCount) =>
{
id = stringCache.Dedupe(id);
version = stringCache.Dedupe(version);
downloads.SetDownloadCount(id, version, downloadCount);
});
modified = true;
metadata = new AuxiliaryFileMetadata(
lastModified: new DateTimeOffset(blobReference.LastModifiedUtc, TimeSpan.Zero),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ namespace NuGet.Services.AzureSearch.AuxiliaryFiles
{
public interface IDownloadDataClient
{
Task<AuxiliaryFileResult<DownloadData>> ReadLatestIndexedAsync(IAccessCondition accessCondition);
Task<AuxiliaryFileResult<DownloadData>> ReadLatestIndexedAsync(IAccessCondition accessCondition, StringCache stringCache);
Task ReplaceLatestIndexedAsync(DownloadData newData, IAccessCondition accessCondition);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace NuGet.Services.AzureSearch.AuxiliaryFiles
{
public interface IVerifiedPackagesDataClient
{
Task<AuxiliaryFileResult<HashSet<string>>> ReadLatestAsync(IAccessCondition accessCondition);
Task<AuxiliaryFileResult<HashSet<string>>> ReadLatestAsync(IAccessCondition accessCondition, StringCache stringCache);
Task ReplaceLatestAsync(HashSet<string> newData, IAccessCondition accessCondition);
}
}
78 changes: 78 additions & 0 deletions src/NuGet.Services.AzureSearch/AuxiliaryFiles/StringCache.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System.Collections.Concurrent;
using System.Threading;

namespace NuGet.Services.AzureSearch.AuxiliaryFiles
{
public class StringCache
{
/// <summary>
/// Maintain a lookup of strings for de-duping. We maintain the original case for de-duping purposes by using
/// the default string comparer. As of July of 2019 in PROD, maintaining original case of version
/// string adds less than 0.3% extra strings. De-duping version strings in a case-sensitive manner removes
/// 87.0% of the string allocations. Intuitively this means most people use the same case of a given version
/// string and a lot of people use the same versions strings (common ones are 1.0.0, 1.0.1, 1.0.2, 1.1.0, etc).
/// </summary>
private readonly ConcurrentDictionary<string, string> _values = new ConcurrentDictionary<string, string>();

/// <summary>
/// Keep track of the number of requests for a string. This is the number of times <see cref="Dedupe(string)"/>
/// has been called.
/// </summary>
private int _requestCount = 0;

/// <summary>
/// Keep track of the number of string de-duped, i.e. "cache hits".
/// </summary>
private int _hitCount = 0;

/// <summary>
/// Keep track of the number of characters in the cache.
/// </summary>
private long _charCount = 0;

public int StringCount => _values.Count;
public int RequestCount => _requestCount;
public int HitCount => _hitCount;
public long CharCount => _charCount;

public string Dedupe(string value)
{
Interlocked.Increment(ref _requestCount);

if (value == null)
{
return null;
}

// Inspired by:
// https://devblogs.microsoft.com/pfxteam/building-a-custom-getoradd-method-for-concurrentdictionarytkeytvalue/
while (true)
{
if (_values.TryGetValue(value, out var existingValue))
{
Interlocked.Increment(ref _hitCount);
return existingValue;
}

if (_values.TryAdd(value, value))
{
Interlocked.Add(ref _charCount, value.Length);
return value;
}
}
}

/// <summary>
/// Resets <see cref="RequestCount"/> and <see cref="HitCount"/> back to zero.
/// </summary>
public void ResetCounts()
{
_requestCount = 0;
_hitCount = 0;
}
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ public VerifiedPackagesDataClient(

private ICloudBlobContainer Container => _lazyContainer.Value;

public async Task<AuxiliaryFileResult<HashSet<string>>> ReadLatestAsync(IAccessCondition accessCondition)
public async Task<AuxiliaryFileResult<HashSet<string>>> ReadLatestAsync(
IAccessCondition accessCondition,
StringCache stringCache)
{
var stopwatch = Stopwatch.StartNew();
var blobName = GetLatestIndexedBlobName();
Expand All @@ -57,7 +59,7 @@ public async Task<AuxiliaryFileResult<HashSet<string>>> ReadLatestAsync(IAccessC
{
using (var stream = await blobReference.OpenReadAsync(accessCondition))
{
ReadStream(stream, id => data.Add(id));
ReadStream(stream, id => data.Add(stringCache.Dedupe(id)));
modified = true;
metadata = new AuxiliaryFileMetadata(
lastModified: new DateTimeOffset(blobReference.LastModifiedUtc, TimeSpan.Zero),
Expand Down
14 changes: 14 additions & 0 deletions src/NuGet.Services.AzureSearch/AzureSearchTelemetryService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -389,5 +389,19 @@ public IDisposable TrackReplaceLatestVerifiedPackages(int packageIdCount)
{ "PackageIdCount", packageIdCount.ToString() },
});
}

public void TrackAuxiliaryFilesStringCache(int stringCount, long charCount, int requestCount, int hitCount)
{
_telemetryClient.TrackMetric(
Prefix + "AuxiliaryFilesStringCache",
1,
new Dictionary<string, string>
{
{ "StringCount", stringCount.ToString() },
{ "CharCount", charCount.ToString() },
{ "RequestCount", requestCount.ToString() },
{ "HitCount", hitCount.ToString() },
});
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -51,5 +51,6 @@ void TrackDownloadCountDecrease(
void TrackV2GetDocumentWithHijackIndex(TimeSpan elapsed);
void TrackReadLatestVerifiedPackages(int? packageIdCount, bool notModified, TimeSpan elapsed);
IDisposable TrackReplaceLatestVerifiedPackages(int packageIdCount);
void TrackAuxiliaryFilesStringCache(int stringCount, long charCount, int requestCount, int hitCount);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
<Compile Include="AuxiliaryFiles\IAuxiliaryDataStorageConfiguration.cs" />
<Compile Include="AuxiliaryFiles\IDownloadDataClient.cs" />
<Compile Include="AuxiliaryFiles\IVerifiedPackagesDataClient.cs" />
<Compile Include="AuxiliaryFiles\StringCache.cs" />
<Compile Include="AuxiliaryFiles\VerifiedPackagesDataClient.cs" />
<Compile Include="AzureSearchScoringConfiguration.cs" />
<Compile Include="BaseDocumentBuilder.cs" />
Expand Down
14 changes: 12 additions & 2 deletions src/NuGet.Services.AzureSearch/SearchService/AuxiliaryDataCache.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ public class AuxiliaryDataCache : IAuxiliaryDataCache
private readonly IVerifiedPackagesDataClient _verifiedPackagesDataClient;
private readonly IAzureSearchTelemetryService _telemetryService;
private readonly ILogger<AuxiliaryDataCache> _logger;
private readonly StringCache _stringCache;
private AuxiliaryData _data;

public AuxiliaryDataCache(
Expand All @@ -31,6 +32,7 @@ public AuxiliaryDataCache(
_verifiedPackagesDataClient = verifiedPackagesDataClient ?? throw new ArgumentNullException(nameof(verifiedPackagesDataClient));
_telemetryService = telemetryService ?? throw new ArgumentNullException(nameof(telemetryService));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_stringCache = new StringCache();
}

public bool Initialized => _data != null;
Expand Down Expand Up @@ -90,6 +92,14 @@ private async Task LoadAsync(TimeSpan timeout, bool shouldReload, CancellationTo
downloads,
verifiedPackages);

// Track the counts regarding the string cache status.
_telemetryService.TrackAuxiliaryFilesStringCache(
_stringCache.StringCount,
_stringCache.CharCount,
_stringCache.RequestCount,
_stringCache.HitCount);
_stringCache.ResetCounts();

stopwatch.Stop();
_telemetryService.TrackAuxiliaryFilesReload(reloadedNames, notModifiedNames, stopwatch.Elapsed);
_logger.LogInformation(
Expand All @@ -110,7 +120,7 @@ private async Task LoadAsync(TimeSpan timeout, bool shouldReload, CancellationTo

private async Task<AuxiliaryFileResult<T>> LoadAsync<T>(
AuxiliaryFileResult<T> previousResult,
Func<IAccessCondition, Task<AuxiliaryFileResult<T>>> getResult) where T : class
Func<IAccessCondition, StringCache, Task<AuxiliaryFileResult<T>>> getResult) where T : class
{
await Task.Yield();

Expand All @@ -124,7 +134,7 @@ private async Task<AuxiliaryFileResult<T>> LoadAsync<T>(
accessCondition = AccessConditionWrapper.GenerateIfNoneMatchCondition(previousResult.Metadata.ETag);
}

var newResult = await getResult(accessCondition);
var newResult = await getResult(accessCondition, _stringCache);
if (newResult.Modified)
{
return newResult;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ public async Task FailureIsRecordedInTelemetry()
{
var expected = new InvalidOperationException("Something bad!");
DownloadDataClient
.Setup(x => x.ReadLatestIndexedAsync(It.IsAny<IAccessCondition>()))
.Setup(x => x.ReadLatestIndexedAsync(It.IsAny<IAccessCondition>(), It.IsAny<StringCache>()))
.ThrowsAsync(expected);

var actual = await Assert.ThrowsAsync<InvalidOperationException>(() => Target.ExecuteAsync());
Expand Down Expand Up @@ -227,15 +227,15 @@ public Facts(ITestOutputHelper output)
OldDownloadData = new DownloadData();
OldDownloadResult = Data.GetAuxiliaryFileResult(OldDownloadData, "download-data-etag");
DownloadDataClient
.Setup(x => x.ReadLatestIndexedAsync(It.IsAny<IAccessCondition>()))
.Setup(x => x.ReadLatestIndexedAsync(It.IsAny<IAccessCondition>(), It.IsAny<StringCache>()))
.ReturnsAsync(() => OldDownloadResult);
NewDownloadData = new DownloadData();
AuxiliaryFileClient.Setup(x => x.LoadDownloadDataAsync()).ReturnsAsync(() => NewDownloadData);

OldVerifiedPackagesData = new HashSet<string>();
OldVerifiedPackagesResult = Data.GetAuxiliaryFileResult(OldVerifiedPackagesData, "verified-packages-etag");
VerifiedPackagesDataClient
.Setup(x => x.ReadLatestAsync(It.IsAny<IAccessCondition>()))
.Setup(x => x.ReadLatestAsync(It.IsAny<IAccessCondition>(), It.IsAny<StringCache>()))
.ReturnsAsync(() => OldVerifiedPackagesResult);
NewVerifiedPackagesData = new HashSet<string>();
AuxiliaryFileClient.Setup(x => x.LoadVerifiedPackagesAsync()).ReturnsAsync(() => NewVerifiedPackagesData);
Expand Down
Loading

0 comments on commit b1f928e

Please sign in to comment.