Skip to content

Commit

Permalink
Introduce versioning for DiagnosticEvent (#10011) (#10024) (#10027)
Browse files Browse the repository at this point in the history
Co-authored-by: Lilian Kasem <likasem@microsoft.com>
  • Loading branch information
v-imohammad and liliankasem authored Apr 23, 2024
1 parent 2a9dfe2 commit e07dacb
Show file tree
Hide file tree
Showing 3 changed files with 201 additions and 25 deletions.
5 changes: 5 additions & 0 deletions src/WebJobs.Script.WebHost/Diagnostics/DiagnosticEvent.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,20 @@ namespace Microsoft.Azure.WebJobs.Script.WebHost.Diagnostics
{
public class DiagnosticEvent : TableEntity
{
internal const string CurrentEventVersion = "2024-05-01";

public DiagnosticEvent() { }

/// <summary>
/// Creates a diagnostic event keyed by host and timestamp, stamped with
/// <see cref="CurrentEventVersion"/>.
/// </summary>
/// <param name="hostId">Host identifier used as the prefix of the partition key.</param>
/// <param name="timestamp">Event time; drives the row key, the partition key date suffix and <c>Timestamp</c>.</param>
public DiagnosticEvent(string hostId, DateTime timestamp)
{
    // Storage keys are machine-readable identifiers, so the date suffix is formatted with the
    // invariant culture. Culture-sensitive interpolation would produce a different key on hosts
    // whose current culture uses a non-Gregorian default calendar.
    string partitionDate = timestamp.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture);

    RowKey = TableStorageHelpers.GetRowKey(timestamp);
    PartitionKey = $"{hostId}-{partitionDate}";
    Timestamp = timestamp;
    EventVersion = CurrentEventVersion;
}

public string EventVersion { get; set; }

public int HitCount { get; set; }

public string Message { get; set; }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,17 @@

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Azure.Cosmos.Table;
using Microsoft.Azure.WebJobs.Host.Executors;
using Microsoft.Azure.WebJobs.Hosting;
using Microsoft.Azure.WebJobs.Logging;
using Microsoft.Azure.WebJobs.Script.WebHost.Helpers;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;

namespace Microsoft.Azure.WebJobs.Script.WebHost.Diagnostics
Expand All @@ -25,26 +29,31 @@ public class DiagnosticEventTableStorageRepository : IDiagnosticEventRepository,
private readonly IHostIdProvider _hostIdProvider;
private readonly IEnvironment _environment;
private readonly ILogger<DiagnosticEventTableStorageRepository> _logger;
private readonly IServiceProvider _serviceProvider;
private readonly object _syncLock = new object();

private ConcurrentDictionary<string, DiagnosticEvent> _events = new ConcurrentDictionary<string, DiagnosticEvent>();
private CloudTableClient _tableClient;
private CloudTable _diagnosticEventsTable;
private string _hostId;
private bool _disposed = false;
private bool _purged = false;
private string _tableName;

internal DiagnosticEventTableStorageRepository(IConfiguration configuration, IHostIdProvider hostIdProvider, IEnvironment environment, ILogger<DiagnosticEventTableStorageRepository> logger, int logFlushInterval)
/// <summary>
/// Initializes the repository and starts the timer that periodically invokes <c>OnFlushLogs</c>.
/// </summary>
/// <param name="configuration">Configuration stored for later use by the repository.</param>
/// <param name="hostIdProvider">Provider stored for later host id resolution.</param>
/// <param name="environment">Environment abstraction stored for later use.</param>
/// <param name="scriptHostManager">Host manager; kept only as an <see cref="IServiceProvider"/> view of the same object.</param>
/// <param name="logger">Logger used by the repository.</param>
/// <param name="logFlushInterval">Value passed to the timer as both its due time and its period.</param>
internal DiagnosticEventTableStorageRepository(
    IConfiguration configuration,
    IHostIdProvider hostIdProvider,
    IEnvironment environment,
    IScriptHostManager scriptHostManager,
    ILogger<DiagnosticEventTableStorageRepository> logger,
    int logFlushInterval)
{
    _logger = logger;
    _environment = environment;
    _hostIdProvider = hostIdProvider;
    _configuration = configuration;

    // 'as' yields null when the manager does not implement IServiceProvider;
    // IsPrimaryHost() dereferences this field with '?.' for that reason.
    _serviceProvider = scriptHostManager as IServiceProvider;

    // The timer is created last so that every field is assigned before the callback can run.
    _flushLogsTimer = new Timer(OnFlushLogs, null, logFlushInterval, logFlushInterval);
}

public DiagnosticEventTableStorageRepository(IConfiguration configuration, IHostIdProvider hostIdProvider, IEnvironment environment, ILogger<DiagnosticEventTableStorageRepository> logger)
: this(configuration, hostIdProvider, environment, logger, LogFlushInterval) { }
/// <summary>
/// Public constructor; forwards all arguments to the internal constructor and
/// supplies <c>LogFlushInterval</c> as the flush interval.
/// </summary>
public DiagnosticEventTableStorageRepository(IConfiguration configuration, IHostIdProvider hostIdProvider, IEnvironment environment, IScriptHostManager scriptHost,
ILogger<DiagnosticEventTableStorageRepository> logger)
: this(configuration, hostIdProvider, environment, scriptHost, logger, LogFlushInterval) { }

internal CloudTableClient TableClient
{
Expand Down Expand Up @@ -88,7 +97,7 @@ internal CloudTable GetDiagnosticEventsTable(DateTime? now = null)
if (TableClient != null)
{
now = now ?? DateTime.UtcNow;
string currentTableName = GetCurrentTableName(now.Value);
string currentTableName = GetTableName(now.Value);

// update the table reference when date rolls over to a new month
if (_diagnosticEventsTable == null || currentTableName != _tableName)
Expand All @@ -101,43 +110,106 @@ internal CloudTable GetDiagnosticEventsTable(DateTime? now = null)
return _diagnosticEventsTable;
}

private static string GetCurrentTableName(DateTime now)
private static string GetTableName(DateTime date)
{
return $"{TableNamePrefix}{now:yyyyMM}";
return $"{TableNamePrefix}{date:yyyyMM}";
}

/// <summary>
/// Timer callback that runs <c>FlushLogs</c>.
/// </summary>
/// <param name="state">Timer state object; not used by this method.</param>
/// <remarks>
/// This method is <c>async void</c>, so no caller can observe an exception that escapes it.
/// Any failure is therefore caught and logged here instead of being left unhandled.
/// </remarks>
protected internal virtual async void OnFlushLogs(object state)
{
    try
    {
        await FlushLogs();
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Unhandled error occurred while flushing diagnostic events.");
    }
}

/// <summary>
/// Deletes diagnostic event tables whose records were written without an <c>EventVersion</c>
/// or with a version that sorts (ordinally) before <c>DiagnosticEvent.CurrentEventVersion</c>.
/// </summary>
/// <remarks>
/// Only the first segment returned by the table query is inspected for each table.
/// <c>_purged</c> is set once a full pass over the listed tables completes without an exception.
/// If any table was deleted, the method waits 30 seconds before returning.
/// </remarks>
private async Task PurgePreviousEventVersions()
{
    _logger.LogDebug("Purging diagnostic events with versions older than '{currentEventVersion}'.", DiagnosticEvent.CurrentEventVersion);

    bool anyTableDeleted = false;

    // NOTE(review): exceptions are caught and logged inside the delegate, so none reach
    // InvokeWithRetriesAsync. Presumably that helper retries only on exceptions; confirm
    // whether the retry settings below ever take effect.
    await Utility.InvokeWithRetriesAsync(async () =>
    {
        try
        {
            var candidateTables = (await TableStorageHelpers.ListTablesAsync(TableClient, TableNamePrefix)).ToList();

            foreach (var candidate in candidateTables)
            {
                var firstSegment = await candidate.ExecuteQuerySegmentedAsync(new TableQuery<DiagnosticEvent>(), null);
                var records = firstSegment.Results;

                // Nothing to inspect in an empty table; leave it in place.
                if (records.Count == 0)
                {
                    continue;
                }

                if (records.Any(r => string.IsNullOrEmpty(r.EventVersion)))
                {
                    // At least one record has no EventVersion at all.
                    _logger.LogDebug("Deleting table '{tableName}' as it contains records without an EventVersion.", candidate.Name);
                }
                else if (records.Any(r => string.CompareOrdinal(DiagnosticEvent.CurrentEventVersion, r.EventVersion) > 0))
                {
                    // Every record is versioned, but at least one version is older than the current one.
                    _logger.LogDebug("Deleting table '{tableName}' as it contains records with an outdated EventVersion.", candidate.Name);
                }
                else
                {
                    // All inspected records carry the current (or a newer) version.
                    continue;
                }

                await candidate.DeleteIfExistsAsync();
                anyTableDeleted = true;
            }

            _purged = true;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error occurred when attempting to purge previous diagnostic event versions.");
        }
    }, maxRetries: 5, retryInterval: TimeSpan.FromSeconds(5));

    if (!anyTableDeleted)
    {
        return;
    }

    // Wait for 30 seconds to allow the table to be deleted before proceeding to avoid a potential race.
    await Task.Delay(TimeSpan.FromSeconds(30));
}

internal virtual async Task FlushLogs(CloudTable table = null)
{
if (_environment.IsPlaceholderModeEnabled())
{
return;
}

if (IsPrimaryHost() && !_purged)
{
await PurgePreviousEventVersions();
}

try
{
table = table ?? GetDiagnosticEventsTable();

if (table == null)
{
_logger.LogError("Unable to get table reference. Aborting write operation");
_logger.LogError("Unable to get table reference. Aborting write operation.");
StopTimer();
return;
}

bool tableCreated = await TableStorageHelpers.CreateIfNotExistsAsync(table, TableCreationMaxRetryCount);
if (tableCreated)
{
_logger.LogDebug("Queueing background table purge.");
TableStorageHelpers.QueueBackgroundTablePurge(table, TableClient, TableNamePrefix, _logger);
}
}
catch (Exception ex)
{
_logger.LogError(ex, $"Unable to get table reference or create table. Aborting write operation.");
_logger.LogError(ex, "Unable to get table reference or create table. Aborting write operation.");
// Clearing the memory cache to avoid memory build up.
_events.Clear();
return;
Expand Down Expand Up @@ -169,9 +241,9 @@ internal async Task ExecuteBatchAsync(ConcurrentDictionary<string, DiagnosticEve
await table.ExecuteBatchAsync(batch);
events.Clear();
}
catch (Exception e)
catch (Exception ex)
{
_logger.LogError(e, $"Unable to write diagnostic events to table storage:{e}");
_logger.LogError(ex, "Unable to write diagnostic events to table storage.");
}
}

Expand Down Expand Up @@ -202,9 +274,21 @@ public void WriteDiagnosticEvent(DateTime timestamp, string errorCode, LogLevel
}
}

/// <summary>
/// Reports whether this host is the primary host, according to the
/// <c>IPrimaryHostStateProvider</c> resolved from the stored service provider.
/// </summary>
/// <returns>
/// The provider's <c>IsPrimary</c> value, or <c>false</c> when no provider could be resolved
/// (including when the stored service provider itself is null).
/// </returns>
private bool IsPrimaryHost()
{
    if (_serviceProvider?.GetService<IPrimaryHostStateProvider>() is IPrimaryHostStateProvider stateProvider)
    {
        return stateProvider.IsPrimary;
    }

    _logger.LogDebug("PrimaryHostStateProvider is not available. Skipping the check for primary host.");
    return false;
}

/// <summary>
/// Logs that the flush timer is being stopped and then disables it by setting both
/// its due time and its period to <see cref="Timeout.Infinite"/>.
/// </summary>
private void StopTimer()
{
// NOTE(review): the two log calls below differ only by the trailing period; this looks like
// the old and new line of a diff shown together. Confirm that only one is intended.
_logger.LogInformation("Stopping the flush logs timer");
_logger.LogInformation("Stopping the flush logs timer.");
// Null-conditional call: nothing happens when _flushLogsTimer is null.
_flushLogsTimer?.Change(Timeout.Infinite, Timeout.Infinite);
}

Expand Down
Loading

0 comments on commit e07dacb

Please sign in to comment.