From a3b8dd100d3205de69535ae075387350be53299f Mon Sep 17 00:00:00 2001 From: "Michael C. Fanning" Date: Sat, 8 Oct 2022 17:33:03 -0700 Subject: [PATCH] Resolve 'collection modified' exception when logging results (#2549) * Introduce slim semaphore to enable per-file-hash results caching. * Latest iteration of concurrency fix for 'collection modified' exception. * Improvements to concurrency change. * Skip Coyote testing for now. * Add release note and incidental PR clean-up. * Final PR clean-up. * Null hash data when not present. --- src/ReleaseHistory.md | 2 + .../Sdk/MultithreadedAnalyzeCommandBase.cs | 124 +++++++++--------- src/Sarif/Core/Artifact.cs | 38 +++++- src/Sarif/HashUtilities.cs | 12 ++ src/Sarif/Writers/CachingLogger.cs | 19 +++ .../Sdk/AnalyzeCommandBaseTests.cs | 46 ++++--- .../Sdk/FormatForVisualStudioTests.cs | 1 - .../Sdk/SkimmerBaseTests.cs | 4 - .../Sdk/TestSkimmerBase.cs | 3 - .../TestUtilitiesExtensions.cs | 1 - 10 files changed, 162 insertions(+), 88 deletions(-) diff --git a/src/ReleaseHistory.md b/src/ReleaseHistory.md index 95911ee11..441316966 100644 --- a/src/ReleaseHistory.md +++ b/src/ReleaseHistory.md @@ -1,6 +1,8 @@ # SARIF Package Release History (SDK, Driver, Converters, and Multitool) ## Unreleased +* BUGFIX: Another attempt to resolve 'InvalidOperationException' with message `Collection was modified; enumeration operation may not execute` in `MultithreadedAnalyzeCommandBase`, raised when analyzing with the `--hashes` switch. [#2459](https://github.com/microsoft/sarif-sdk/pull/2549). There was a previous attempt to fix this in [#2447](https://github.com/microsoft/sarif-sdk/pull/2447). + * FEATURE: Allow initialization of file regions cache in `InsertOptionalDataVisitor` (previously initialized exclusively from `FileRegionsCache.Instance`). * BUGFIX: Resolve issue where `match-results-forward` command fails to generate VersionControlDetails data. [#2487](https://github.com/microsoft/sarif-sdk/pull/2487) * BUGFIX: Remove duplicated rule definitions when executing `match-results-forward` commands for results with sub-rule ids. [#2486](https://github.com/microsoft/sarif-sdk/pull/2486) diff --git a/src/Sarif.Driver/Sdk/MultithreadedAnalyzeCommandBase.cs b/src/Sarif.Driver/Sdk/MultithreadedAnalyzeCommandBase.cs index b03e2002d..476c6a8ef 100644 --- a/src/Sarif.Driver/Sdk/MultithreadedAnalyzeCommandBase.cs +++ b/src/Sarif.Driver/Sdk/MultithreadedAnalyzeCommandBase.cs @@ -28,17 +28,16 @@ public abstract class MultithreadedAnalyzeCommandBase : Plug internal bool _captureConsoleOutput; internal ConsoleLogger _consoleLogger; - internal ConcurrentDictionary _analysisLoggerCache; private Run _run; private Tool _tool; private bool _computeHashes; internal TContext _rootContext; private int _fileContextsCount; - private Channel _hashChannel; + private Channel readyToHashChannel; private OptionallyEmittedData _dataToInsert; private Channel _resultsWritingChannel; - private Channel _fileEnumerationChannel; + private Channel readyToScanChannel; private IDictionary _pathToHashDataMap; private ConcurrentDictionary _fileContexts; @@ -190,12 +189,18 @@ private void MultithreadedAnalyzeTargets(TOptions options, : (Debugger.IsAttached) ? 1 : Environment.ProcessorCount; var channelOptions = new BoundedChannelOptions(2000) + { + SingleWriter = true, + SingleReader = true, + }; + readyToHashChannel = Channel.CreateBounded(channelOptions); + + channelOptions = new BoundedChannelOptions(2000) { SingleWriter = true, SingleReader = false, }; - _fileEnumerationChannel = Channel.CreateBounded(channelOptions); - _hashChannel = Channel.CreateBounded(channelOptions); + readyToScanChannel = Channel.CreateBounded(channelOptions); channelOptions = new BoundedChannelOptions(2000) { @@ -206,26 +211,39 @@ private void MultithreadedAnalyzeTargets(TOptions options, var sw = Stopwatch.StartNew(); - var workers = new Task[options.Threads]; + // 1: First we initiate an asynchronous operation to locate disk files for + // analysis, as specified in analysis configuration (file names, wildcards). + Task enumerateFilesOnDisk = EnumerateFilesOnDiskAsync(options, rootContext); + // 2: Files found on disk are put in a specific sort order, after which a + // reference to each scan target is put into a channel for hashing, + // if hashing is enabled. + Task hashFilesAndPutInAnalysisQueue = HashFilesAndPutInAnalysisQueueAsnc(); + + // 3: A dedicated set of threads pull scan targets and analyze them. + // On completing a scan, the thread writes the index of the + // scanned item to a channel that drives logging. + var workers = new Task[options.Threads]; for (int i = 0; i < options.Threads; i++) { - workers[i] = AnalyzeTargetAsync(skimmers, disabledSkimmers); + workers[i] = ScanTargetsAsync(skimmers, disabledSkimmers); } - Task hashFiles = HashAsync(); - Task findFiles = FindFilesAsync(options, rootContext); - Task writeResults = WriteResultsAsync(rootContext); - - // FindFiles is single-thread and will close its write channel - findFiles.Wait(); - hashFiles.Wait(); + // 4: A single-threaded consumer watches for completed scans + // and logs results, if any. This operation is singlle-threaded + // in order to help ensure determinism in log output. i.e., any + // scan of the same targets using the same production code + // should produce a log file that is byte-for-byte identical + // to the previous output. + Task logScanResults = LogScanResultsAsync(rootContext); Task.WhenAll(workers) .ContinueWith(_ => _resultsWritingChannel.Writer.Complete()) .Wait(); - writeResults.Wait(); + enumerateFilesOnDisk.Wait(); + hashFilesAndPutInAnalysisQueue.Wait(); + logScanResults.Wait(); Console.WriteLine(); @@ -239,7 +257,7 @@ private void MultithreadedAnalyzeTargets(TOptions options, } } - private async Task WriteResultsAsync(TContext rootContext) + private async Task LogScanResultsAsync(TContext rootContext) { int currentIndex = 0; @@ -265,15 +283,7 @@ private async Task WriteResultsAsync(TContext rootContext) while (context?.AnalysisComplete == true) { - if (_computeHashes) - { - bool cache = _analysisLoggerCache.TryGetValue(context.Hashes.Sha256, out IAnalysisLogger logger); - LogCachingLogger(rootContext, logger ?? context.Logger, context, clone: cache); - } - else - { - LogCachingLogger(rootContext, context.Logger, context); - } + LogCachingLogger(rootContext, context, clone: _computeHashes); RuntimeErrors |= context.RuntimeErrors; @@ -300,9 +310,9 @@ private async Task WriteResultsAsync(TContext rootContext) return true; } - private void LogCachingLogger(TContext rootContext, IAnalysisLogger logger, TContext context, bool clone = false) + private void LogCachingLogger(TContext rootContext, TContext context, bool clone = false) { - var cachingLogger = (CachingLogger)logger; + var cachingLogger = (CachingLogger)context.Logger; IDictionary> results = cachingLogger.Results; if (results?.Count > 0) @@ -352,7 +362,7 @@ private void LogCachingLogger(TContext rootContext, IAnalysisLogger logger, TCon } } - private async Task FindFilesAsync(TOptions options, TContext rootContext) + private async Task EnumerateFilesOnDiskAsync(TOptions options, TContext rootContext) { this._fileContextsCount = 0; this._fileContexts = new ConcurrentDictionary(); @@ -432,12 +442,12 @@ private async Task FindFilesAsync(TOptions options, TContext rootContext) filePath: file) ); - await _hashChannel.Writer.WriteAsync(_fileContextsCount++); + await readyToHashChannel.Writer.WriteAsync(_fileContextsCount++); } } } - _hashChannel.Writer.Complete(); + readyToHashChannel.Writer.Complete(); if (_fileContextsCount == 0) { @@ -464,9 +474,11 @@ private void EnqueueAllDirectories(Queue queue, string directory) } } - private async Task HashAsync() + private async Task HashFilesAndPutInAnalysisQueueAsnc() { - ChannelReader reader = _hashChannel.Reader; + ChannelReader reader = readyToHashChannel.Reader; + + Dictionary loggerCache = null; // Wait until there is work or the channel is closed. while (await reader.WaitToReadAsync()) @@ -487,20 +499,28 @@ private async Task HashAsync() { _pathToHashDataMap.Add(localPath, hashData); } + + loggerCache ??= new Dictionary(); + + if (!loggerCache.TryGetValue(hashData.Sha256, out CachingLogger logger)) + { + logger = loggerCache[hashData.Sha256] = (CachingLogger)context.Logger; + } + context.Logger = logger; } - await _fileEnumerationChannel.Writer.WriteAsync(index); + await readyToScanChannel.Writer.WriteAsync(index); } } - _fileEnumerationChannel.Writer.Complete(); + readyToScanChannel.Writer.Complete(); return true; } - private async Task AnalyzeTargetAsync(IEnumerable> skimmers, ISet disabledSkimmers) + private async Task ScanTargetsAsync(IEnumerable> skimmers, ISet disabledSkimmers) { - ChannelReader reader = _fileEnumerationChannel.Reader; + ChannelReader reader = readyToScanChannel.Reader; // Wait until there is work or the channel is closed. while (await reader.WaitToReadAsync()) @@ -532,9 +552,6 @@ private async Task AnalyzeTargetAsync(IEnumerable> skimm protected virtual void ValidateOptions(TOptions options, TContext context) { - _dataToInsert = options.DataToInsert.ToFlags(); - _computeHashes = (_dataToInsert & OptionallyEmittedData.Hashes) != 0; - bool succeeded = true; succeeded &= ValidateFile(context, options.OutputFilePath, DefaultPolicyName, shouldExist: null); @@ -564,10 +581,8 @@ internal AggregatingLogger InitializeLogger(AnalyzeOptionsBase analyzeOptions) logger.Loggers.Add(_consoleLogger); } - if ((analyzeOptions.DataToInsert.ToFlags() & OptionallyEmittedData.Hashes) != 0) - { - _analysisLoggerCache = new ConcurrentDictionary(); - } + _dataToInsert = analyzeOptions.DataToInsert.ToFlags(); + _computeHashes = (_dataToInsert & OptionallyEmittedData.Hashes) != 0; return logger; } @@ -868,29 +883,20 @@ protected virtual TContext DetermineApplicabilityAndAnalyze( return context; } - IAnalysisLogger logger = context.Logger; - - if (_computeHashes) - { - if (_analysisLoggerCache.ContainsKey(context.Hashes.Sha256)) - { - return context; - } - } - - context.Logger.AnalyzingTarget(context); + CachingLogger logger = (CachingLogger)context.Logger; + logger.AnalyzingTarget(context); - if (_computeHashes) + if (logger.CacheFinalized) { - if (!_analysisLoggerCache.TryAdd(context.Hashes.Sha256, logger)) - { - return context; - } + context.Logger = logger; + logger.ReleaseLock(); + return context; } IEnumerable> applicableSkimmers = DetermineApplicabilityForTarget(context, skimmers, disabledSkimmers); AnalyzeTarget(context, applicableSkimmers, disabledSkimmers); + logger.ReleaseLock(); return context; } diff --git a/src/Sarif/Core/Artifact.cs b/src/Sarif/Core/Artifact.cs index a94c3f866..5f729a6bc 100644 --- a/src/Sarif/Core/Artifact.cs +++ b/src/Sarif/Core/Artifact.cs @@ -29,6 +29,7 @@ public static Artifact Create( var artifact = new Artifact() { Encoding = encoding?.WebName, + Hashes = hashData != null ? CreateHashesDictionary(hashData) : null, }; string mimeType = SarifWriters.MimeType.DetermineFromFileExtension(uri); @@ -70,12 +71,17 @@ public static Artifact Create( if (dataToInsert.HasFlag(OptionallyEmittedData.Hashes)) { HashData hashes = hashData ?? HashUtilities.ComputeHashes(filePath); - artifact.Hashes = new Dictionary + + // The hash utilities will return null data in some text contexts. + if (hashes != null) { - { "md5", hashes.MD5 }, - { "sha-1", hashes.Sha1 }, - { "sha-256", hashes.Sha256 }, - }; + artifact.Hashes = new Dictionary + { + { "md5", hashes.MD5 }, + { "sha-1", hashes.Sha1 }, + { "sha-256", hashes.Sha256 }, + }; + } } } catch (Exception e) when (e is IOException || e is UnauthorizedAccessException) { } @@ -83,6 +89,28 @@ public static Artifact Create( return artifact; } + private static IDictionary CreateHashesDictionary(HashData hashData) + { + var result = new Dictionary(); + + if (!string.IsNullOrEmpty(hashData?.MD5)) + { + result["md5"] = hashData?.MD5; + } + + if (!string.IsNullOrEmpty(hashData?.Sha1)) + { + result["sha-1"] = hashData?.Sha1; + } + + if (!string.IsNullOrEmpty(hashData?.Sha256)) + { + result["sha-256"] = hashData?.Sha256; + } + + return result; + } + private static ArtifactContent GetEncodedFileContents(IFileSystem fileSystem, string filePath, string mimeType, Encoding inputFileEncoding) { var fileContent = new ArtifactContent(); diff --git a/src/Sarif/HashUtilities.cs b/src/Sarif/HashUtilities.cs index 3e8ca9434..7188ecb47 100644 --- a/src/Sarif/HashUtilities.cs +++ b/src/Sarif/HashUtilities.cs @@ -77,6 +77,18 @@ public static HashData ComputeHashes(string fileName, IFileSystem fileSystem = n { using (Stream stream = fileSystem.FileOpenRead(fileName)) { + // This condition is actually only feasible in testing, as a null + // value will only be returned by a mock object that doesn't + // recognize the current specified file argument. In production, + // an exception will always be raised for a missing file. If + // we enter the code below, that indicates that a test + // encountered an adverse condition and is attempting to produce + // a file hash for some source file in a notification stack. We + // return null here, as the actual source hash isn't interesting + // for this scenario, and we want to reliably finish test execution + // and record exception details. + if (stream == null) { return null; } + using (var bufferedStream = new BufferedStream(stream, 1024 * 32)) { string md5, sha1, sha256; diff --git a/src/Sarif/Writers/CachingLogger.cs b/src/Sarif/Writers/CachingLogger.cs index cbc32408a..a99a39464 100644 --- a/src/Sarif/Writers/CachingLogger.cs +++ b/src/Sarif/Writers/CachingLogger.cs @@ -3,6 +3,7 @@ using System; using System.Collections.Generic; +using System.Threading; namespace Microsoft.CodeAnalysis.Sarif.Writers { @@ -16,6 +17,9 @@ public class CachingLogger : BaseLogger, IAnalysisLogger { public CachingLogger(IEnumerable levels, IEnumerable kinds) : base(levels, kinds) { + // This reader lock is used to ensure only a single writer until + // logging is complete, after which all threads can read Results. + _semaphore = new SemaphoreSlim(initialCount: 1, maxCount: 1); } public IDictionary> Results { get; set; } @@ -24,6 +28,14 @@ public CachingLogger(IEnumerable levels, IEnumerable k public IList ToolNotifications { get; set; } + /// + /// Gets or sets a boolean value that indicates whether the Results + /// object associated with this logger is fixed and ready to replay. + /// + public bool CacheFinalized { get; private set; } + + private readonly SemaphoreSlim _semaphore; + public void AnalysisStarted() { } @@ -34,6 +46,7 @@ public void AnalysisStopped(RuntimeConditions runtimeConditions) public void AnalyzingTarget(IAnalysisContext context) { + _semaphore.Wait(); } public void Log(ReportingDescriptor rule, Result result) @@ -93,5 +106,11 @@ public void LogToolNotification(Notification notification) ToolNotifications ??= new List(); ToolNotifications.Add(notification); } + + public void ReleaseLock() + { + CacheFinalized = true; + _semaphore.Release(); + } } } diff --git a/src/Test.UnitTests.Sarif.Driver/Sdk/AnalyzeCommandBaseTests.cs b/src/Test.UnitTests.Sarif.Driver/Sdk/AnalyzeCommandBaseTests.cs index 39e42da3f..b028aa9b6 100644 --- a/src/Test.UnitTests.Sarif.Driver/Sdk/AnalyzeCommandBaseTests.cs +++ b/src/Test.UnitTests.Sarif.Driver/Sdk/AnalyzeCommandBaseTests.cs @@ -1444,7 +1444,7 @@ public void AnalyzeCommandBase_AutomationDetailsTests() } } - [Fact(Timeout = 5000)] + [Fact(Timeout = 5000, Skip = "TBD: this Coyote test will be enabled in a future nightly pipeline test run.")] public void AnalyzeCommandBase_ShouldGenerateSameResultsWhenRunningSingleAndMultithreaded_CoyoteTest() { Configuration config = Configuration.Create().WithTestingIterations(100).WithSystematicFuzzingEnabled(); @@ -1630,18 +1630,26 @@ private void AnalyzeScenarios(int[] scenarios) public void AnalyzeCommandBase_MultithreadedShouldUseCacheIfFilesAreTheSame() { // Generating 20 files with different names but same content. + // Generally, we expect the test analyzer to produce a result + // based on the file name. Because every file is a duplicate + // of every other in this case, though, we expect to see a + // result for every file, because the first one analyzed + // produces a result and therefore every identical file (by + // file hash, not by file name) will also produce that result. RunMultithreadedAnalyzeCommand(ComprehensiveKindAndLevelsByFilePath, - generateSameOutput: true, + generateDuplicateScanTargets: true, expectedResultCode: 0, - expectedResultCount: 20, - expectedCacheSize: 1); + expectedResultCount: 20); // Generating 20 files with different names and content. + // For this case, our expected result count matches the default + // behavior of the test analyzer and our default analyzer settings. + // By default, our analysis produces output for errors and warnings + // and there happen to be 7 files that comprises these failure levels. RunMultithreadedAnalyzeCommand(ComprehensiveKindAndLevelsByFilePath, - generateSameOutput: false, + generateDuplicateScanTargets: false, expectedResultCode: 0, - expectedResultCount: 7, - expectedCacheSize: 20); + expectedResultCount: 7); } private static readonly IList ComprehensiveKindAndLevelsByFileName = new List @@ -1801,8 +1809,8 @@ private static void EnhanceOptions(TestAnalyzeOptions current, TestAnalyzeOption private static IFileSystem CreateDefaultFileSystemForResultsCaching(IList files, bool generateSameInput = false) { - // This helper creates a file system that returns the same file contents for - // every file passed in the 'files' argument. + // This helper creates a file system that generates unique or entirely + // duplicate content for every file passed in the 'files' argument. string logFileContents = Guid.NewGuid().ToString(); @@ -1895,16 +1903,15 @@ private static SarifLog RunAnalyzeCommand(TestAnalyzeOptions options, } private static void RunMultithreadedAnalyzeCommand(IList files, - bool generateSameOutput, + bool generateDuplicateScanTargets, int expectedResultCode, - int expectedResultCount, - int expectedCacheSize) + int expectedResultCount) { var testCase = new ResultsCachingTestCase { Files = files, PersistLogFileToDisk = true, - FileSystem = CreateDefaultFileSystemForResultsCaching(files, generateSameOutput) + FileSystem = CreateDefaultFileSystemForResultsCaching(files, generateDuplicateScanTargets) }; var options = new TestAnalyzeOptions @@ -1926,11 +1933,20 @@ private static void RunMultithreadedAnalyzeCommand(IList files, }; HashUtilities.FileSystem = testCase.FileSystem; - command.Run(options).Should().Be(expectedResultCode); + int result = command.Run(options); + result.Should().Be(expectedResultCode); SarifLog sarifLog = JsonConvert.DeserializeObject(File.ReadAllText(options.OutputFilePath)); sarifLog.Runs[0].Results.Count.Should().Be(expectedResultCount); - command._analysisLoggerCache.Count.Should().Be(expectedCacheSize); + + HashSet hashes = new HashSet(); + foreach (Artifact artifact in sarifLog.Runs[0].Artifacts) + { + hashes.Add(artifact.Hashes["sha-256"]); + } + + int expectedUniqueFileHashCount = generateDuplicateScanTargets ? 1 : expectedResultCount; + hashes.Count.Should().Be(expectedUniqueFileHashCount); } finally { diff --git a/src/Test.UnitTests.Sarif.Driver/Sdk/FormatForVisualStudioTests.cs b/src/Test.UnitTests.Sarif.Driver/Sdk/FormatForVisualStudioTests.cs index 06848417d..e54f40cde 100644 --- a/src/Test.UnitTests.Sarif.Driver/Sdk/FormatForVisualStudioTests.cs +++ b/src/Test.UnitTests.Sarif.Driver/Sdk/FormatForVisualStudioTests.cs @@ -1,7 +1,6 @@ // Copyright (c) Microsoft. All rights reserved. // Licensed under the MIT license. See LICENSE file in the project root for full license information. -using System; using System.Collections.Generic; using FluentAssertions; diff --git a/src/Test.UnitTests.Sarif.Driver/Sdk/SkimmerBaseTests.cs b/src/Test.UnitTests.Sarif.Driver/Sdk/SkimmerBaseTests.cs index dfbc7c25c..c5d2eaadf 100644 --- a/src/Test.UnitTests.Sarif.Driver/Sdk/SkimmerBaseTests.cs +++ b/src/Test.UnitTests.Sarif.Driver/Sdk/SkimmerBaseTests.cs @@ -1,12 +1,8 @@ // Copyright (c) Microsoft. All rights reserved. // Licensed under the MIT license. See LICENSE file in the project root for full license information. -using System.Linq; - using FluentAssertions; -using Microsoft.CodeAnalysis.Sarif; - using Xunit; namespace Microsoft.CodeAnalysis.Sarif.Driver.Sdk diff --git a/src/Test.UnitTests.Sarif.Driver/Sdk/TestSkimmerBase.cs b/src/Test.UnitTests.Sarif.Driver/Sdk/TestSkimmerBase.cs index 7325901c7..5d0994b7d 100644 --- a/src/Test.UnitTests.Sarif.Driver/Sdk/TestSkimmerBase.cs +++ b/src/Test.UnitTests.Sarif.Driver/Sdk/TestSkimmerBase.cs @@ -5,9 +5,6 @@ using System.Collections.Generic; using System.Resources; -using Microsoft.CodeAnalysis.Sarif; -using Microsoft.CodeAnalysis.Sarif.Driver; - namespace Microsoft.CodeAnalysis.Sarif.Driver { // This class overrides all the abstract members of SkimmerBase. As a result, diff --git a/src/Test.Utilities.Sarif/TestUtilitiesExtensions.cs b/src/Test.Utilities.Sarif/TestUtilitiesExtensions.cs index 849a2d9f4..93a230995 100644 --- a/src/Test.Utilities.Sarif/TestUtilitiesExtensions.cs +++ b/src/Test.Utilities.Sarif/TestUtilitiesExtensions.cs @@ -4,7 +4,6 @@ using System; using System.Collections.Generic; using System.Linq; -using System.Runtime.CompilerServices; using Microsoft.CodeAnalysis.Sarif;