Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SARIF has per-line rolling (partial) hash support #2605

Merged
merged 23 commits into from
Jan 26, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
cda8b9e
starting rolling hash implementation: added computeFirstMod and data …
suvamM Dec 17, 2022
b83c52e
starting rolling hash implementation: added computeFirstMod and data …
suvamM Dec 17, 2022
ed02fef
Merge branch 'users/suvam/rolling-hash' of https://github.com/microso…
suvamM Jan 10, 2023
ddb0a20
[wip] added hashing algorithm to file regions
suvamM Jan 12, 2023
69ec96e
[wip] Fixes to the rolling hash computation
suvamM Jan 17, 2023
8518dbe
[wip] fixes to the hashing algorithm.
suvamM Jan 17, 2023
95b3a0b
[wip] Moving hash computation to HashUtilities
suvamM Jan 18, 2023
96a4d57
Porting tests from CodeQL repo
suvamM Jan 21, 2023
b7c9fd3
Adding unit tests for rolling hash
suvamM Jan 23, 2023
b974cf9
Adding comments
suvamM Jan 23, 2023
77b5297
[wip] added hashing algorithm to file regions
suvamM Jan 12, 2023
822c0bb
[wip] Fixes to the rolling hash computation
suvamM Jan 17, 2023
b34980d
[wip] fixes to the hashing algorithm.
suvamM Jan 17, 2023
e52643a
[wip] Moving hash computation to HashUtilities
suvamM Jan 18, 2023
9b31212
Porting tests from CodeQL repo
suvamM Jan 21, 2023
095df5d
Adding unit tests for rolling hash
suvamM Jan 23, 2023
6cd2eec
Adding comments
suvamM Jan 23, 2023
983f8e5
Merge branch 'users/suvam/rolling-hash' of https://github.com/microso…
suvamM Jan 23, 2023
b1169f9
removing generics in file regions cache
suvamM Jan 23, 2023
b64c8c9
updating Release History
suvamM Jan 23, 2023
060174b
Merge branch 'main' into users/suvam/rolling-hash
suvamM Jan 25, 2023
a674a0c
incorporating PR feedback
suvamM Jan 25, 2023
573291a
format fixing Long
suvamM Jan 25, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion NuGet.Config
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<packageSources>
<clear />
Expand Down
195 changes: 195 additions & 0 deletions src/Sarif/FileRegionsCache.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;

Expand All @@ -18,6 +19,15 @@ public class FileRegionsCache
public const int DefaultCacheCapacity = 100;
private readonly IFileSystem _fileSystem;
internal readonly Cache<string, Tuple<string, NewLineIndex>> _cache;
// Rolling hashes that have been computed so far: file path -> (line number -> hash string).
internal readonly Dictionary<string, Dictionary<int, string>> _rollingHashesCache;

// Character codes that the rolling hash computation treats specially.
private static readonly int tab = '\t';
private static readonly int space = ' ';
private static readonly int lf = '\n';
private static readonly int cr = '\r';

// Sentinel value pushed through the hash once, after the last character of a file.
private static readonly int EOF = 65535;

// Number of characters held in the rolling window.
private static readonly int BLOCK_SIZE = 5;

// Multiplier applied to the running hash for every character that enters the window.
private static readonly long MOD = 37;

/// <summary>
/// Creates a new <see cref="FileRegionsCache"/> object.
Expand All @@ -34,6 +44,9 @@ public FileRegionsCache(int capacity = DefaultCacheCapacity, IFileSystem fileSys

// Build a cache for this data, with the load method it should use to add new entries
_cache = new Cache<string, Tuple<string, NewLineIndex>>(BuildIndexForFile, capacity);

// Build a cache of rolling hashes (partial hash per line) for each artifact
_rollingHashesCache = new Dictionary<string, Dictionary<int, string>>();
}

/// <summary>
Expand Down Expand Up @@ -92,6 +105,158 @@ public void ClearCache()
this._cache.Clear();
}

/// <summary>
/// Placeholder public entry point that runs the private rolling hash computation
/// for a single file.
/// </summary>
/// <param name="uri">Identifies the file whose lines should be hashed.</param>
public void SuvamTest(Uri uri) => Hash(uri);

/// <summary>
/// Computes MOD raised to the power BLOCK_SIZE by repeated multiplication.
/// </summary>
/// <returns>The product of BLOCK_SIZE copies of MOD (1 when BLOCK_SIZE is not positive).</returns>
private long ComputeFirstMod()
{
    long power = 1L;
    int remaining = BLOCK_SIZE;

    while (remaining > 0)
    {
        power *= MOD;
        remaining--;
    }

    return power;
}

/// <summary>
/// Computes a rolling hash for each line of the file identified by <paramref name="uri"/>
/// and stores the results in <see cref="_rollingHashesCache"/>, keyed first by file path
/// and then by 1-based line number.
/// </summary>
/// <remarks>
/// <para>
/// Spaces and tabs are ignored. A carriage return is hashed as a line feed, and a line feed
/// that immediately follows a carriage return is dropped.
/// </para>
/// <para>
/// Each stored value has the form <c>hash:count</c>, where <c>hash</c> is the hash as 16
/// lower-case hexadecimal digits and <c>count</c> is how many lines of this file have
/// produced that same hash so far (including the current one).
/// </para>
/// <para>
/// A file that already has at least one hash recorded is not processed again. A file that
/// does not exist or cannot be read leaves an empty entry behind, so a later call retries.
/// </para>
/// </remarks>
/// <param name="uri">Identifies the file whose lines should be hashed.</param>
private void Hash(Uri uri)
{
    string filePath = uri.GetFilePath();

    // Fetch (or create) the per-file table with a single lookup.
    Dictionary<int, string> lineHashes;
    if (!_rollingHashesCache.TryGetValue(filePath, out lineHashes))
    {
        lineHashes = new Dictionary<int, string>();
        _rollingHashesCache.Add(filePath, lineHashes);
    }

    // Check if we have already computed the rolling hashes for this file.
    if (lineHashes.Count > 0)
    {
        return;
    }

    // A rolling view into the input
    int[] window = new int[BLOCK_SIZE];

    // For each window slot: the number of the line that started at that slot and is still
    // waiting for its hash to be emitted, or -1 when no line is pending there.
    int[] lineNumbers = new int[BLOCK_SIZE];
    for (int i = 0; i < lineNumbers.Length; i++)
    {
        lineNumbers[i] = -1;
    }

    long hashRaw = 0;
    long firstMod = ComputeFirstMod();

    // The current index in the window, will wrap around to zero when we reach BLOCK_SIZE
    int index = 0;
    // The line number of the character we are currently processing from the input
    int lineNumber = 0;
    // Is the next character to be read the start of a new line
    bool lineStart = true;
    // Was the previous character a CR (carriage return)
    bool prevCR = false;

    // How many times each hash value has been emitted for this file so far.
    Dictionary<string, int> hashCounts = new Dictionary<string, int>();

    // Output the current hash and line number to the cache
    Action outputHash = () =>
    {
        // Reinterpret the bits as unsigned; 'unchecked' keeps this from throwing for
        // negative values even when the assembly is compiled with overflow checking.
        ulong uhashRaw = unchecked((ulong)hashRaw);
        string hashValue = uhashRaw.ToString("x16");

        // TryGetValue leaves 'occurrences' at 0 when the hash has not been seen before.
        int occurrences;
        hashCounts.TryGetValue(hashValue, out occurrences);
        occurrences++;
        hashCounts[hashValue] = occurrences;

        lineHashes[lineNumbers[index]] = $"{hashValue}:{occurrences}";
        lineNumbers[index] = -1;
    };

    // Update the current hash value and increment the index in the window
    Action<int> updateHash = (current) =>
    {
        int begin = window[index];
        window[index] = current;

        // Shift the hash by one character: scale it, add the incoming character and remove
        // the contribution of the character that just left the window. Wrap-around on
        // overflow is intended, so make it explicit.
        hashRaw = unchecked((MOD * hashRaw) + (long)current - (firstMod * (long)begin));

        index = (index + 1) % BLOCK_SIZE;
    };

    // First process every character in the input, updating the hash and lineNumbers
    // as we go. Once we reach a point in the window again then we've processed
    // BLOCK_SIZE characters and if the last character at this point in the window
    // was the start of a line then we should output the hash for that line.
    Action<int> processCharacter = (current) =>
    {
        // skip tabs, spaces, and line feeds that come directly after a carriage return
        if (current == space || current == tab || (prevCR && current == lf))
        {
            prevCR = false;
            return;
        }

        // replace CR with LF
        if (current == cr)
        {
            current = lf;
            prevCR = true;
        }
        else
        {
            prevCR = false;
        }

        if (lineNumbers[index] != -1)
        {
            outputHash();
        }

        if (lineStart)
        {
            lineStart = false;
            lineNumber++;
            lineNumbers[index] = lineNumber;
        }

        if (current == lf)
        {
            lineStart = true;
        }

        updateHash(current);
    };

    string fileText = null;
    try
    {
        if (_fileSystem.FileExists(filePath))
        {
            fileText = _fileSystem.FileReadAllText(filePath);
        }
    }
    catch (IOException)
    {
        // Best effort: an unreadable file is treated like a missing one and simply
        // produces no hashes.
    }

    if (fileText == null)
    {
        return;
    }

    foreach (char character in fileText)
    {
        processCharacter(character);
    }

    processCharacter(EOF);

    // Flush the remaining lines
    for (int i = 0; i < BLOCK_SIZE; i++)
    {
        if (lineNumbers[index] != -1)
        {
            outputHash();
        }

        updateHash(0);
    }
}


private Region PopulateTextRegionProperties(NewLineIndex lineIndex, Region inputRegion, string fileText, bool populateSnippet)
{
// A GENERAL NOTE ON THE PROPERTY POPULATION PROCESS:
Expand Down Expand Up @@ -402,6 +567,36 @@ private Tuple<string, NewLineIndex> BuildIndexForFile(string localPath)
return new Tuple<string, NewLineIndex>(fileText, index);
}

/// <summary>
/// Creates the initial, empty rolling hash table for a file.
/// </summary>
/// <remarks>
/// No file I/O is performed: the returned table never depends on the file's contents, so
/// reading the file here would only add cost and extra ways to fail.
/// </remarks>
/// <param name="localPath">Uri.LocalPath for the file. Not used to build the result.</param>
/// <returns>A new, empty dictionary each time the method is called.</returns>
private Dictionary<int, string> BuildRollingHashIndexForFile(string localPath)
{
    return new Dictionary<int, string>();
}

private static void Assert(bool condition)
{
// Placeholder to report issues in a situationally appropriate way.
Expand Down
16 changes: 16 additions & 0 deletions src/Test.UnitTests.Sarif/FileRegionsCacheTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -645,5 +645,21 @@ public void FileRegionsCache_PopulatesWithOneLine_Everything()
multilineRegion.CharLength.Should().Be(content.Length);
multilineRegion.Snippet.Text.Should().Be(content);
}

// Placeholder for API Test: runs the rolling hash entry point over a small in-memory file
// that mixes plain lines with lines indented by spaces and tabs. Nothing is asserted yet,
// so the test only fails if the call throws.
[Fact]
public void SuvamTest()
{
    string content = "a\nb\n \t\tc\n d";

    Uri uri = new Uri(@"c:\myfile.txt");

    IFileSystem mockFileSystem = MockFactory.MakeMockFileSystem(uri.LocalPath, content);

    var fileRegionsCache = new FileRegionsCache(100, mockFileSystem);
    fileRegionsCache.SuvamTest(uri);
}
}
}