Skip to content

Commit

Permalink
Merge pull request #82 from aarondandy/upstream-changes
Browse files Browse the repository at this point in the history
Applies various changes from upstream
  • Loading branch information
aarondandy authored Nov 23, 2023
2 parents 68430db + 7425e63 commit 4e8fc98
Show file tree
Hide file tree
Showing 23 changed files with 165 additions and 84 deletions.
2 changes: 1 addition & 1 deletion WeCantSpell.Hunspell.Tests/HunspellTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ public async Task words_without_suggestions_offer_no_suggestions(string dictiona
[InlineData("files/rep.dic", "un'alunno", new[] { "un alunno" })]
[InlineData("files/rep.dic", "foo", new[] { "bar" })]
[InlineData("files/rep.dic", "vinteún", new[] { "vinte e un" })]
[InlineData("files/rep.dic", "autos", new[] { "auto's", "auto" })]
[InlineData("files/rep.dic", "autos", new[] { "auto's" })]
[InlineData("files/ngram_utf_fix.dic", "человеко", new[] { "человек" })]
[InlineData("files/utf8_nonbmp.dic", "𐏑𐏒𐏒", new[] { "𐏑 𐏒𐏒", "𐏒𐏑", "𐏒𐏒" })]
[InlineData("files/ignoresug.dic", "ինչ", new[] { "ինչ" })]
Expand Down
7 changes: 3 additions & 4 deletions WeCantSpell.Hunspell.Tests/WordListReaderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -845,10 +845,9 @@ public async Task can_read_korean_dic()

var actual = await WordListReader.ReadFileAsync(filePath);

actual.RootWords.Should().HaveCount(2);
actual.RootWords.Should().BeEquivalentTo(new[] {
"들어오세요",
"안녕하세요" });
actual.RootWords.Should().HaveCountGreaterThanOrEqualTo(2);
actual.RootWords.Should().Contain("들어오세요");
actual.RootWords.Should().Contain("안녕하세요");
}

[Fact]
Expand Down
4 changes: 3 additions & 1 deletion WeCantSpell.Hunspell/CandidateStack.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ namespace WeCantSpell.Hunspell;

internal sealed class CandidateStack : List<string>
{
internal const int MaxCandidateStackDepth = 2048;

public CandidateStack() : base(1)
{
// Preallocate with a small capacity as it doesn't often grow very large
Expand All @@ -12,7 +14,7 @@ public CandidateStack() : base(1)
/// <remarks>
/// apply a fairly arbitrary depth limit
/// </remarks>
public bool ExceedsArbitraryDepthLimit => Count > 2048;
public bool ExceedsArbitraryDepthLimit => Count > MaxCandidateStackDepth;

public void Push(string value)
{
Expand Down
2 changes: 2 additions & 0 deletions WeCantSpell.Hunspell/OperationLimiters.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ public OperationTimedLimiter(int timeLimitMs, CancellationToken cancellationToke
private readonly CancellationToken _cancellationToken;
private bool _hasTriggeredCancellation;

/// <summary>
/// Reports whether this limiter has already triggered cancellation or the
/// associated cancellation token has had cancellation requested.
/// </summary>
public bool HasBeenCanceled
{
    get
    {
        return _hasTriggeredCancellation || _cancellationToken.IsCancellationRequested;
    }
}

public bool QueryForCancellation()
{
if (!_hasTriggeredCancellation)
Expand Down
6 changes: 5 additions & 1 deletion WeCantSpell.Hunspell/SpellCheckResultType.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
namespace WeCantSpell.Hunspell;

[Flags]
public enum SpellCheckResultType : byte
public enum SpellCheckResultType : ushort
{
None = 0,
/// <summary>
Expand All @@ -20,4 +20,8 @@ public enum SpellCheckResultType : byte
/// Permit only 2 dictionary words in the compound.
/// </summary>
Compound2 = 1 << 7,
/// <summary>
/// limit suggestions to the best ones, i.e. ph:
/// </summary>
BestSug = 1 << 8,
}
34 changes: 31 additions & 3 deletions WeCantSpell.Hunspell/WordList.Query.cs
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,8 @@ words is null

/// <summary>
/// Runs a compound check using a newly created operation limiter and forwards
/// every argument to the overload that accepts the limiter by reference.
/// </summary>
/// <returns>Whatever the limiter-accepting overload returns for the same arguments.</returns>
public WordEntry? CompoundCheck(ReadOnlySpan<char> word, int wordNum, int numSyllable, int maxwordnum, IncrementalWordList? words, IncrementalWordList rwords, bool huMovRule, bool isSug, ref SpellCheckResultType info)
{
    // Overlapping words can lead to a combinatorial explosion, so the check
    // is given a limiter built from the configured compound-check time limit
    // and the cancellation token.
    var timeLimiter = new OperationTimedLimiter(Options.TimeLimitCompoundCheck, CancellationToken);

    return CompoundCheck(
        word,
        wordNum,
        numSyllable,
        maxwordnum,
        words,
        rwords,
        huMovRule,
        isSug,
        ref info,
        ref timeLimiter);
}
Expand All @@ -479,6 +481,13 @@ words is null
var oldwords = words;
var len = word.Length;

if (wordNum != 0)
{
// Reduce the number of clock checks by querying for cancellation once per method invocation
opLimiter.QueryForCancellation();
}

// setcminmax
var cmin = Affix.CompoundMin;
var cmax = word.Length - cmin + 1;

Expand All @@ -501,7 +510,7 @@ words is null

do // simplified checkcompoundpattern loop
{
if (opLimiter.QueryForCancellation())
if (opLimiter.HasBeenCanceled)
{
return null;
}
Expand Down Expand Up @@ -568,10 +577,29 @@ words is null
{
if (!huMovRule)
{
// forbid dictionary stems with COMPOUNDFORBIDFLAG in
// compound words, overriding the effect of COMPOUNDPERMITFLAG
if (searchEntryDetails[0].ContainsFlag(Affix.CompoundForbidFlag))
{
// forbid dictionary stems with COMPOUNDFORBIDFLAG in
// compound words, overriding the effect of COMPOUNDPERMITFLAG
if (!onlycpdrule && Affix.SimplifiedCompound) // would_continue
{
if (scpd == 0)
{
// given the while conditions that continue jumps to, this situation never ends
// TODO: HUNSPELL_WARNING(stderr, "break infinite loop\n");
break;
}

if (scpd > 0)
{
// under these conditions we loop again, but the assumption above
// appears to be that cmin and cmax are the original values they
// had in the outside loop
cmin = oldcmin;
cmax = oldcmax;
}
}

continue;
}

Expand Down
46 changes: 44 additions & 2 deletions WeCantSpell.Hunspell/WordList.QuerySuggest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -669,6 +669,10 @@ internal bool Suggest(List<string> slst, string word, ref bool onlyCompoundSug)
if (slst.Count > i)
{
state.GoodSuggestion = true;
if (state.Info.HasFlag(SpellCheckResultType.BestSug))
{
goto bestSug;
}
}
}

Expand Down Expand Up @@ -769,6 +773,11 @@ internal bool Suggest(List<string> slst, string word, ref bool onlyCompoundSug)
if (!state.IsCpdSuggest || (!Affix.NoSplitSuggestions && slst.Count < sugLimit))
{
TwoWords(ref state);

if (state.Info.HasFlag(SpellCheckResultType.BestSug))
{
goto bestSug;
}
}

if (opLimiter.QueryForCancellation()) goto timerExit;
Expand Down Expand Up @@ -803,6 +812,10 @@ internal bool Suggest(List<string> slst, string word, ref bool onlyCompoundSug)
timerExit:
goto actualExit;

bestSug:
state.GoodSuggestion = true;
goto actualExit;

actualExit:
state.DestroyBuffer();
return state.GoodSuggestion;
Expand Down Expand Up @@ -1465,17 +1478,23 @@ private void ReplChars(List<string> wlst, string word, ref SuggestState state)
if (replacement[type] is { Length: > 0 } replacementValue)
{
var candidate = StringEx.ConcatString(word.AsSpan(0, r), replacementValue, word.AsSpan(r + replacement.Pattern.Length));
var sp = candidate.IndexOf(' ');

var oldNs = wlst.Count;
TestSug(wlst, candidate, ref state);
if (oldNs < wlst.Count)
{
// REP suggestions are the best, don't search other type of suggestions
state.Info |= SpellCheckResultType.BestSug;
}

// check REP suggestions with space
var sp = candidate.IndexOf(' ');
var prev = 0;
while (sp >= 0)
{
if (CheckWord(candidate.AsSpan(prev, sp - prev), cpdSuggest: 0) != 0)
{
var oldNs = wlst.Count;
oldNs = wlst.Count;
TestSug(wlst, candidate.AsSpan(sp + 1), ref state);
if (oldNs < wlst.Count)
{
Expand Down Expand Up @@ -1512,6 +1531,8 @@ private void NGramSuggest(List<string> wlst, string word, CapitalizationType cap
roots[i] = new(i);
}

var hasRoots = false;
var hasRootsPhon = false;
var lp = roots.Length - 1;
var lpphon = lp;

Expand All @@ -1521,6 +1542,13 @@ private void NGramSuggest(List<string> wlst, string word, CapitalizationType cap
word = word.GetReversed();
}

// ofz#59067 a replist entry can generate a very long word, abandon
// ngram if that odd-edge case arises
if (word.Length > MaxWordLen * 4)
{
return;
}

var hasPhoneEntries = Affix.Phone.HasItems;
var textInfo = TextInfo;
var target = hasPhoneEntries
Expand Down Expand Up @@ -1576,6 +1604,7 @@ private void NGramSuggest(List<string> wlst, string word, CapitalizationType cap
{
roots[lp].Score = sc;
roots[lp].Root = new WordEntry(hpSet.Key, hpDetail);
hasRoots = true;
lval = sc;
for (var j = 0; j < roots.Length; j++)
{
Expand All @@ -1591,6 +1620,7 @@ private void NGramSuggest(List<string> wlst, string word, CapitalizationType cap
{
roots[lpphon].ScorePhone = scphon;
roots[lpphon].RootPhon = hpSet.Key;
hasRootsPhon = true;
lval = scphon;
for (var j = 0; j < roots.Length; j++)
{
Expand All @@ -1604,6 +1634,12 @@ private void NGramSuggest(List<string> wlst, string word, CapitalizationType cap
}
}

if (!hasRoots && !hasRootsPhon)
{
// with no roots there will be no guesses and no point running ngram
return;
}

// find minimum threshold for a passable suggestion
// mangle original word three different ways
// and score them to generate a minimum acceptable score
Expand Down Expand Up @@ -2283,6 +2319,9 @@ private void TwoWords(ref SuggestState state)
candidate[p] = ' ';
if (cpdSuggest == 0 && CheckWord(candidate.TerminatedSpan, cpdSuggest) != 0)
{
// best solution
state.Info |= SpellCheckResultType.BestSug;

// remove not word pair suggestions
if (!good)
{
Expand All @@ -2299,6 +2338,9 @@ private void TwoWords(ref SuggestState state)
candidate[p] = '-';
if (cpdSuggest == 0 && CheckWord(candidate.TerminatedSpan, cpdSuggest) != 0)
{
// best solution
state.Info |= SpellCheckResultType.BestSug;

// remove not word pair suggestions
if (!good)
{
Expand Down
2 changes: 1 addition & 1 deletion WeCantSpell.Hunspell/WordList.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ public sealed partial class WordList
{
internal const int MaxWordLen = 100;
internal const int MaxWordUtf8Len = MaxWordLen * 3;
internal const int RecursiveDepthLimit = 16384;
internal const int RecursiveDepthLimit = 0x3F00;

public static WordList CreateFromStreams(Stream dictionaryStream, Stream affixStream) =>
WordListReader.Read(dictionaryStream, affixStream);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ MaxWarmupIterationCount=5 MinIterationCount=1 MinWarmupIterationCount=1
```
| Method | Mean | Error | StdDev | Min | Max | Median | Ratio |
|--------------------------- |----------:|----------:|----------:|----------:|----------:|----------:|------:|
| &#39;Check words: WeCantSpell&#39; | 18.476 ms | 0.2682 ms | 0.0696 ms | 18.367 ms | 18.554 ms | 18.480 ms | 1.00 |
| &#39;Check words: NHunspell&#39; | 6.097 ms | 0.1177 ms | 0.0420 ms | 6.019 ms | 6.132 ms | 6.111 ms | 0.33 |
| &#39;Check words: WeCantSpell&#39; | 18.338 ms | 0.2788 ms | 0.0432 ms | 18.287 ms | 18.392 ms | 18.337 ms | 1.00 |
| &#39;Check words: NHunspell&#39; | 6.060 ms | 0.1201 ms | 0.0794 ms | 5.973 ms | 6.175 ms | 6.049 ms | 0.33 |
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Method,Job,AnalyzeLaunchVariance,EvaluateOverhead,MaxAbsoluteError,MaxRelativeError,MinInvokeCount,MinIterationTime,OutlierMode,Affinity,EnvironmentVariables,Jit,LargeAddressAware,Platform,PowerPlanMode,Runtime,AllowVeryLargeObjects,Concurrent,CpuGroups,Force,HeapAffinitizeMask,HeapCount,NoAffinitize,RetainVm,Server,Arguments,BuildConfiguration,Clock,EngineFactory,NuGetReferences,Toolchain,IsMutator,InvocationCount,IterationCount,IterationTime,LaunchCount,MaxIterationCount,MaxWarmupIterationCount,MemoryRandomization,MinIterationCount,MinWarmupIterationCount,RunStrategy,UnrollFactor,WarmupCount,Mean,Error,StdDev,Min,Max,Median,Ratio
'Check words: WeCantSpell',Job-TPWOKF,False,Default,Default,Default,1,Default,Default,1111111111111111,Empty,RyuJit,Default,X64,8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c,.NET Framework 4.8,False,True,False,True,Default,Default,False,False,False,Default,Default,Default,Default,Default,Default,Default,Default,Default,1.0000 s,Default,20,5,Default,1,1,Default,16,Default,18.476 ms,0.2682 ms,0.0696 ms,18.367 ms,18.554 ms,18.480 ms,1.00
'Check words: NHunspell',Job-TPWOKF,False,Default,Default,Default,1,Default,Default,1111111111111111,Empty,RyuJit,Default,X64,8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c,.NET Framework 4.8,False,True,False,True,Default,Default,False,False,False,Default,Default,Default,Default,Default,Default,Default,Default,Default,1.0000 s,Default,20,5,Default,1,1,Default,16,Default,6.097 ms,0.1177 ms,0.0420 ms,6.019 ms,6.132 ms,6.111 ms,0.33
'Check words: WeCantSpell',Job-TPWOKF,False,Default,Default,Default,1,Default,Default,1111111111111111,Empty,RyuJit,Default,X64,8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c,.NET Framework 4.8,False,True,False,True,Default,Default,False,False,False,Default,Default,Default,Default,Default,Default,Default,Default,Default,1.0000 s,Default,20,5,Default,1,1,Default,16,Default,18.338 ms,0.2788 ms,0.0432 ms,18.287 ms,18.392 ms,18.337 ms,1.00
'Check words: NHunspell',Job-TPWOKF,False,Default,Default,Default,1,Default,Default,1111111111111111,Empty,RyuJit,Default,X64,8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c,.NET Framework 4.8,False,True,False,True,Default,Default,False,False,False,Default,Default,Default,Default,Default,Default,Default,Default,Default,1.0000 s,Default,20,5,Default,1,1,Default,16,Default,6.060 ms,0.1201 ms,0.0794 ms,5.973 ms,6.175 ms,6.049 ms,0.33
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<html lang='en'>
<head>
<meta charset='utf-8' />
<title>WeCantSpell.Hunspell.Benchmarks.NHunspell.Suites.CheckEnUsSuite-20231123-110714</title>
<title>WeCantSpell.Hunspell.Benchmarks.NHunspell.Suites.CheckEnUsSuite-20231123-122835</title>

<style type="text/css">
table { border-collapse: collapse; display: block; width: 100%; overflow: auto; }
Expand All @@ -25,8 +25,8 @@
<table>
<thead><tr><th>Method </th><th>Mean</th><th>Error</th><th>StdDev</th><th>Min</th><th>Max</th><th>Median</th><th>Ratio</th>
</tr>
</thead><tbody><tr><td>&#39;Check words: WeCantSpell&#39;</td><td>18.476 ms</td><td>0.2682 ms</td><td>0.0696 ms</td><td>18.367 ms</td><td>18.554 ms</td><td>18.480 ms</td><td>1.00</td>
</tr><tr><td>&#39;Check words: NHunspell&#39;</td><td>6.097 ms</td><td>0.1177 ms</td><td>0.0420 ms</td><td>6.019 ms</td><td>6.132 ms</td><td>6.111 ms</td><td>0.33</td>
</thead><tbody><tr><td>&#39;Check words: WeCantSpell&#39;</td><td>18.338 ms</td><td>0.2788 ms</td><td>0.0432 ms</td><td>18.287 ms</td><td>18.392 ms</td><td>18.337 ms</td><td>1.00</td>
</tr><tr><td>&#39;Check words: NHunspell&#39;</td><td>6.060 ms</td><td>0.1201 ms</td><td>0.0794 ms</td><td>5.973 ms</td><td>6.175 ms</td><td>6.049 ms</td><td>0.33</td>
</tr></tbody></table>
</body>
</html>
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ MaxWarmupIterationCount=5 MinIterationCount=1 MinWarmupIterationCount=1
```
| Method | Mean | Error | StdDev | Min | Max | Median | Ratio | RatioSD |
|----------------------------- |-----------:|---------:|---------:|-----------:|-----------:|-----------:|------:|--------:|
| &#39;Suggest words: WeCantSpell&#39; | 751.7 ms | 9.60 ms | 1.49 ms | 750.2 ms | 753.5 ms | 751.5 ms | 1.00 | 0.00 |
| &#39;Suggest words: NHunspell&#39; | 1,896.1 ms | 34.36 ms | 15.25 ms | 1,875.9 ms | 1,908.8 ms | 1,905.4 ms | 2.52 | 0.02 |
| &#39;Suggest words: WeCantSpell&#39; | 759.0 ms | 7.36 ms | 1.14 ms | 758.1 ms | 760.7 ms | 758.7 ms | 1.00 | 0.00 |
| &#39;Suggest words: NHunspell&#39; | 1,895.0 ms | 32.72 ms | 17.11 ms | 1,873.2 ms | 1,911.9 ms | 1,899.5 ms | 2.50 | 0.02 |
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Method,Job,AnalyzeLaunchVariance,EvaluateOverhead,MaxAbsoluteError,MaxRelativeError,MinInvokeCount,MinIterationTime,OutlierMode,Affinity,EnvironmentVariables,Jit,LargeAddressAware,Platform,PowerPlanMode,Runtime,AllowVeryLargeObjects,Concurrent,CpuGroups,Force,HeapAffinitizeMask,HeapCount,NoAffinitize,RetainVm,Server,Arguments,BuildConfiguration,Clock,EngineFactory,NuGetReferences,Toolchain,IsMutator,InvocationCount,IterationCount,IterationTime,LaunchCount,MaxIterationCount,MaxWarmupIterationCount,MemoryRandomization,MinIterationCount,MinWarmupIterationCount,RunStrategy,UnrollFactor,WarmupCount,Mean,Error,StdDev,Min,Max,Median,Ratio,RatioSD
'Suggest words: WeCantSpell',Job-TPWOKF,False,Default,Default,Default,1,Default,Default,1111111111111111,Empty,RyuJit,Default,X64,8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c,.NET Framework 4.8,False,True,False,True,Default,Default,False,False,False,Default,Default,Default,Default,Default,Default,Default,Default,Default,1.0000 s,Default,20,5,Default,1,1,Default,16,Default,751.7 ms,9.60 ms,1.49 ms,750.2 ms,753.5 ms,751.5 ms,1.00,0.00
'Suggest words: NHunspell',Job-TPWOKF,False,Default,Default,Default,1,Default,Default,1111111111111111,Empty,RyuJit,Default,X64,8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c,.NET Framework 4.8,False,True,False,True,Default,Default,False,False,False,Default,Default,Default,Default,Default,Default,Default,Default,Default,1.0000 s,Default,20,5,Default,1,1,Default,16,Default,"1,896.1 ms",34.36 ms,15.25 ms,"1,875.9 ms","1,908.8 ms","1,905.4 ms",2.52,0.02
'Suggest words: WeCantSpell',Job-TPWOKF,False,Default,Default,Default,1,Default,Default,1111111111111111,Empty,RyuJit,Default,X64,8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c,.NET Framework 4.8,False,True,False,True,Default,Default,False,False,False,Default,Default,Default,Default,Default,Default,Default,Default,Default,1.0000 s,Default,20,5,Default,1,1,Default,16,Default,759.0 ms,7.36 ms,1.14 ms,758.1 ms,760.7 ms,758.7 ms,1.00,0.00
'Suggest words: NHunspell',Job-TPWOKF,False,Default,Default,Default,1,Default,Default,1111111111111111,Empty,RyuJit,Default,X64,8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c,.NET Framework 4.8,False,True,False,True,Default,Default,False,False,False,Default,Default,Default,Default,Default,Default,Default,Default,Default,1.0000 s,Default,20,5,Default,1,1,Default,16,Default,"1,895.0 ms",32.72 ms,17.11 ms,"1,873.2 ms","1,911.9 ms","1,899.5 ms",2.50,0.02
Loading

0 comments on commit 4e8fc98

Please sign in to comment.