From f12c87755b2891a8d77e88611c5d1e528b183561 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 7 Jul 2025 15:11:36 -0400 Subject: [PATCH 01/28] One complete chunk of porting --- .../Analysis/QuotationMarkDirection.cs | 8 + .../Corpora/Analysis/QuotationMarkMetadata.cs | 66 ++++++ .../Analysis/QuotationMarkStringMatch.cs | 154 ++++++++++++ .../Analysis/QuotationMarkTabulator.cs | 113 +++++++++ .../Corpora/Analysis/QuoteConvention.cs | 174 ++++++++++++++ .../Corpora/Analysis/QuoteConventionSet.cs | 222 ++++++++++++++++++ .../Corpora/Analysis/TextSegment.cs | 155 ++++++++++++ .../Corpora/Analysis/UsfmMarkerType.cs | 13 + 8 files changed, 905 insertions(+) create mode 100644 src/SIL.Machine/Corpora/Analysis/QuotationMarkDirection.cs create mode 100644 src/SIL.Machine/Corpora/Analysis/QuotationMarkMetadata.cs create mode 100644 src/SIL.Machine/Corpora/Analysis/QuotationMarkStringMatch.cs create mode 100644 src/SIL.Machine/Corpora/Analysis/QuotationMarkTabulator.cs create mode 100644 src/SIL.Machine/Corpora/Analysis/QuoteConvention.cs create mode 100644 src/SIL.Machine/Corpora/Analysis/QuoteConventionSet.cs create mode 100644 src/SIL.Machine/Corpora/Analysis/TextSegment.cs create mode 100644 src/SIL.Machine/Corpora/Analysis/UsfmMarkerType.cs diff --git a/src/SIL.Machine/Corpora/Analysis/QuotationMarkDirection.cs b/src/SIL.Machine/Corpora/Analysis/QuotationMarkDirection.cs new file mode 100644 index 00000000..80dd93ac --- /dev/null +++ b/src/SIL.Machine/Corpora/Analysis/QuotationMarkDirection.cs @@ -0,0 +1,8 @@ +namespace SIL.Machine.Corpora.Analysis +{ + public enum QuotationMarkDirection + { + Opening, + Closing + } +} diff --git a/src/SIL.Machine/Corpora/Analysis/QuotationMarkMetadata.cs b/src/SIL.Machine/Corpora/Analysis/QuotationMarkMetadata.cs new file mode 100644 index 00000000..c1b92f26 --- /dev/null +++ b/src/SIL.Machine/Corpora/Analysis/QuotationMarkMetadata.cs @@ -0,0 +1,66 @@ +namespace SIL.Machine.Corpora.Analysis +{ + public class QuotationMarkMetadata + { 
+ public string QuotationMark { get; } + public int Depth { get; } + public QuotationMarkDirection Direction { get; } + public TextSegment TextSegment { get; } + public int StartIndex { get; } + private int EndIndex { get; } + + public QuotationMarkMetadata( + string quotationMark, + int depth, + QuotationMarkDirection direction, + TextSegment textSegment, + int startIndex, + int endIndex + ) + { + QuotationMark = quotationMark; + Depth = depth; + Direction = direction; + TextSegment = textSegment; + StartIndex = startIndex; + EndIndex = endIndex; + } + + public override bool Equals(object obj) + { + if (!(obj is QuotationMarkMetadata other)) + { + return false; + } + return QuotationMark.Equals(other.QuotationMark) + && Depth.Equals(other.Depth) + && Direction.Equals(other.Direction) + && TextSegment.Equals(other.TextSegment) + && StartIndex.Equals(other.StartIndex) + && EndIndex.Equals(other.EndIndex); + } + + public override int GetHashCode() + { + int hashCode = 23; + hashCode = hashCode * 31 + QuotationMark.GetHashCode(); + hashCode = hashCode * 31 + Depth.GetHashCode(); + hashCode = hashCode * 31 + Direction.GetHashCode(); + hashCode = hashCode * 31 + TextSegment.GetHashCode(); + hashCode = hashCode * 31 + StartIndex.GetHashCode(); + hashCode = hashCode * 31 + EndIndex.GetHashCode(); + return hashCode; + } + + public void UpdateQuotationMark(QuoteConvention quoteConvention) + { + string updatedQuotationMark = quoteConvention.GetExpectedQuotationMark(Depth, Direction); + if (updatedQuotationMark.Equals(QuotationMark)) + { + return; + } + + TextSegment.ReplaceSubstring(StartIndex, EndIndex, updatedQuotationMark); + } + } +} diff --git a/src/SIL.Machine/Corpora/Analysis/QuotationMarkStringMatch.cs b/src/SIL.Machine/Corpora/Analysis/QuotationMarkStringMatch.cs new file mode 100644 index 00000000..445b3fb6 --- /dev/null +++ b/src/SIL.Machine/Corpora/Analysis/QuotationMarkStringMatch.cs @@ -0,0 +1,154 @@ +using System; +using System.Text.RegularExpressions; + 
+namespace SIL.Machine.Corpora.Analysis +{ + public class QuotationMarkStringMatch + { + private static readonly Regex LetterPattern = new Regex(@"[\p{L}\uD838[\uDE00-\uDE8F]]", RegexOptions.Compiled); + private static readonly Regex LatinLetterPattern = new Regex(@"^\p{IsBasicLatin}$", RegexOptions.Compiled); + private static readonly Regex WhitespacePattern = new Regex(@"[\s~]", RegexOptions.Compiled); + private static readonly Regex PunctuationPattern = new Regex(@"[\.,;\?!\)\]\-—۔،؛]", RegexOptions.Compiled); + private static readonly Regex QuoteIntroducerPattern = new Regex(@"[:,]\s*$", RegexOptions.Compiled); + + public TextSegment TextSegment { get; } + public int StartIndex { get; } + public int EndIndex { get; } + + public QuotationMarkStringMatch(TextSegment textSegment, int startIndex, int endIndex) + { + TextSegment = textSegment; + StartIndex = startIndex; + EndIndex = endIndex; + } + + public string QuotationMark => TextSegment.Text.Substring(StartIndex, EndIndex - StartIndex); + + public bool IsValidOpeningQuotationMark(QuoteConventionSet quoteConventionSet) => + quoteConventionSet.IsValidOpeningQuotationMark(QuotationMark); + + public bool IsValidClosingQuotationMark(QuoteConventionSet quoteConventionSet) => + quoteConventionSet.IsValidClosingQuotationMark(QuotationMark); + + public bool QuotationMarkMatches(Regex regexPattern) => regexPattern.Matches(QuotationMark).Count > 0; + + public bool NextCharacterMatches(Regex regexPattern) => + NextCharacter != null && regexPattern.Matches(NextCharacter).Count > 0; + + public bool PreviousCharacterMatches(Regex regexPattern) => + PreviousCharacter != null && regexPattern.Matches(PreviousCharacter).Count > 0; + + public string PreviousCharacter + { + get + { + if (StartIndex == 0) + { + TextSegment previousSegment = TextSegment.PreviousSegment; + if (previousSegment != null && !TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph)) + { + return previousSegment.Text[previousSegment.Text.Length - 
1].ToString(); + } + return null; + } + return TextSegment.Text[StartIndex - 1].ToString(); + } + } + + public string NextCharacter + { + get + { + if (IsAtEndOfSegment) + { + TextSegment nextSegment = TextSegment.NextSegment; + if (nextSegment != null && !TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph)) + { + return nextSegment.Text[0].ToString(); + } + return null; + } + return TextSegment.Text[EndIndex].ToString(); + } + } + + public bool LeadingSubstringMatches(Regex regexPattern) => + regexPattern.Matches(TextSegment.SubstringBefore(StartIndex)).Count > 0; + + public bool TrailingSubstringMatches(Regex regexPattern) => + regexPattern.Matches(TextSegment.SubstringAfter(EndIndex)).Count > 0; + + // this assumes that the two matches occur in the same verse + public bool Precedes(QuotationMarkStringMatch other) + { + return TextSegment.IndexInVerse < other.TextSegment.IndexInVerse + || (TextSegment.IndexInVerse == other.TextSegment.IndexInVerse && StartIndex < other.StartIndex); + } + + // not used, but a useful method for debugging + public string Context() + { + int contextStartIndex = Math.Max(StartIndex - 10, 0); + int contextEndIndex = Math.Min(EndIndex + 10, TextSegment.Length); + return TextSegment.Text.Substring(contextStartIndex, contextEndIndex - contextStartIndex); + } + + public QuotationMarkMetadata Resolve(int depth, QuotationMarkDirection direction) => + new QuotationMarkMetadata(QuotationMark, depth, direction, TextSegment, StartIndex, EndIndex); + + public bool IsAtStartOfSegment => StartIndex == 0; + + public bool IsAtEndOfSegment => EndIndex == TextSegment.Length; + + public bool HasLeadingWhitespace() + { + if (PreviousCharacter == null) + { + return TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph) + || TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Embed) + || TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse); + } + return PreviousCharacterMatches(WhitespacePattern); + } + + public bool 
HasTrailingWhitespace() + { + return NextCharacterMatches(WhitespacePattern); + } + + public bool HasLeadingPunctuation() + { + return PreviousCharacterMatches(PunctuationPattern); + } + + public bool HasTrailingPunctuation() + { + return NextCharacterMatches(PunctuationPattern); + } + + public bool HasLetterInLeadingSubstring() + { + return LeadingSubstringMatches(LetterPattern); + } + + public bool HasLetterInTrailingSubstring() + { + return TrailingSubstringMatches(LetterPattern); + } + + public bool HasLeadingLatinLetter() + { + return PreviousCharacterMatches(LatinLetterPattern); + } + + public bool HasTrailingLatinLetter() + { + return NextCharacterMatches(LatinLetterPattern); + } + + public bool HasQuoteIntroducerInLeadingSubstring() + { + return LeadingSubstringMatches(QuoteIntroducerPattern); + } + } +} diff --git a/src/SIL.Machine/Corpora/Analysis/QuotationMarkTabulator.cs b/src/SIL.Machine/Corpora/Analysis/QuotationMarkTabulator.cs new file mode 100644 index 00000000..073759c4 --- /dev/null +++ b/src/SIL.Machine/Corpora/Analysis/QuotationMarkTabulator.cs @@ -0,0 +1,113 @@ +using System; +using System.Collections.Generic; +using SIL.Extensions; + +namespace SIL.Machine.Corpora.Analysis +{ + public class QuotationMarkCounts + { + private readonly Dictionary _stringCounts; + + public int TotalCount { get; private set; } + + public QuotationMarkCounts() + { + _stringCounts = new Dictionary(); + TotalCount = 0; + } + + public void CountQuotationMark(string quotationMark) + { + if (!_stringCounts.ContainsKey(quotationMark)) + { + _stringCounts[quotationMark] = 0; + } + _stringCounts[quotationMark]++; + TotalCount++; + } + + public (string BestString, int BestStringCount, int TotalStringCount) FindBestQuotationMarkProportion() + { + string bestString = _stringCounts.MaxBy(kvp => kvp.Value).Key; + return (bestString, _stringCounts[bestString], TotalCount); + } + + public int CalculateNumDifferences(string expectedQuotationMark) + { + if 
(!_stringCounts.TryGetValue(expectedQuotationMark, out int count)) + { + return TotalCount; + } + return TotalCount - count; + } + } + + public class QuotationMarkTabulator + { + private readonly Dictionary< + (int Depth, QuotationMarkDirection Direction), + QuotationMarkCounts + > _quotationCountsByDepthAndDirection; + + public QuotationMarkTabulator() + { + _quotationCountsByDepthAndDirection = + new Dictionary<(int Depth, QuotationMarkDirection Direction), QuotationMarkCounts>(); + } + + public void Tabulate(List quotationMarks) + { + foreach (QuotationMarkMetadata quotationMark in quotationMarks) + { + CountQuotationMark(quotationMark); + } + } + + private void CountQuotationMark(QuotationMarkMetadata quote) + { + (int Depth, QuotationMarkDirection Direction) key = (quote.Depth, quote.Direction); + string quotationMark = quote.QuotationMark; + if (!_quotationCountsByDepthAndDirection.ContainsKey(key)) + { + _quotationCountsByDepthAndDirection[key] = new QuotationMarkCounts(); + } + _quotationCountsByDepthAndDirection[key].CountQuotationMark(quotationMark); + } + + // Used in print function + // private bool DepthAndDirectionObserved(int depth, QuotationMarkDirection direction) => + // _quotationCountsByDepthAndDirection.ContainsKey((depth, direction)); + + // private ( + // string BestQuotationMark, + // int BestQuotationMarkCount, + // int TotalQuotationMarkCount + // ) FindMostCommonQuotationMarkWithDepthAndDirection(int depth, QuotationMarkDirection direction) + // { + // return _quotationCountsByDepthAndDirection[(depth, direction)].FindBestQuotationMarkProportion(); + // } + + public double CalculateSimilarity(QuoteConvention quoteConvention) + { + double numDifferences = 0.0; + double numTotalQuotationMarks = 0.0; + foreach ((int depth, QuotationMarkDirection direction) in _quotationCountsByDepthAndDirection.Keys) + { + string expectedQuotationMark = quoteConvention.GetExpectedQuotationMark(depth, direction); + + // give higher weight to shallower depths, 
since deeper marks are more likely to be mistakes + numDifferences += ( + _quotationCountsByDepthAndDirection[(depth, direction)] + .CalculateNumDifferences(expectedQuotationMark) * Math.Pow(2, -depth) + ); + numTotalQuotationMarks += + _quotationCountsByDepthAndDirection[(depth, direction)].TotalCount * Math.Pow(2, -depth); + } + if (numTotalQuotationMarks == 0.0) + { + return 0.0; + } + return 1 - (numDifferences / numTotalQuotationMarks); + } + } +} diff --git a/src/SIL.Machine/Corpora/Analysis/QuoteConvention.cs b/src/SIL.Machine/Corpora/Analysis/QuoteConvention.cs new file mode 100644 index 00000000..68404e4f --- /dev/null +++ b/src/SIL.Machine/Corpora/Analysis/QuoteConvention.cs @@ -0,0 +1,174 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace SIL.Machine.Corpora.Analysis +{ + public class SingleLevelQuoteConvention + { + public static readonly IReadOnlyDictionary QuoteNormalizationMap = new Dictionary() + { + { "\u00ab", '\'' }, + { "\u00bb", '"' }, + { "\u2018", '\'' }, + { "\u2019", '\'' }, + { "\u201a", '\'' }, + { "\u201c", '"' }, + { "\u201d", '"' }, + { "\u201e", '"' }, + { "\u300a", '"' }, + { "\u300b", '"' }, + { "\u300c", '"' }, + { "\u300d", '"' } + }; + public string OpeningQuote { get; } + public string ClosingQuote { get; } + + public SingleLevelQuoteConvention(string openingQuote, string closingQuote) + { + OpeningQuote = openingQuote; + ClosingQuote = closingQuote; + } + + public SingleLevelQuoteConvention Normalize() + { + string normalizedOpeningQuote = QuoteNormalizationMap.TryGetValue(OpeningQuote, out char quote) + ? quote.ToString() + : OpeningQuote; + string normalizedClosingQuote = QuoteNormalizationMap.TryGetValue(ClosingQuote, out quote) + ? 
quote.ToString() + : ClosingQuote; + return new SingleLevelQuoteConvention(normalizedOpeningQuote, normalizedClosingQuote); + } + } + + public class QuoteConvention + { + public string Name { get; } + + public IReadOnlyList Levels { get; } + + public QuoteConvention(string name, List levels) + { + Name = name; + Levels = levels; + } + + public int NumLevels => Levels.Count; + + public string GetOpeningQuoteAtLevel(int level) + { + return Levels[level - 1].OpeningQuote; + } + + public string GetClosingQuoteAtLevel(int level) + { + return Levels[level - 1].ClosingQuote; + } + + public string GetExpectedQuotationMark(int depth, QuotationMarkDirection direction) + { + if (depth > NumLevels || depth < 1) + return ""; + return direction == QuotationMarkDirection.Opening + ? GetOpeningQuoteAtLevel(depth) + : GetClosingQuoteAtLevel(depth); + } + + private bool IncludesOpeningQuotationMark(string openingQuotationMark) + { + foreach (SingleLevelQuoteConvention level in Levels) + { + if (level.OpeningQuote == openingQuotationMark) + return true; + } + return false; + } + + private bool IncludesClosingQuotationMark(string closingQuotationMark) + { + foreach (SingleLevelQuoteConvention level in Levels) + { + if (level.ClosingQuote == closingQuotationMark) + return true; + } + return false; + } + + public HashSet GetPossibleDepths(string quotationMark, QuotationMarkDirection direction) + { + var depths = new HashSet(); + foreach ((int depth, SingleLevelQuoteConvention level) in Levels.Select((l, i) => (i + 1, l))) + { + if (direction == QuotationMarkDirection.Opening && level.OpeningQuote == quotationMark) + depths.Add(depth); + else if (direction == QuotationMarkDirection.Closing && level.ClosingQuote == quotationMark) + depths.Add(depth); + } + return depths; + } + + public bool IsCompatibleWithObservedQuotationMarks( + List openingQuotationMarks, + List closingQuotationMarks + ) + { + foreach (string openingQuotationMark in openingQuotationMarks) + { + if 
(!IncludesOpeningQuotationMark(openingQuotationMark)) + return false; + } + foreach (string closingQuotationMark in closingQuotationMarks) + { + if (!IncludesClosingQuotationMark(closingQuotationMark)) + return false; + } + + // we require the first-level quotes to have been observed + if (!openingQuotationMarks.Contains(GetOpeningQuoteAtLevel(1))) + return false; + if (!closingQuotationMarks.Contains(GetClosingQuoteAtLevel(1))) + return false; + return true; + } + + public QuoteConvention Normalize() + { + return new QuoteConvention(Name + "_normalized", Levels.Select(l => l.Normalize()).ToList()); + } + + public void PrintSummary() + { + Console.WriteLine(GetSummaryMessage()); + } + + private string GetSummaryMessage() + { + var summary = new StringBuilder(Name + "\n"); + foreach ((int level, SingleLevelQuoteConvention convention) in Levels.Select((l, i) => (i, l))) + { + string ordinalName = GetOrdinalName(level + 1); + summary.Append($"{convention.OpeningQuote}{ordinalName}-level quote{convention.ClosingQuote}\n"); + } + return summary.ToString(); + } + + private string GetOrdinalName(int level) + { + switch (level) + { + case 1: + return "First"; + case 2: + return "Second"; + case 3: + return "Third"; + case 4: + return "Fourth"; + default: + return level.ToString() + "th"; + } + } + } +} diff --git a/src/SIL.Machine/Corpora/Analysis/QuoteConventionSet.cs b/src/SIL.Machine/Corpora/Analysis/QuoteConventionSet.cs new file mode 100644 index 00000000..d94bc93b --- /dev/null +++ b/src/SIL.Machine/Corpora/Analysis/QuoteConventionSet.cs @@ -0,0 +1,222 @@ +using System.Collections.Generic; +using System.Linq; +using System.Text.RegularExpressions; +using SIL.Extensions; + +namespace SIL.Machine.Corpora.Analysis +{ + public class QuoteConventionSet + { + public IReadOnlyList Conventions { get; } + + public Regex OpeningQuotationMarkRegex { get; private set; } + public Regex ClosingQuotationMarkRegex { get; private set; } + public Regex AllQuotationMarkRegex { get; 
private set; } + + public IReadOnlyDictionary> ClosingMarksByOpeningMark { get; private set; } + public IReadOnlyDictionary> OpeningMarksByClosingMark { get; private set; } + + public QuoteConventionSet(List conventions) + { + Conventions = conventions; + CreateQuoteRegexes(); + CreateQuotationMarkPairMap(); + } + + public override bool Equals(object obj) + { + if (!(obj is QuoteConventionSet quoteConventionSet)) + return false; + return Conventions.SequenceEqual(quoteConventionSet.Conventions); + } + + public override int GetHashCode() + { + int hashCode = 23; + return hashCode * 31 + Conventions.GetHashCode(); + } + + private void CreateQuoteRegexes() + { + var openingQuotationMarks = new HashSet(); + var closingQuotationMarks = new HashSet(); + var allQuotationMarks = new HashSet(); + + if (Conventions.Count > 0) + { + foreach (QuoteConvention convention in Conventions) + { + for (int level = 1; level < convention.NumLevels + 1; level++) + { + string openingQuote = convention.GetOpeningQuoteAtLevel(level); + string closingQuote = convention.GetClosingQuoteAtLevel(level); + openingQuotationMarks.Add(openingQuote); + closingQuotationMarks.Add(closingQuote); + allQuotationMarks.Add(openingQuote); + allQuotationMarks.Add(closingQuote); + } + } + + if (allQuotationMarks.Count > 0) + { + OpeningQuotationMarkRegex = new Regex( + @"[" + string.Join("", openingQuotationMarks.OrderBy(q => q)) + "]", + RegexOptions.Compiled + ); + ClosingQuotationMarkRegex = new Regex( + @"[" + string.Join("", closingQuotationMarks.OrderBy(q => q)) + "]", + RegexOptions.Compiled + ); + AllQuotationMarkRegex = new Regex( + @"[" + string.Join("", allQuotationMarks.OrderBy(q => q)) + "]", + RegexOptions.Compiled + ); + } + } + + if (openingQuotationMarks.Count == 0) + { + OpeningQuotationMarkRegex = new Regex(@"", RegexOptions.Compiled); + } + if (closingQuotationMarks.Count == 0) + { + ClosingQuotationMarkRegex = new Regex(@"", RegexOptions.Compiled); + } + if (allQuotationMarks.Count == 0) 
+ { + AllQuotationMarkRegex = new Regex(@"", RegexOptions.Compiled); + } + } + + private void CreateQuotationMarkPairMap() + { + var closingMarksByOpeningMark = new Dictionary>(); + var openingMarksByClosingMark = new Dictionary>(); + foreach (QuoteConvention convention in Conventions) + { + for (int level = 1; level < convention.NumLevels + 1; level++) + { + string openingQuote = convention.GetOpeningQuoteAtLevel(level); + string closingQuote = convention.GetClosingQuoteAtLevel(level); + if (!closingMarksByOpeningMark.ContainsKey(openingQuote)) + { + closingMarksByOpeningMark[openingQuote] = new HashSet(); + } + closingMarksByOpeningMark[openingQuote].Add(closingQuote); + if (!openingMarksByClosingMark.ContainsKey(closingQuote)) + { + openingMarksByClosingMark[closingQuote] = new HashSet(); + } + closingMarksByOpeningMark[closingQuote].Add(openingQuote); + } + } + } + + public QuoteConvention GetQuoteConventionByName(string name) + { + foreach (QuoteConvention convention in Conventions) + { + if (convention.Name == name) + { + return convention; + } + } + return null; + } + + public IReadOnlyList GetAllQuoteConventionNames() + { + return Conventions.Select(c => c.Name).OrderBy(c => c).ToList(); + } + + public IReadOnlyList GetPossibleOpeningQuotationMarks() + { + return ClosingMarksByOpeningMark.Keys.OrderBy(k => k).ToList(); + } + + public IReadOnlyList GetPossibleClosingQuotationMarks() + { + return OpeningMarksByClosingMark.Keys.OrderBy(k => k).ToList(); + } + + public bool IsValidOpeningQuotationMark(string quotationMark) + { + return ClosingMarksByOpeningMark.ContainsKey(quotationMark); + } + + public bool IsValidClosingQuotationMark(string quotationMark) + { + return OpeningMarksByClosingMark.ContainsKey(quotationMark); + } + + public bool MarksAreAValidPair(string openingMark, string closingMark) + { + return ClosingMarksByOpeningMark.TryGetValue(openingMark, out HashSet set) + && set.Contains(closingMark); + } + + public HashSet 
GetPossiblePairedQuotationMarks(string quotationMark) + { + var pairedQuotationMarks = new HashSet(); + if (ClosingMarksByOpeningMark.TryGetValue(quotationMark, out HashSet set)) + { + pairedQuotationMarks.AddRange(set); + } + if (OpeningMarksByClosingMark.TryGetValue(quotationMark, out set)) + { + pairedQuotationMarks.AddRange(set); + } + return pairedQuotationMarks; + } + + public HashSet GetPossibleDepths(string quotationMark, QuotationMarkDirection direction) + { + var depths = new HashSet(); + foreach (QuoteConvention convention in Conventions) + { + depths.AddRange(convention.GetPossibleDepths(quotationMark, direction)); + } + return depths; + } + + public bool MetadataMatchesQuotationMark(string quotationMark, int depth, QuotationMarkDirection direction) + { + foreach (QuoteConvention convention in Conventions) + { + if (convention.GetExpectedQuotationMark(depth, direction) == quotationMark) + return true; + } + return false; + } + + public QuoteConventionSet FilterToCompatibleQuoteConventions( + List openingQuotationMarks, + List closingQuotationMarks + ) + { + return new QuoteConventionSet( + Conventions + .Where(c => c.IsCompatibleWithObservedQuotationMarks(openingQuotationMarks, closingQuotationMarks)) + .ToList() + ); + } + + public (QuoteConvention Convention, double Similarity) FindMostSimilarConvention( + QuotationMarkTabulator tabulatedQuotationMarks + ) + { + double bestSimilarity = double.MinValue; + QuoteConvention bestQuoteConvention = null; + foreach (QuoteConvention quoteConvention in Conventions) + { + double similarity = tabulatedQuotationMarks.CalculateSimilarity(quoteConvention); + if (similarity > bestSimilarity) + { + bestSimilarity = similarity; + bestQuoteConvention = quoteConvention; + } + } + return (bestQuoteConvention, bestSimilarity); + } + } +} diff --git a/src/SIL.Machine/Corpora/Analysis/TextSegment.cs b/src/SIL.Machine/Corpora/Analysis/TextSegment.cs new file mode 100644 index 00000000..7aa89a79 --- /dev/null +++ 
b/src/SIL.Machine/Corpora/Analysis/TextSegment.cs @@ -0,0 +1,155 @@ +using System.Collections.Generic; + +namespace SIL.Machine.Corpora.Analysis +{ + public class TextSegment + { + private string _text; + private UsfmMarkerType _immediatePrecedingMarker; + private readonly HashSet _markersInPrecedingContext; + private TextSegment _previousSegment; + private TextSegment _nextSegment; + private int _indexInVerse; + private int _numSegmentsInVerse; + private UsfmToken _usfmToken; + + public TextSegment() + { + _text = ""; + _immediatePrecedingMarker = UsfmMarkerType.NoMarker; + _markersInPrecedingContext = new HashSet(); + _previousSegment = null; + _nextSegment = null; + _indexInVerse = 0; + _numSegmentsInVerse = 0; + _usfmToken = null; + } + + public override bool Equals(object obj) + { + if (!(obj is TextSegment t)) + { + return false; + } + return _text.Equals(t._text) + && _indexInVerse.Equals(t._indexInVerse) + && _numSegmentsInVerse.Equals(t._numSegmentsInVerse) + && _usfmToken.Equals(t._usfmToken) + && _immediatePrecedingMarker.Equals(t._immediatePrecedingMarker); + } + + public override int GetHashCode() + { + int hashCode = 23; + hashCode = hashCode * 31 + _text.GetHashCode(); + hashCode = hashCode * 31 + _indexInVerse.GetHashCode(); + hashCode = hashCode * 31 + _numSegmentsInVerse.GetHashCode(); + hashCode = hashCode * 31 + _usfmToken.GetHashCode(); + return hashCode * 31 + _immediatePrecedingMarker.GetHashCode(); + } + + public string Text => _text; + + public TextSegment PreviousSegment => _previousSegment; + + public TextSegment NextSegment => _nextSegment; + + public int IndexInVerse => _indexInVerse; + + public int Length => _text.Length; + + public string SubstringBefore(int index) + { + return _text.Substring(0, _text.Length - index); + } + + public string SubstringAfter(int index) + { + return _text.Substring(index); + } + + public bool MarkerIsInPrecedingContext(UsfmMarkerType marker) + { + return _markersInPrecedingContext.Contains(marker); + } + 
+ public bool IsFirstSegmentInVerse() + { + return _indexInVerse == 0; + } + + public bool IsLastSegmentInVerse() + { + return _indexInVerse == _numSegmentsInVerse - 1; + } + + public void ReplaceSubstring(int startIndex, int endIndex, string replacement) + { + _text = SubstringBefore(startIndex) + replacement + SubstringAfter(endIndex); + if (_usfmToken != null) + { + _usfmToken.Text = _text; + } + } + + public void SetPreviousSegment(TextSegment previousSegment) + { + _previousSegment = previousSegment; + } + + public void SetNextSegment(TextSegment nextSegment) + { + _nextSegment = nextSegment; + } + + public void SetIndexInVerse(int indexInVerse) + { + _indexInVerse = indexInVerse; + } + + public void SetNumSegmentsInVerse(int numSegmentsInVerse) + { + _numSegmentsInVerse = numSegmentsInVerse; + } + + public class Builder + { + private readonly TextSegment _textSegment; + + public Builder() + { + _textSegment = new TextSegment(); + } + + public Builder SetPreviousSegment(TextSegment previousSegment) + { + _textSegment._previousSegment = previousSegment; + return this; + } + + public Builder AddPrecedingMarker(UsfmMarkerType marker) + { + _textSegment._immediatePrecedingMarker = marker; + _textSegment._markersInPrecedingContext.Add(marker); + return this; + } + + public Builder SetUsfmToken(UsfmToken token) + { + _textSegment._usfmToken = token; + return this; + } + + public Builder SetText(string text) + { + _textSegment._text = text; + return this; + } + + public TextSegment Build() + { + return _textSegment; + } + } + } +} diff --git a/src/SIL.Machine/Corpora/Analysis/UsfmMarkerType.cs b/src/SIL.Machine/Corpora/Analysis/UsfmMarkerType.cs new file mode 100644 index 00000000..9ef2b55a --- /dev/null +++ b/src/SIL.Machine/Corpora/Analysis/UsfmMarkerType.cs @@ -0,0 +1,13 @@ +namespace SIL.Machine.Corpora.Analysis +{ + public enum UsfmMarkerType + { + Paragraph, + Character, + Verse, + Chapter, + Embed, + Other, + NoMarker, + } +} From 
4cd7a59ac962f1ee9b3a50decc2c8969906e7fd2 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 8 Jul 2025 14:50:28 -0400 Subject: [PATCH 02/28] Another complete block of porting --- src/SIL.Machine/Corpora/Analysis/Chapter.cs | 9 + .../Analysis/PreliminaryQuotationAnalyzer.cs | 492 ++++++++++++++++++ .../Corpora/Analysis/QuotationMarkFinder.cs | 54 ++ .../Analysis/QuotationMarkStringMatch.cs | 10 +- .../Corpora/Analysis/QuoteConventionSet.cs | 8 + .../Corpora/Analysis/TextSegment.cs | 2 +- src/SIL.Machine/Corpora/Analysis/Verse.cs | 25 + 7 files changed, 594 insertions(+), 6 deletions(-) create mode 100644 src/SIL.Machine/Corpora/Analysis/Chapter.cs create mode 100644 src/SIL.Machine/Corpora/Analysis/PreliminaryQuotationAnalyzer.cs create mode 100644 src/SIL.Machine/Corpora/Analysis/QuotationMarkFinder.cs create mode 100644 src/SIL.Machine/Corpora/Analysis/Verse.cs diff --git a/src/SIL.Machine/Corpora/Analysis/Chapter.cs b/src/SIL.Machine/Corpora/Analysis/Chapter.cs new file mode 100644 index 00000000..dc12cfa1 --- /dev/null +++ b/src/SIL.Machine/Corpora/Analysis/Chapter.cs @@ -0,0 +1,9 @@ +using System.Collections.Generic; + +namespace SIL.Machine.Corpora.Analysis +{ + public class Chapter + { + public List Verses { get; set; } + } +} diff --git a/src/SIL.Machine/Corpora/Analysis/PreliminaryQuotationAnalyzer.cs b/src/SIL.Machine/Corpora/Analysis/PreliminaryQuotationAnalyzer.cs new file mode 100644 index 00000000..ad9d8537 --- /dev/null +++ b/src/SIL.Machine/Corpora/Analysis/PreliminaryQuotationAnalyzer.cs @@ -0,0 +1,492 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text.RegularExpressions; + +namespace SIL.Machine.Corpora.Analysis +{ + public class ApostropheProportionStatistics + { + private int _numCharacters; + private int _numApostrophes; + + public ApostropheProportionStatistics() + { + Reset(); + } + + public void Reset() + { + _numCharacters = 0; + _numApostrophes = 0; + } + + public void CountCharacters(TextSegment 
textSegment) + { + _numCharacters += textSegment.Length; + } + + public void AddApostrophe() + { + _numApostrophes++; + } + + public bool IsApostropheProportionGreaterThan(double threshold) + { + if (_numCharacters == 0) + return false; + return (_numApostrophes / _numCharacters) > threshold; + } + } + + public class QuotationMarkWordPositions + { + private Dictionary _wordInitialOccurrences; + private Dictionary _midWordOccurrences; + private Dictionary _wordFinalOccurrences; + + public QuotationMarkWordPositions() + { + Reset(); + } + + public void Reset() + { + _wordInitialOccurrences = new Dictionary(); + _midWordOccurrences = new Dictionary(); + _wordFinalOccurrences = new Dictionary(); + } + + public void CountWordInitialApostrophe(string quotationMark) + { + if (!_wordInitialOccurrences.ContainsKey(quotationMark)) + { + _wordInitialOccurrences[quotationMark] = 0; + } + _wordInitialOccurrences[quotationMark]++; + } + + public void CountMidWordApostrophe(string quotationMark) + { + if (!_midWordOccurrences.ContainsKey(quotationMark)) + { + _midWordOccurrences[quotationMark] = 0; + } + _midWordOccurrences[quotationMark]++; + } + + public void CountWordFinalApostrophe(string quotationMark) + { + if (!_wordFinalOccurrences.ContainsKey(quotationMark)) + { + _wordFinalOccurrences[quotationMark] = 0; + } + _wordFinalOccurrences[quotationMark]++; + } + + private int GetWordInitialOccurrences(string quotationMark) + { + return _wordInitialOccurrences.TryGetValue(quotationMark, out int count) ? count : 0; + } + + private int GetMidWordOccurrences(string quotationMark) + { + return _midWordOccurrences.TryGetValue(quotationMark, out int count) ? count : 0; + } + + private int GetWordFinalOccurrences(string quotationMark) + { + return _wordFinalOccurrences.TryGetValue(quotationMark, out int count) ? 
count : 0; + } + + private int GetTotalOccurrences(string quotationMark) + { + return GetWordInitialOccurrences(quotationMark) + + GetMidWordOccurrences(quotationMark) + + GetWordFinalOccurrences(quotationMark); + } + + public bool IsMarkRarelyInitial(string quotationMark) + { + int numInitialMarks = GetWordInitialOccurrences(quotationMark); + int numTotalMarks = GetTotalOccurrences(quotationMark); + return numTotalMarks > 0 && (numInitialMarks / numTotalMarks) < 0.1; + } + + public bool IsMarkRarelyFinal(string quotationMark) + { + int numFinalMarks = GetWordFinalOccurrences(quotationMark); + int numTotalMarks = GetTotalOccurrences(quotationMark); + return numTotalMarks > 0 && (numFinalMarks / numTotalMarks) < 0.1; + } + + public bool AreInitialAndFinalRatesSimilar(string quotationMark) + { + int numInitialMarks = GetWordInitialOccurrences(quotationMark); + int numFinalMarks = GetWordFinalOccurrences(quotationMark); + int numTotalMarks = GetTotalOccurrences(quotationMark); + return numTotalMarks > 0 && (Math.Abs(numInitialMarks - numFinalMarks) / numTotalMarks) < 0.3; + } + + public bool IsMarkCommonlyMidWord(string quotationMark) + { + int numMidWordMarks = GetMidWordOccurrences(quotationMark); + int numTotalMarks = GetTotalOccurrences(quotationMark); + return numTotalMarks > 0 && (numMidWordMarks / numTotalMarks) > 0.3; + } + } + + public class QuotationMarkSequences + { + private Dictionary _earlierQuotationMarkCounts; + private Dictionary _laterQuotationMarkCounts; + + public QuotationMarkSequences() + { + Reset(); + } + + public void Reset() + { + _earlierQuotationMarkCounts = new Dictionary(); + _laterQuotationMarkCounts = new Dictionary(); + } + + public void RecordEarlierQuotationMark(string quotationMark) + { + if (!_earlierQuotationMarkCounts.ContainsKey(quotationMark)) + { + _earlierQuotationMarkCounts[quotationMark] = 0; + } + _earlierQuotationMarkCounts[quotationMark] += 1; + } + + public void RecordLaterQuotationMark(string quotationMark) + { + if 
(!_laterQuotationMarkCounts.ContainsKey(quotationMark)) + { + _laterQuotationMarkCounts[quotationMark] = 0; + } + _laterQuotationMarkCounts[quotationMark] += 1; + } + + private int GetEarlierOccurrences(string quotationMark) + { + return _earlierQuotationMarkCounts.TryGetValue(quotationMark, out int count) ? count : 0; + } + + private int GetLaterOccurrences(string quotationMark) + { + return _laterQuotationMarkCounts.TryGetValue(quotationMark, out int count) ? count : 0; + } + + public bool IsMarkMuchMoreCommonEarlier(string quotationMark) + { + int numEarlyOccurrences = GetEarlierOccurrences(quotationMark); + int numLateOccurrences = GetLaterOccurrences(quotationMark); + return (numLateOccurrences == 0 && numEarlyOccurrences > 5) + || numEarlyOccurrences > numLateOccurrences * 10; + } + + public bool IsMarkMuchMoreCommonLater(string quotationMark) + { + int numEarlyOccurrences = GetEarlierOccurrences(quotationMark); + int numLateOccurrences = GetLaterOccurrences(quotationMark); + return (numEarlyOccurrences == 0 && numLateOccurrences > 5) + || numLateOccurrences > numEarlyOccurrences * 10; + } + + public bool IsMarkCommonEarlyAndLate(string quotationMark) + { + int numEarlyOccurrences = GetEarlierOccurrences(quotationMark); + int numLateOccurrences = GetLaterOccurrences(quotationMark); + return numEarlyOccurrences > 0 + && ((double)Math.Abs(numLateOccurrences - numEarlyOccurrences) / numEarlyOccurrences) < 0.2; // relative difference in floating point; int / int would truncate + } + } + + public class QuotationMarkGrouper + { + private readonly QuoteConventionSet _quoteConventionSet; + private Dictionary> _groupedQuotationMarks; //TODO Different data structure? 
+ + public QuotationMarkGrouper( + List quotationMarks, + QuoteConventionSet quoteConventionSet + ) + { + _quoteConventionSet = quoteConventionSet; + GroupQuotationMarks(quotationMarks); + } + + private void GroupQuotationMarks(List quotationMarks) + { + _groupedQuotationMarks = quotationMarks + .GroupBy(qmm => qmm.QuotationMark) + .ToDictionary(g => g.Key, g => g.ToList()); + } + + public IEnumerable<(string Mark1, string Mark2)> GetQuotationMarkPairs() + { + foreach ( + (string mark1, List matches1) in _groupedQuotationMarks.Select(kvp => + (kvp.Key, kvp.Value) + ) + ) + { + // handle cases of identical opening/closing marks + if ( + matches1.Count == 2 + && _quoteConventionSet.IsQuotationMarkDirectionAmbiguous(mark1) + && !HasDistinctPairedQuotationMark(mark1) + ) + { + yield return (mark1, mark1); + continue; + } + + // skip verses where quotation mark pairs are ambiguous + if (matches1.Count > 1) + continue; + + // find matching closing marks + foreach ( + (string mark2, List matches2) in _groupedQuotationMarks.Select(kvp => + (kvp.Key, kvp.Value) + ) + ) + { + if ( + matches2.Count == 1 + && _quoteConventionSet.MarksAreAValidPair(mark1, mark2) + && matches1[0].Precedes(matches2[0]) + ) + { + yield return (mark1, mark2); + } + } + } + } + + public bool HasDistinctPairedQuotationMark(string quotationMark) + { + return _quoteConventionSet + .GetPossiblePairedQuotationMarks(quotationMark) + .Any(m => m != quotationMark && _groupedQuotationMarks.ContainsKey(m)); + } + } + + public class PreliminaryApostropheAnalyzer + { + private static readonly Regex ApostrophePattern = new Regex(@"[\'\u2019]", RegexOptions.Compiled); + private readonly ApostropheProportionStatistics _apostropheProportionStatistics; + private readonly QuotationMarkWordPositions _wordPositionStatistics; + + public PreliminaryApostropheAnalyzer() + { + _apostropheProportionStatistics = new ApostropheProportionStatistics(); + _wordPositionStatistics = new QuotationMarkWordPositions(); + Reset(); + } 
+ + public void Reset() + { + _apostropheProportionStatistics.Reset(); + _wordPositionStatistics.Reset(); + } + + public void ProcessQuotationMarks(List textSegments, List quotationMarks) + { + foreach (TextSegment textSegment in textSegments) + _apostropheProportionStatistics.CountCharacters(textSegment); + foreach (QuotationMarkStringMatch quotationMarkMatch in quotationMarks) + ProcessQuotationMark(quotationMarkMatch); + } + + private void ProcessQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + if (quotationMarkMatch.QuotationMarkMatches(ApostrophePattern)) + CountApostrophe(quotationMarkMatch); + } + + private void CountApostrophe(QuotationMarkStringMatch apostropheMatch) + { + string apostrophe = apostropheMatch.QuotationMark; + _apostropheProportionStatistics.AddApostrophe(); + if (IsMatchWordInitial(apostropheMatch)) + { + _wordPositionStatistics.CountWordInitialApostrophe(apostrophe); + } + else if (IsMatchMidWord(apostropheMatch)) + { + _wordPositionStatistics.CountMidWordApostrophe(apostrophe); + } + else if (IsMatchWordFinal(apostropheMatch)) + { + _wordPositionStatistics.CountWordFinalApostrophe(apostrophe); + } + } + + private bool IsMatchWordInitial(QuotationMarkStringMatch apostropheMatch) + { + if (apostropheMatch.HasTrailingWhitespace()) + return false; + if (!apostropheMatch.IsAtStartOfSegment && !apostropheMatch.HasLeadingWhitespace()) + return false; + return true; + } + + private bool IsMatchMidWord(QuotationMarkStringMatch apostropheMatch) + { + if (apostropheMatch.HasTrailingWhitespace()) + return false; + if (apostropheMatch.HasLeadingWhitespace()) + return false; + return true; + } + + private bool IsMatchWordFinal(QuotationMarkStringMatch apostropheMatch) + { + if (!apostropheMatch.IsAtEndOfSegment && !apostropheMatch.HasTrailingWhitespace()) + return false; + if (apostropheMatch.HasLeadingWhitespace()) + return false; + return true; + } + + public bool IsApostropheOnly(string mark) + { + if 
(!ApostrophePattern.IsMatch(mark)) + return false; + + if (_wordPositionStatistics.IsMarkRarelyInitial(mark) || _wordPositionStatistics.IsMarkRarelyFinal(mark)) // rarely word-initial or rarely word-final + return true; + + if ( + _wordPositionStatistics.AreInitialAndFinalRatesSimilar(mark) + && _wordPositionStatistics.IsMarkCommonlyMidWord(mark) + ) + { + return true; + } + + if (_apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.02)) + { + return true; + } + + return false; + } + } + + public class PreliminaryQuotationAnalyzer + { + private readonly QuoteConventionSet _quoteConventions; + private readonly PreliminaryApostropheAnalyzer _apostropheAnalyzer; + private readonly QuotationMarkSequences _quotationMarkSequences; + + public PreliminaryQuotationAnalyzer(QuoteConventionSet quoteConventions) + { + _quoteConventions = quoteConventions; //TODO Naming inconsistency + _apostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + _quotationMarkSequences = new QuotationMarkSequences(); + Reset(); + } + + public void Reset() + { + _apostropheAnalyzer.Reset(); + _quotationMarkSequences.Reset(); + } + + public QuoteConventionSet NarrowDownPossibleQuoteConventions(List chapters) + { + foreach (Chapter chapter in chapters) + AnalyzeQuotationMarksForChapter(chapter); + return SelectCompatibleQuoteConventions(); + } + + private void AnalyzeQuotationMarksForChapter(Chapter chapter) + { + foreach (Verse verse in chapter.Verses) + AnalyzeQuotationMarksForVerse(verse); + } + + private void AnalyzeQuotationMarksForVerse(Verse verse) + { + List quotationMarks = new QuotationMarkFinder( + _quoteConventions + ).FindAllPotentialQuotationMarksInVerse(verse); + AnalyzeQuotationMarkSequence(quotationMarks); + _apostropheAnalyzer.ProcessQuotationMarks(verse.TextSegments, quotationMarks); + } + + private void AnalyzeQuotationMarkSequence(List quotationMarks) + { + var quotationMarkGrouper = new QuotationMarkGrouper(quotationMarks, _quoteConventions); + foreach ((string earlierMark, string laterMark) in 
quotationMarkGrouper.GetQuotationMarkPairs()) + { + _quotationMarkSequences.RecordEarlierQuotationMark(earlierMark); + _quotationMarkSequences.RecordLaterQuotationMark(laterMark); + } + } + + public QuoteConventionSet SelectCompatibleQuoteConventions() + { + List openingQuotationMarks = FindOpeningQuotationMarks(); + List closingQuotationMarks = FindClosingQuotationMarks(); + + return _quoteConventions.FilterToCompatibleQuoteConventions(openingQuotationMarks, closingQuotationMarks); + } + + private List FindOpeningQuotationMarks() + { + return _quoteConventions + .GetPossibleOpeningQuotationMarks() + .Where(qm => IsOpeningQuotationMark(qm)) + .ToList(); + } + + private bool IsOpeningQuotationMark(string quotationMark) + { + if (_apostropheAnalyzer.IsApostropheOnly(quotationMark)) + return false; + + if (_quotationMarkSequences.IsMarkMuchMoreCommonEarlier(quotationMark)) + return true; + if ( + _quotationMarkSequences.IsMarkCommonEarlyAndLate(quotationMark) + && _quoteConventions.IsQuotationMarkDirectionAmbiguous(quotationMark) + ) + { + return true; + } + return false; + } + + private List FindClosingQuotationMarks() + { + return _quoteConventions + .GetPossibleClosingQuotationMarks() + .Where(qm => IsClosingQuotationMark(qm)) + .ToList(); + } + + private bool IsClosingQuotationMark(string quotationMark) + { + if (_apostropheAnalyzer.IsApostropheOnly(quotationMark)) + return false; + + if (_quotationMarkSequences.IsMarkMuchMoreCommonLater(quotationMark)) + return true; + if ( + _quotationMarkSequences.IsMarkCommonEarlyAndLate(quotationMark) + && _quoteConventions.IsQuotationMarkDirectionAmbiguous(quotationMark) + ) + { + return true; + } + return false; + } + } +} diff --git a/src/SIL.Machine/Corpora/Analysis/QuotationMarkFinder.cs b/src/SIL.Machine/Corpora/Analysis/QuotationMarkFinder.cs new file mode 100644 index 00000000..e43bb20c --- /dev/null +++ b/src/SIL.Machine/Corpora/Analysis/QuotationMarkFinder.cs @@ -0,0 +1,54 @@ +using System.Collections.Generic; 
+using System.Linq; +using System.Text.RegularExpressions; + +namespace SIL.Machine.Corpora.Analysis +{ + public class QuotationMarkFinder + { + private static readonly Regex QuotePattern = new Regex(@"(\p{Pi}|\p{Pf}|<<|>>|<|>)", RegexOptions.Compiled); + private readonly QuoteConventionSet _quoteConventionSet; + + public QuotationMarkFinder(QuoteConventionSet quoteConventionSet) + { + _quoteConventionSet = quoteConventionSet; + } + + public List FindAllPotentialQuotationMarksInChapter(Chapter chapter) + { + var quotationMatches = new List(); + foreach (Verse verse in chapter.Verses) + quotationMatches.AddRange(FindAllPotentialQuotationMarksInVerse(verse)); + return quotationMatches; + } + + public List FindAllPotentialQuotationMarksInVerse(Verse verse) //TODO excessive? + { + return FindAllPotentialQuotationMarksInTextSegments(verse.TextSegments); + } + + public List FindAllPotentialQuotationMarksInTextSegments( + List textSegments + ) + { + return textSegments.SelectMany(ts => FindAllPotentialQuotationMarksInTextSegment(ts)).ToList(); + } + + public List FindAllPotentialQuotationMarksInTextSegment(TextSegment textSegment) + { + return QuotePattern + .Matches(textSegment.Text) + .Cast() + .Where(match => + _quoteConventionSet.IsValidOpeningQuotationMark(match.Groups[0].Value) + || _quoteConventionSet.IsValidClosingQuotationMark(match.Groups[0].Value) + ) + .Select(m => new QuotationMarkStringMatch( + textSegment, + m.Groups[0].Index, + m.Groups[0].Index + m.Groups[0].Length + )) + .ToList(); + } + } +} diff --git a/src/SIL.Machine/Corpora/Analysis/QuotationMarkStringMatch.cs b/src/SIL.Machine/Corpora/Analysis/QuotationMarkStringMatch.cs index 445b3fb6..eea8376a 100644 --- a/src/SIL.Machine/Corpora/Analysis/QuotationMarkStringMatch.cs +++ b/src/SIL.Machine/Corpora/Analysis/QuotationMarkStringMatch.cs @@ -30,13 +30,13 @@ public bool IsValidOpeningQuotationMark(QuoteConventionSet quoteConventionSet) = public bool IsValidClosingQuotationMark(QuoteConventionSet 
quoteConventionSet) => quoteConventionSet.IsValidClosingQuotationMark(QuotationMark); - public bool QuotationMarkMatches(Regex regexPattern) => regexPattern.Matches(QuotationMark).Count > 0; + public bool QuotationMarkMatches(Regex regexPattern) => regexPattern.IsMatch(QuotationMark); public bool NextCharacterMatches(Regex regexPattern) => - NextCharacter != null && regexPattern.Matches(NextCharacter).Count > 0; + NextCharacter != null && regexPattern.IsMatch(NextCharacter); public bool PreviousCharacterMatches(Regex regexPattern) => - PreviousCharacter != null && regexPattern.Matches(PreviousCharacter).Count > 0; + PreviousCharacter != null && regexPattern.IsMatch(PreviousCharacter); public string PreviousCharacter { @@ -73,10 +73,10 @@ public string NextCharacter } public bool LeadingSubstringMatches(Regex regexPattern) => - regexPattern.Matches(TextSegment.SubstringBefore(StartIndex)).Count > 0; + regexPattern.IsMatch(TextSegment.SubstringBefore(StartIndex)); public bool TrailingSubstringMatches(Regex regexPattern) => - regexPattern.Matches(TextSegment.SubstringAfter(EndIndex)).Count > 0; + regexPattern.IsMatch(TextSegment.SubstringAfter(EndIndex)); // this assumes that the two matches occur in the same verse public bool Precedes(QuotationMarkStringMatch other) diff --git a/src/SIL.Machine/Corpora/Analysis/QuoteConventionSet.cs b/src/SIL.Machine/Corpora/Analysis/QuoteConventionSet.cs index d94bc93b..22521ee1 100644 --- a/src/SIL.Machine/Corpora/Analysis/QuoteConventionSet.cs +++ b/src/SIL.Machine/Corpora/Analysis/QuoteConventionSet.cs @@ -155,6 +155,14 @@ public bool MarksAreAValidPair(string openingMark, string closingMark) && set.Contains(closingMark); } + public bool IsQuotationMarkDirectionAmbiguous(string quotationMark) + { + return ( + ClosingMarksByOpeningMark.TryGetValue(quotationMark, out HashSet closingMarks) + && closingMarks.Contains(quotationMark) + ); + } + public HashSet GetPossiblePairedQuotationMarks(string quotationMark) { var 
pairedQuotationMarks = new HashSet(); diff --git a/src/SIL.Machine/Corpora/Analysis/TextSegment.cs b/src/SIL.Machine/Corpora/Analysis/TextSegment.cs index 7aa89a79..1db66ab3 100644 --- a/src/SIL.Machine/Corpora/Analysis/TextSegment.cs +++ b/src/SIL.Machine/Corpora/Analysis/TextSegment.cs @@ -92,7 +92,7 @@ public void ReplaceSubstring(int startIndex, int endIndex, string replacement) } } - public void SetPreviousSegment(TextSegment previousSegment) + public void SetPreviousSegment(TextSegment previousSegment) //TODO why still here? { _previousSegment = previousSegment; } diff --git a/src/SIL.Machine/Corpora/Analysis/Verse.cs b/src/SIL.Machine/Corpora/Analysis/Verse.cs new file mode 100644 index 00000000..b630faaa --- /dev/null +++ b/src/SIL.Machine/Corpora/Analysis/Verse.cs @@ -0,0 +1,25 @@ +using System.Collections.Generic; +using System.Linq; + +namespace SIL.Machine.Corpora.Analysis +{ + public class Verse + { + public List TextSegments { get; private set; } + + public Verse(List textSegments) + { + TextSegments = textSegments; + IndexTextSegments(); + } + + private void IndexTextSegments() + { + foreach ((int index, TextSegment textSegment) in TextSegments.Select((t, i) => (i, t))) + { + textSegment.SetIndexInVerse(index); + textSegment.SetNumSegmentsInVerse(TextSegments.Count); + } + } + } +} From 4388da42bc3e7c6d50021867e38992a5c58c3255 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 9 Jul 2025 16:34:44 -0400 Subject: [PATCH 03/28] All of analysis folder --- src/SIL.Machine/Corpora/Analysis/Chapter.cs | 5 + .../DepthBasedQuotationMarkResolver.cs | 560 ++++++++++++++++++ .../IQuotationMarkResolutionSettings.cs | 17 + .../Analysis/IQuotationMarkResolver.cs | 11 + .../Analysis/PreliminaryQuotationAnalyzer.cs | 4 +- .../Corpora/Analysis/QuotationMarkFinder.cs | 2 +- .../Analysis/QuotationMarkResolutionIssue.cs | 11 + .../Analysis/StandardQuoteConventions.cs | 221 +++++++ .../Corpora/Analysis/TextSegment.cs | 2 +- .../Analysis/UsfmStructureExtractor.cs | 165 
++++++ 10 files changed, 994 insertions(+), 4 deletions(-) create mode 100644 src/SIL.Machine/Corpora/Analysis/DepthBasedQuotationMarkResolver.cs create mode 100644 src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolutionSettings.cs create mode 100644 src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolver.cs create mode 100644 src/SIL.Machine/Corpora/Analysis/QuotationMarkResolutionIssue.cs create mode 100644 src/SIL.Machine/Corpora/Analysis/StandardQuoteConventions.cs create mode 100644 src/SIL.Machine/Corpora/Analysis/UsfmStructureExtractor.cs diff --git a/src/SIL.Machine/Corpora/Analysis/Chapter.cs b/src/SIL.Machine/Corpora/Analysis/Chapter.cs index dc12cfa1..4056d348 100644 --- a/src/SIL.Machine/Corpora/Analysis/Chapter.cs +++ b/src/SIL.Machine/Corpora/Analysis/Chapter.cs @@ -4,6 +4,11 @@ namespace SIL.Machine.Corpora.Analysis { public class Chapter { + public Chapter(List verses) + { + Verses = verses; + } + public List Verses { get; set; } } } diff --git a/src/SIL.Machine/Corpora/Analysis/DepthBasedQuotationMarkResolver.cs b/src/SIL.Machine/Corpora/Analysis/DepthBasedQuotationMarkResolver.cs new file mode 100644 index 00000000..87e0e26e --- /dev/null +++ b/src/SIL.Machine/Corpora/Analysis/DepthBasedQuotationMarkResolver.cs @@ -0,0 +1,560 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text.RegularExpressions; + +namespace SIL.Machine.Corpora.Analysis +{ + public class QuotationMarkResolverState + { + public Stack Quotations { get; private set; } + private int _currentDepth; + + public QuotationMarkResolverState() + { + Reset(); + } + + public void Reset() + { + Quotations = new Stack(); + _currentDepth = 0; + } + + public int CurrentDepth => _currentDepth + 1; + + public bool HasOpenQuotationMark => _currentDepth > 0; + + public bool AreMoreThanNQuotesOpen(int n) => _currentDepth > n; + + public QuotationMarkMetadata AddOpeningQuotationMark(QuotationMarkStringMatch quoteMatch) + { + QuotationMarkMetadata quote = 
quoteMatch.Resolve(_currentDepth + 1, QuotationMarkDirection.Opening); + Quotations.Push(quote); + _currentDepth++; + return quote; + } + + public QuotationMarkMetadata AddClosingQuotationMark(QuotationMarkStringMatch quoteMatch) + { + QuotationMarkMetadata quote = quoteMatch.Resolve(_currentDepth, QuotationMarkDirection.Closing); + Quotations.Pop(); + _currentDepth--; // closing a mark leaves one nesting level + return quote; + } + + public string GetOpeningQuotationMarkAtDepth(int depth) + { + if (depth > Quotations.Count) + { + throw new InvalidOperationException( + "GetOpeningQuotationMarkAtDepth() was called with a depth greater than the quotation stack size." + ); + } + return Quotations.ToArray()[Quotations.Count - depth].QuotationMark; // ToArray() lists the newest mark first, so depth 1 (outermost) is last + } + + public string GetDeepestOpeningQuotationMark() + { + if (!HasOpenQuotationMark) + { + throw new InvalidOperationException( + "GetDeepestOpeningQuotationMark() was called when the stack of quotation marks was empty." + ); + } + return Quotations.Peek().QuotationMark; + } + } + + public enum QuotationContinuerStyle + { + Undetermined, + English, + Spanish + } + + public class QuotationContinuerState + { + private Stack _quotationContinuers; + public QuotationContinuerStyle ContinuerStyle { get; private set; } + public int CurrentDepth { get; private set; } + + public QuotationContinuerState() + { + Reset(); + } + + public void Reset() + { + _quotationContinuers = new Stack(); + CurrentDepth = 0; + ContinuerStyle = QuotationContinuerStyle.Undetermined; + } + + public bool ContinuerHasBeenObserved() + { + return _quotationContinuers.Count > 0; + } + + public QuotationMarkMetadata AddQuotationContinuer( + QuotationMarkStringMatch quoteMatch, + QuotationMarkResolverState quotationMarkResolverState, + QuotationContinuerStyle quotationContinuerStyle + ) + { + QuotationMarkMetadata quote = quoteMatch.Resolve( + _quotationContinuers.Count + 1, + QuotationMarkDirection.Opening + ); + _quotationContinuers.Push(quote); + CurrentDepth++; + ContinuerStyle = quotationContinuerStyle; 
+ if (_quotationContinuers.Count == quotationMarkResolverState.Quotations.Count) + { + _quotationContinuers.Clear(); + CurrentDepth = 0; + } + return quote; + } + } + + public class QuotationMarkCategorizer + { + private static readonly Regex ApostrophePattern = new Regex(@"[\'\u2019\u2018]", RegexOptions.Compiled); + private readonly IQuotationMarkResolutionSettings _settings; + private readonly QuotationMarkResolverState _quotationMarkResolverState; + private readonly QuotationContinuerState _quotationContinuerState; + + public QuotationMarkCategorizer( + IQuotationMarkResolutionSettings quotationMarkResolutionSettings, + QuotationMarkResolverState quotationMarkResolverState, + QuotationContinuerState quotationContinuerState + ) + { + _settings = quotationMarkResolutionSettings; + _quotationMarkResolverState = quotationMarkResolverState; + _quotationContinuerState = quotationContinuerState; + } + + public bool IsEnglishQuotationContinuer( + QuotationMarkStringMatch quoteMatch, + QuotationMarkStringMatch previousMatch, + QuotationMarkStringMatch nextMatch + ) + { + if (_quotationContinuerState.ContinuerStyle == QuotationContinuerStyle.Spanish) + return false; + if (!MeetsQuoteContinuerPrerequisites(quoteMatch)) + return false; + + if (!_quotationContinuerState.ContinuerHasBeenObserved()) + { + if (quoteMatch.StartIndex > 0) + return false; + if ( + quoteMatch.QuotationMark + != _quotationMarkResolverState.GetOpeningQuotationMarkAtDepth( + _quotationContinuerState.CurrentDepth + 1 + ) + ) + { + return false; + } + if (_quotationMarkResolverState.AreMoreThanNQuotesOpen(1)) + { + if (nextMatch == null || nextMatch.StartIndex != quoteMatch.EndIndex) + return false; + } + } + else + { + if ( + quoteMatch.QuotationMark + != _quotationMarkResolverState.GetOpeningQuotationMarkAtDepth( + _quotationContinuerState.CurrentDepth + 1 + ) + ) + { + return false; + } + } + return true; + } + + public bool IsSpanishQuotationContinuer( + QuotationMarkStringMatch quoteMatch, + 
QuotationMarkStringMatch previousMatch, // same order as IsEnglishQuotationContinuer: previous mark, then next mark + QuotationMarkStringMatch nextMatch + ) + { + if (_quotationContinuerState.ContinuerStyle == QuotationContinuerStyle.English) + return false; + if (!MeetsQuoteContinuerPrerequisites(quoteMatch)) + return false; + + if (!_quotationContinuerState.ContinuerHasBeenObserved()) + { + if (quoteMatch.StartIndex > 0) + return false; + + // this has only been observed with guillemets so far + if (quoteMatch.QuotationMark != "»") + return false; + + if ( + !_settings.AreMarksAValidPair( + _quotationMarkResolverState.GetOpeningQuotationMarkAtDepth( + _quotationContinuerState.CurrentDepth + 1 + ), + quoteMatch.QuotationMark + ) + ) + { + return false; + } + if (_quotationMarkResolverState.AreMoreThanNQuotesOpen(1)) + { + if (nextMatch == null || nextMatch.StartIndex != quoteMatch.EndIndex) + return false; + } + } + else + { + if ( + !_settings.AreMarksAValidPair( + _quotationMarkResolverState.GetOpeningQuotationMarkAtDepth( + _quotationContinuerState.CurrentDepth + 1 + ), + quoteMatch.QuotationMark + ) + ) + { + return false; + } + } + return true; + } + + private bool MeetsQuoteContinuerPrerequisites(QuotationMarkStringMatch quoteMatch) + { + if ( + _settings.ShouldRelyOnParagraphMarkers() + && !quoteMatch.TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph) + ) + { + return false; + } + if (!_quotationMarkResolverState.HasOpenQuotationMark) + return false; + return true; + } + + public bool IsOpeningQuote(QuotationMarkStringMatch match) + { + if (!_settings.IsValidOpeningQuotationMark(match)) + return false; + + // if the quote is ambiguous, use whitespace as clue + if (_settings.IsValidClosingQuotationMark(match)) + { + return ( + match.HasLeadingWhitespace() + || MostRecentOpeningMarkImmediatelyPrecedes(match) + || match.HasQuoteIntroducerInLeadingSubstring() + ) && !(match.HasTrailingWhitespace() || match.HasTrailingPunctuation()); + } + return true; + } + + public bool IsClosingQuote(QuotationMarkStringMatch match) + { 
+ if (!_settings.IsValidClosingQuotationMark(match)) + return false; + + // if the quote is ambiguous, use whitespace as clue + if (_settings.IsValidOpeningQuotationMark(match)) + { + return ( + match.HasTrailingWhitespace() + || match.HasTrailingPunctuation() + || match.IsAtEndOfSegment + || match.NextCharacterMatches(_settings.GetClosingQuotationMarkRegex()) + ) && !match.HasLeadingWhitespace(); + } + return true; + } + + public bool IsMalformedOpeningQuote(QuotationMarkStringMatch match) + { + if (!_settings.IsValidOpeningQuotationMark(match)) + return false; + + if (match.HasQuoteIntroducerInLeadingSubstring()) + return true; + + if ( + match.HasLeadingWhitespace() + && match.HasTrailingWhitespace() + && !_quotationMarkResolverState.HasOpenQuotationMark + ) + { + return true; + } + + return false; + } + + public bool IsMalformedClosingQuote(QuotationMarkStringMatch match) + { + if (!_settings.IsValidClosingQuotationMark(match)) + return false; + + return ( + ( + match.IsAtEndOfSegment + || !match.HasTrailingWhitespace() + || (match.HasLeadingWhitespace() && match.HasTrailingWhitespace()) + ) + && _quotationMarkResolverState.HasOpenQuotationMark + && _settings.AreMarksAValidPair( + _quotationMarkResolverState.GetDeepestOpeningQuotationMark(), + match.QuotationMark + ) + ); + } + + public bool IsUnpairedClosingQuote(QuotationMarkStringMatch match) + { + if (!_settings.IsValidClosingQuotationMark(match)) + return false; + + if (_quotationMarkResolverState.HasOpenQuotationMark) + return false; + + return !match.HasLeadingWhitespace() && (match.IsAtEndOfSegment || match.HasTrailingWhitespace()); + } + + private bool MostRecentOpeningMarkImmediatelyPrecedes(QuotationMarkStringMatch match) + { + if (!_quotationMarkResolverState.HasOpenQuotationMark) + return false; + + return _quotationMarkResolverState.GetDeepestOpeningQuotationMark() == match.PreviousCharacter; + } + + public bool IsApostrophe(QuotationMarkStringMatch match, QuotationMarkStringMatch nextMatch) + { + 
if (!match.QuotationMarkMatches(ApostrophePattern)) + return false; + + // Latin letters on both sides of punctuation mark + if ( + match.PreviousCharacter != null + && match.HasLeadingLatinLetter() + && match.NextCharacter != null + && match.HasTrailingLatinLetter() + ) + { + return true; + } + + // potential final s possessive (e.g. Moses') + if ( + match.PreviousCharacterMatches(new Regex(@"s")) // built per call, so not RegexOptions.Compiled + && (match.HasTrailingWhitespace() || match.HasTrailingPunctuation()) + ) + { + // check whether it could be a closing quote + if (!_quotationMarkResolverState.HasOpenQuotationMark) + return true; + if ( + !_settings.AreMarksAValidPair( + _quotationMarkResolverState.GetDeepestOpeningQuotationMark(), + match.QuotationMark + ) + ) + { + return true; + } + if ( + nextMatch != null + && _settings.AreMarksAValidPair( + _quotationMarkResolverState.GetDeepestOpeningQuotationMark(), + nextMatch.QuotationMark + ) + ) + { + return true; + } + } + + // for languages that use apostrophes at the start and end of words + if ( + !_quotationMarkResolverState.HasOpenQuotationMark && match.QuotationMark == "'" + || _quotationMarkResolverState.HasOpenQuotationMark + && !_settings.AreMarksAValidPair( + _quotationMarkResolverState.GetDeepestOpeningQuotationMark(), + match.QuotationMark + ) + ) + { + return true; + } + + return false; + } + } + + public class DepthBasedQuotationMarkResolver : IQuotationMarkResolver + { + private readonly IQuotationMarkResolutionSettings _settings; + private readonly QuotationMarkResolverState _quotationMarkResolverState; + private readonly QuotationContinuerState _quotationContinuerState; + private readonly QuotationMarkCategorizer _quotationMarkCategorizer; + private readonly HashSet _issues; + + public DepthBasedQuotationMarkResolver(IQuotationMarkResolutionSettings settings) + { + _settings = settings; + _quotationMarkResolverState = new QuotationMarkResolverState(); + _quotationContinuerState = new QuotationContinuerState(); + 
_quotationMarkCategorizer = new QuotationMarkCategorizer( + _settings, + _quotationMarkResolverState, + _quotationContinuerState + ); + _issues = new HashSet(); + } + + public void Reset() + { + _quotationMarkResolverState.Reset(); + _quotationContinuerState.Reset(); + _issues.Clear(); + } + + public IEnumerable ResolveQuotationMarks(List quoteMatches) + { + foreach ((int quoteIndex, QuotationMarkStringMatch quoteMatch) in quoteMatches.Select((q, i) => (i, q))) + { + QuotationMarkStringMatch previousMark = quoteIndex == 0 ? null : quoteMatches[quoteIndex - 1]; + QuotationMarkStringMatch nextMark = + quoteIndex == quoteMatches.Count - 1 ? null : quoteMatches[quoteIndex + 1]; + foreach (QuotationMarkMetadata q in ResolveQuotationMark(quoteMatch, previousMark, nextMark)) + yield return q; + } + if (_quotationMarkResolverState.HasOpenQuotationMark) // a mark still open once every mark has been resolved is unpaired + _issues.Add(QuotationMarkResolutionIssue.UnpairedQuotationMark); + } + + public IEnumerable ResolveQuotationMark( + QuotationMarkStringMatch quoteMatch, + QuotationMarkStringMatch previousMatch, + QuotationMarkStringMatch nextMatch + ) + { + if (_quotationMarkCategorizer.IsOpeningQuote(quoteMatch)) + { + if (_quotationMarkCategorizer.IsEnglishQuotationContinuer(quoteMatch, previousMatch, nextMatch)) + { + yield return ProcessQuotationContinuer(quoteMatch, QuotationContinuerStyle.English); + } + else + { + if (IsDepthTooGreat()) + { + _issues.Add(QuotationMarkResolutionIssue.TooDeepNesting); + yield break; + } + + yield return ProcessOpeningMark(quoteMatch); + } + } + else if (_quotationMarkCategorizer.IsApostrophe(quoteMatch, nextMatch)) { } + else if (_quotationMarkCategorizer.IsClosingQuote(quoteMatch)) + { + if (_quotationMarkCategorizer.IsSpanishQuotationContinuer(quoteMatch, previousMatch, nextMatch)) + { + yield return ProcessQuotationContinuer(quoteMatch, QuotationContinuerStyle.Spanish); + } + else if (!_quotationMarkResolverState.HasOpenQuotationMark) + { + 
_issues.Add(QuotationMarkResolutionIssue.UnpairedQuotationMark); + yield break; + } + else + { + yield return ProcessClosingMark(quoteMatch); + } + } + else if (_quotationMarkCategorizer.IsMalformedClosingQuote(quoteMatch)) + { + yield return ProcessClosingMark(quoteMatch); + } + else if (_quotationMarkCategorizer.IsMalformedOpeningQuote(quoteMatch)) + { + yield return ProcessOpeningMark(quoteMatch); + } + else if (_quotationMarkCategorizer.IsUnpairedClosingQuote(quoteMatch)) + { + _issues.Add(QuotationMarkResolutionIssue.UnpairedQuotationMark); + } + else + { + _issues.Add(QuotationMarkResolutionIssue.AmbiguousQuotationMark); + } + } + + private QuotationMarkMetadata ProcessQuotationContinuer( + QuotationMarkStringMatch quoteMatch, + QuotationContinuerStyle continuerStyle + ) + { + return _quotationContinuerState.AddQuotationContinuer( + quoteMatch, + _quotationMarkResolverState, + continuerStyle + ); + } + + private bool IsDepthTooGreat() + { + return _quotationMarkResolverState.AreMoreThanNQuotesOpen(3); + } + + private QuotationMarkMetadata ProcessOpeningMark(QuotationMarkStringMatch quoteMatch) + { + if ( + !_settings.MetadataMatchesQuotationMark( + quoteMatch.QuotationMark, + _quotationMarkResolverState.CurrentDepth, + QuotationMarkDirection.Opening + ) + ) + { + _issues.Add(QuotationMarkResolutionIssue.IncompatibleQuotationMark); + } + return _quotationMarkResolverState.AddOpeningQuotationMark(quoteMatch); + } + + private QuotationMarkMetadata ProcessClosingMark(QuotationMarkStringMatch quoteMatch) + { + if ( + !_settings.MetadataMatchesQuotationMark( + quoteMatch.QuotationMark, + _quotationMarkResolverState.CurrentDepth - 1, + QuotationMarkDirection.Closing + ) + ) + { + _issues.Add(QuotationMarkResolutionIssue.IncompatibleQuotationMark); + } + return _quotationMarkResolverState.AddClosingQuotationMark(quoteMatch); + } + + public HashSet GetIssues() + { + return _issues; + } + } +} diff --git 
a/src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolutionSettings.cs b/src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolutionSettings.cs new file mode 100644 index 00000000..517c0c84 --- /dev/null +++ b/src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolutionSettings.cs @@ -0,0 +1,17 @@ +using System.Collections.Generic; +using System.Text.RegularExpressions; + +namespace SIL.Machine.Corpora.Analysis +{ + public interface IQuotationMarkResolutionSettings + { + bool IsValidOpeningQuotationMark(QuotationMarkStringMatch quotationMarkMatch); + bool IsValidClosingQuotationMark(QuotationMarkStringMatch quotationMarkMatch); + Regex GetOpeningQuotationMarkRegex(); + Regex GetClosingQuotationMarkRegex(); + bool AreMarksAValidPair(string openingMark, string closingMark); + bool ShouldRelyOnParagraphMarkers(); + HashSet GetPOssibleDepths(string quotationMark, QuotationMarkDirection direction); + bool MetadataMatchesQuotationMark(string quotationMark, int depth, QuotationMarkDirection direction); + } +} diff --git a/src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolver.cs b/src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolver.cs new file mode 100644 index 00000000..1d6ebc1b --- /dev/null +++ b/src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolver.cs @@ -0,0 +1,11 @@ +using System.Collections.Generic; + +namespace SIL.Machine.Corpora.Analysis +{ + public interface IQuotationMarkResolver + { + IEnumerable ResolveQuotationMarks(List quoteMatches); + void Reset(); + HashSet GetIssues(); + } +} diff --git a/src/SIL.Machine/Corpora/Analysis/PreliminaryQuotationAnalyzer.cs b/src/SIL.Machine/Corpora/Analysis/PreliminaryQuotationAnalyzer.cs index ad9d8537..0a7f80eb 100644 --- a/src/SIL.Machine/Corpora/Analysis/PreliminaryQuotationAnalyzer.cs +++ b/src/SIL.Machine/Corpora/Analysis/PreliminaryQuotationAnalyzer.cs @@ -208,7 +208,7 @@ public bool IsMarkCommonEarlyAndLate(string quotationMark) public class QuotationMarkGrouper { private readonly QuoteConventionSet 
_quoteConventionSet; - private Dictionary> _groupedQuotationMarks; //TODO Different data structure? + private Dictionary> _groupedQuotationMarks; public QuotationMarkGrouper( List quotationMarks, @@ -387,7 +387,7 @@ public class PreliminaryQuotationAnalyzer public PreliminaryQuotationAnalyzer(QuoteConventionSet quoteConventions) { - _quoteConventions = quoteConventions; //TODO Naming inconsistency + _quoteConventions = quoteConventions; _apostropheAnalyzer = new PreliminaryApostropheAnalyzer(); _quotationMarkSequences = new QuotationMarkSequences(); Reset(); diff --git a/src/SIL.Machine/Corpora/Analysis/QuotationMarkFinder.cs b/src/SIL.Machine/Corpora/Analysis/QuotationMarkFinder.cs index e43bb20c..6d62510b 100644 --- a/src/SIL.Machine/Corpora/Analysis/QuotationMarkFinder.cs +++ b/src/SIL.Machine/Corpora/Analysis/QuotationMarkFinder.cs @@ -22,7 +22,7 @@ public List FindAllPotentialQuotationMarksInChapter(Ch return quotationMatches; } - public List FindAllPotentialQuotationMarksInVerse(Verse verse) //TODO excessive? 
+ public List FindAllPotentialQuotationMarksInVerse(Verse verse) { return FindAllPotentialQuotationMarksInTextSegments(verse.TextSegments); } diff --git a/src/SIL.Machine/Corpora/Analysis/QuotationMarkResolutionIssue.cs b/src/SIL.Machine/Corpora/Analysis/QuotationMarkResolutionIssue.cs new file mode 100644 index 00000000..4536e2d0 --- /dev/null +++ b/src/SIL.Machine/Corpora/Analysis/QuotationMarkResolutionIssue.cs @@ -0,0 +1,11 @@ +namespace SIL.Machine.Corpora.Analysis +{ + public enum QuotationMarkResolutionIssue + { + UnpairedQuotationMark, + TooDeepNesting, + IncompatibleQuotationMark, + AmbiguousQuotationMark, + UnexpectedQuotationMark + } +} diff --git a/src/SIL.Machine/Corpora/Analysis/StandardQuoteConventions.cs b/src/SIL.Machine/Corpora/Analysis/StandardQuoteConventions.cs new file mode 100644 index 00000000..710ad309 --- /dev/null +++ b/src/SIL.Machine/Corpora/Analysis/StandardQuoteConventions.cs @@ -0,0 +1,221 @@ +using System.Collections.Generic; + +namespace SIL.Machine.Corpora.Analysis +{ + public class StandardQuoteConventions + { + public static QuoteConventionSet QuoteConventionSet = new QuoteConventionSet( + new List + { + new QuoteConvention( + "standard_english", + new List + { + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + } + ), + new QuoteConvention( + "typewriter_english", + new List + { + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + } + ), + new QuoteConvention( + "british_english", + new List + { + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + } + ), + new QuoteConvention( 
+ "british_typewriter_english", + new List + { + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + } + ), + new QuoteConvention( + "hybrid_typewriter_english", + new List + { + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + } + ), + new QuoteConvention( + "standard_french", + new List + { + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + } + ), + new QuoteConvention( + "typewriter_french", + new List + { + new SingleLevelQuoteConvention("<<", ">>"), + new SingleLevelQuoteConvention("<", ">"), + new SingleLevelQuoteConvention("<<", ">>"), + new SingleLevelQuoteConvention("<", ">"), + } + ), + new QuoteConvention( + "french_variant", + new List + { + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + } + ), + new QuoteConvention( + "western_european", + new List + { + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + } + ), + new QuoteConvention( + "british_inspired_western_european", + new List + { + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + } + ), + new QuoteConvention( + "typewriter_western_european", + new List + { + new SingleLevelQuoteConvention("<<", ">>"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + } + ), + new QuoteConvention( + 
"typewriter_western_european_variant", + new List + { + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("<", ">"), + new SingleLevelQuoteConvention("'", "'"), + } + ), + new QuoteConvention( + "hybrid_typewriter_western_european", + new List + { + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + } + ), + new QuoteConvention( + "hybrid_british_typewriter_western_european", + new List + { + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + } + ), + new QuoteConvention( + "central_european", + new List + { + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + } + ), + new QuoteConvention( + "central_european_guillemets", + new List + { + new SingleLevelQuoteConvention("\u00bb", "\u00ab"), + new SingleLevelQuoteConvention("\u203a", "\u2039"), + new SingleLevelQuoteConvention("\u00bb", "\u00ab"), + new SingleLevelQuoteConvention("\u203a", "\u2039"), + } + ), + new QuoteConvention( + "standard_swedish", + new List + { + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + } + ), + new QuoteConvention( + "standard_finnish", + new List + { + new SingleLevelQuoteConvention("\u00bb", "\u00bb"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + } + ), + new QuoteConvention( + "eastern_european", + new List + { + new SingleLevelQuoteConvention("\u201e", "\u201d"), + new SingleLevelQuoteConvention("\u201a", "\u2019"), + new SingleLevelQuoteConvention("\u201e", "\u201d"), + new SingleLevelQuoteConvention("\u201a", "\u2019"), + } + ), + 
new QuoteConvention( + "standard_russian", + new List + { + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + } + ), + new QuoteConvention( + "standard_arabic", + new List + { + new SingleLevelQuoteConvention("\u201d", "\u201c"), + new SingleLevelQuoteConvention("\u2019", "\u2018"), + new SingleLevelQuoteConvention("\u201d", "\u201c"), + new SingleLevelQuoteConvention("\u2019", "\u2018"), + } + ), + new QuoteConvention( + "non-standard_arabic", + new List + { + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2019", "\u2018"), + } + ), + } + ); + } +} diff --git a/src/SIL.Machine/Corpora/Analysis/TextSegment.cs b/src/SIL.Machine/Corpora/Analysis/TextSegment.cs index 1db66ab3..7aa89a79 100644 --- a/src/SIL.Machine/Corpora/Analysis/TextSegment.cs +++ b/src/SIL.Machine/Corpora/Analysis/TextSegment.cs @@ -92,7 +92,7 @@ public void ReplaceSubstring(int startIndex, int endIndex, string replacement) } } - public void SetPreviousSegment(TextSegment previousSegment) //TODO why still here? 
+        public void SetPreviousSegment(TextSegment previousSegment)
         {
             _previousSegment = previousSegment;
         }
diff --git a/src/SIL.Machine/Corpora/Analysis/UsfmStructureExtractor.cs b/src/SIL.Machine/Corpora/Analysis/UsfmStructureExtractor.cs
new file mode 100644
index 00000000..e19ad271
--- /dev/null
+++ b/src/SIL.Machine/Corpora/Analysis/UsfmStructureExtractor.cs
@@ -0,0 +1,178 @@
+using System.Collections.Generic;
+
+namespace SIL.Machine.Corpora.Analysis
+{
+    /// <summary>
+    /// USFM parser handler that turns verse text into <see cref="TextSegment"/>s, recording the
+    /// markers seen before each segment, and can group those segments into chapters and verses.
+    /// </summary>
+    public class UsfmStructureExtractor : IUsfmParserHandler
+    {
+        private readonly List<TextSegment> _textSegments;
+        private TextSegment.Builder _nextTextSegmentBuilder;
+
+        public UsfmStructureExtractor()
+        {
+            _textSegments = new List<TextSegment>();
+            _nextTextSegmentBuilder = new TextSegment.Builder();
+        }
+
+        public void Chapter(UsfmParserState state, string number, string marker, string altNumber, string pubNumber)
+        {
+            // Must be the Chapter marker: GetChapters splits chapters on it.
+            _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Chapter);
+        }
+
+        public void EndBook(UsfmParserState state, string marker) { }
+
+        public void EndCell(UsfmParserState state, string marker) { }
+
+        public void EndChar(UsfmParserState state, string marker, IReadOnlyList<UsfmAttribute> attributes, bool closed)
+        {
+            _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Character);
+        }
+
+        public void EndNote(UsfmParserState state, string marker, bool closed)
+        {
+            _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed);
+        }
+
+        public void EndPara(UsfmParserState state, string marker) { }
+
+        public void EndRow(UsfmParserState state, string marker) { }
+
+        public void EndSidebar(UsfmParserState state, string marker, bool closed)
+        {
+            _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed);
+        }
+
+        public void EndTable(UsfmParserState state)
+        {
+            _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed);
+        }
+
+        public void EndUsfm(UsfmParserState state) { }
+
+        public void GotMarker(UsfmParserState state, string marker) { }
+
+        public void Milestone(
+            UsfmParserState state,
+            string marker,
+            bool startMilestone,
+            IReadOnlyList<UsfmAttribute> attributes
+        ) { }
+
+        public void OptBreak(UsfmParserState state) { }
+
+        public void Ref(UsfmParserState state, string marker, string display, string target)
+        {
+            _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed);
+        }
+
+        public void StartBook(UsfmParserState state, string marker, string code) { }
+
+        public void StartCell(UsfmParserState state, string marker, string align, int colspan) { }
+
+        public void StartChar(
+            UsfmParserState state,
+            string markerWithoutPlus,
+            bool unknown,
+            IReadOnlyList<UsfmAttribute> attributes
+        )
+        {
+            _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Character);
+        }
+
+        public void StartNote(UsfmParserState state, string marker, string caller, string category) { }
+
+        public void StartPara(
+            UsfmParserState state,
+            string marker,
+            bool unknown,
+            IReadOnlyList<UsfmAttribute> attributes
+        )
+        {
+            _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Paragraph);
+        }
+
+        public void StartRow(UsfmParserState state, string marker) { }
+
+        public void StartSidebar(UsfmParserState state, string marker, string category) { }
+
+        public void StartTable(UsfmParserState state) { }
+
+        public void StartUsfm(UsfmParserState state) { }
+
+        public void Text(UsfmParserState state, string text)
+        {
+            if (!state.IsVerseText)
+                return;
+            if (text.Length > 0)
+            {
+                _nextTextSegmentBuilder.SetText(text);
+                TextSegment textSegment = _nextTextSegmentBuilder.Build();
+                // don't look past verse boundaries, to enable identical functionality in the
+                // online one-verse-at-a-time (QuotationDenormalizationScriptureUpdateBlockHandler)
+                // and offline whole-book-at-once settings (QuoteConventionDetector)
+                if (_textSegments.Count > 0 && !textSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse))
+                {
+                    _textSegments[_textSegments.Count - 1].SetNextSegment(textSegment);
+                    textSegment.SetPreviousSegment(_textSegments[_textSegments.Count - 1]);
+                }
+                _textSegments.Add(textSegment);
+            }
+            _nextTextSegmentBuilder = new TextSegment.Builder();
+        }
+
+        public void Unmatched(UsfmParserState state, string marker) { }
+
+        public void Verse(UsfmParserState state, string number, string marker, string altNumber, string pubNumber)
+        {
+            _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Verse);
+        }
+
+        /// <summary>
+        /// Groups the collected text segments into verses and the verses into chapters, splitting
+        /// on the verse and chapter markers recorded in each segment's preceding context.
+        /// </summary>
+        /// <returns>The chapters built from the segments seen so far; empty if there are none.</returns>
+        public List<Chapter> GetChapters()
+        {
+            var chapters = new List<Chapter>();
+            var currentChapterVerses = new List<Verse>();
+            var currentVerseSegments = new List<TextSegment>();
+            foreach (TextSegment textSegment in _textSegments)
+            {
+                if (textSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse))
+                {
+                    if (currentVerseSegments.Count > 0)
+                    {
+                        currentChapterVerses.Add(new Verse(currentVerseSegments));
+                    }
+                    // Start a fresh list rather than clearing the one just handed to the Verse,
+                    // which may keep a reference to it.
+                    currentVerseSegments = new List<TextSegment>();
+                }
+                if (textSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Chapter))
+                {
+                    if (currentChapterVerses.Count > 0)
+                    {
+                        chapters.Add(new Chapter(currentChapterVerses));
+                    }
+                    currentChapterVerses = new List<Verse>();
+                }
+                // Every segment belongs to the verse currently being accumulated.
+                currentVerseSegments.Add(textSegment);
+            }
+            if (currentVerseSegments.Count > 0)
+            {
+                currentChapterVerses.Add(new Verse(currentVerseSegments));
+            }
+            if (currentChapterVerses.Count > 0)
+            {
+                chapters.Add(new Chapter(currentChapterVerses));
+            }
+            return chapters;
+        }
+    }
+}
From d715caac42201737ce44535a19c8a237594511ed Mon Sep 17 00:00:00 2001
From: Enkidu93
Date: Fri, 11 Jul 2025 15:37:18 -0400
Subject: [PATCH 04/28] Another batch

---
 .../IQuotationMarkResolutionSettings.cs | 2 +-
 .../Corpora/Analysis/QuotationMarkMetadata.cs | 2 +-
 .../Corpora/FallbackQuotationMarkResolver.cs | 146 +++++++++++++
 ...onDenormalizationUsfmUpdateBlockHandler.cs | 18 ++
 .../Corpora/QuotationMarkUpdateFirstPass.cs | 97 +++++++++
 .../QuotationMarkUpdateResolutionSettings.cs | 62 ++++++
 .../Corpora/QuotationMarkUpdateSettings.cs | 28 +++
 .../Corpora/QuotationMarkUpdateStrategy.cs | 9 +
 ...onventionChangingUsfmUpdateBlockHandler.cs | 199 ++++++++++++++++++
 9 files changed, 561 insertions(+), 2 deletions(-)
 create mode 100644 src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs
 create mode 100644
src/SIL.Machine/Corpora/QuotationDenormalizationUsfmUpdateBlockHandler.cs create mode 100644 src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs create mode 100644 src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs create mode 100644 src/SIL.Machine/Corpora/QuotationMarkUpdateSettings.cs create mode 100644 src/SIL.Machine/Corpora/QuotationMarkUpdateStrategy.cs create mode 100644 src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs diff --git a/src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolutionSettings.cs b/src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolutionSettings.cs index 517c0c84..c7135aa8 100644 --- a/src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolutionSettings.cs +++ b/src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolutionSettings.cs @@ -11,7 +11,7 @@ public interface IQuotationMarkResolutionSettings Regex GetClosingQuotationMarkRegex(); bool AreMarksAValidPair(string openingMark, string closingMark); bool ShouldRelyOnParagraphMarkers(); - HashSet GetPOssibleDepths(string quotationMark, QuotationMarkDirection direction); + HashSet GetPossibleDepths(string quotationMark, QuotationMarkDirection direction); bool MetadataMatchesQuotationMark(string quotationMark, int depth, QuotationMarkDirection direction); } } diff --git a/src/SIL.Machine/Corpora/Analysis/QuotationMarkMetadata.cs b/src/SIL.Machine/Corpora/Analysis/QuotationMarkMetadata.cs index c1b92f26..8f62f174 100644 --- a/src/SIL.Machine/Corpora/Analysis/QuotationMarkMetadata.cs +++ b/src/SIL.Machine/Corpora/Analysis/QuotationMarkMetadata.cs @@ -7,7 +7,7 @@ public class QuotationMarkMetadata public QuotationMarkDirection Direction { get; } public TextSegment TextSegment { get; } public int StartIndex { get; } - private int EndIndex { get; } + public int EndIndex { get; } public QuotationMarkMetadata( string quotationMark, diff --git a/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs b/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs 
new file mode 100644
index 00000000..ce8afba4
--- /dev/null
+++ b/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs
@@ -0,0 +1,152 @@
+using System.Collections.Generic;
+using System.Linq;
+using SIL.Machine.Corpora.Analysis;
+
+namespace SIL.Machine.Corpora
+{
+    /// <summary>
+    /// Resolves each quotation mark on its own, from the settings and the text around the mark,
+    /// remembering only the most recently resolved mark. Problems found are collected as issues.
+    /// </summary>
+    public class FallbackQuotationMarkResolver : IQuotationMarkResolver
+    {
+        private readonly IQuotationMarkResolutionSettings _settings;
+        private QuotationMarkMetadata _lastQuotationMark;
+        private readonly HashSet<QuotationMarkResolutionIssue> _issues;
+
+        public FallbackQuotationMarkResolver(IQuotationMarkResolutionSettings settings)
+        {
+            _settings = settings;
+            _lastQuotationMark = null;
+            _issues = new HashSet<QuotationMarkResolutionIssue>();
+        }
+
+        public void Reset()
+        {
+            _lastQuotationMark = null;
+            _issues.Clear();
+        }
+
+        public IEnumerable<QuotationMarkMetadata> ResolveQuotationMarks(List<QuotationMarkStringMatch> quoteMatches)
+        {
+            foreach (QuotationMarkStringMatch quoteMatch in quoteMatches)
+            {
+                foreach (QuotationMarkMetadata quotationMarkMetadata in ResolveQuotationMark(quoteMatch))
+                {
+                    yield return quotationMarkMetadata;
+                }
+            }
+        }
+
+        public IEnumerable<QuotationMarkMetadata> ResolveQuotationMark(QuotationMarkStringMatch quoteMatch)
+        {
+            if (IsOpeningQuote(quoteMatch))
+            {
+                QuotationMarkMetadata quote = ResolveOpeningMark(quoteMatch);
+                if (quote != null)
+                {
+                    yield return quote;
+                }
+                else
+                {
+                    _issues.Add(QuotationMarkResolutionIssue.UnexpectedQuotationMark);
+                }
+            }
+            else if (IsClosingQuote(quoteMatch))
+            {
+                QuotationMarkMetadata quote = ResolveClosingMark(quoteMatch);
+                if (quote != null)
+                {
+                    yield return quote;
+                }
+                else
+                {
+                    _issues.Add(QuotationMarkResolutionIssue.UnexpectedQuotationMark);
+                }
+            }
+            else
+            {
+                _issues.Add(QuotationMarkResolutionIssue.AmbiguousQuotationMark);
+            }
+        }
+
+        private bool IsOpeningQuote(QuotationMarkStringMatch match)
+        {
+            // A mark valid in both directions is disambiguated by the text around it.
+            if (_settings.IsValidOpeningQuotationMark(match) && _settings.IsValidClosingQuotationMark(match))
+            {
+                return (
+                        match.IsAtStartOfSegment
+                        || match.HasLeadingWhitespace()
+                        || DoesMostRecentOpeningMarkImmediatelyPrecede(match)
+                        || match.HasQuoteIntroducerInLeadingSubstring()
+                    ) && !(match.HasTrailingWhitespace() || match.HasTrailingPunctuation());
+            }
+            else if (_settings.IsValidOpeningQuotationMark(match))
+            {
+                return true;
+            }
+
+            return false;
+        }
+
+        private bool DoesMostRecentOpeningMarkImmediatelyPrecede(QuotationMarkStringMatch match)
+        {
+            if (_lastQuotationMark == null || _lastQuotationMark.Direction != QuotationMarkDirection.Opening)
+            {
+                return false;
+            }
+            return _lastQuotationMark.TextSegment == match.TextSegment
+                && _lastQuotationMark.EndIndex == match.StartIndex;
+        }
+
+        private bool IsClosingQuote(QuotationMarkStringMatch match)
+        {
+            // Context is consulted only for a mark valid in both directions; a closing-only mark is accepted below.
+            if (_settings.IsValidOpeningQuotationMark(match) && _settings.IsValidClosingQuotationMark(match))
+            {
+                return (match.HasTrailingWhitespace() || match.HasTrailingPunctuation() || match.IsAtEndOfSegment)
+                    && !match.HasLeadingWhitespace();
+            }
+            else if (_settings.IsValidClosingQuotationMark(match))
+            {
+                return true;
+            }
+
+            return false;
+        }
+
+        private QuotationMarkMetadata ResolveOpeningMark(QuotationMarkStringMatch quoteMatch)
+        {
+            HashSet<int> possibleDepths = _settings.GetPossibleDepths(
+                quoteMatch.QuotationMark,
+                QuotationMarkDirection.Opening
+            );
+            if (possibleDepths.Count == 0)
+                return null;
+
+            QuotationMarkMetadata quote = quoteMatch.Resolve(possibleDepths.Min(), QuotationMarkDirection.Opening);
+            _lastQuotationMark = quote;
+            return quote;
+        }
+
+        private QuotationMarkMetadata ResolveClosingMark(QuotationMarkStringMatch quoteMatch)
+        {
+            HashSet<int> possibleDepths = _settings.GetPossibleDepths(
+                quoteMatch.QuotationMark,
+                QuotationMarkDirection.Closing
+            );
+            if (possibleDepths.Count == 0)
+                return null;
+
+            QuotationMarkMetadata quote = quoteMatch.Resolve(possibleDepths.Min(), QuotationMarkDirection.Closing);
+            _lastQuotationMark = quote;
+            return quote;
+        }
+
+        public HashSet<QuotationMarkResolutionIssue> GetIssues()
+        {
+            return _issues;
+        }
+    }
+}
diff --git a/src/SIL.Machine/Corpora/QuotationDenormalizationUsfmUpdateBlockHandler.cs
b/src/SIL.Machine/Corpora/QuotationDenormalizationUsfmUpdateBlockHandler.cs new file mode 100644 index 00000000..4b6114fa --- /dev/null +++ b/src/SIL.Machine/Corpora/QuotationDenormalizationUsfmUpdateBlockHandler.cs @@ -0,0 +1,18 @@ +using SIL.Machine.Corpora.Analysis; + +namespace SIL.Machine.Corpora +{ + public class QuotationDenormalizationUsfmUpdateBlockHandler : QuoteConventionChangingUsfmUpdateBlockHandler + { + public QuotationDenormalizationUsfmUpdateBlockHandler( + QuoteConvention sourceQuoteConvention, + QuoteConvention targetQuoteConvention, + QuotationMarkUpdateSettings settings = null + ) + { + if (settings == null) + settings = new QuotationMarkUpdateSettings(); //TODO pass conventions? + base(sourceQuoteConvention.Normalize(), targetQuoteConvention, settings); + } + } +} diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs new file mode 100644 index 00000000..034024be --- /dev/null +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs @@ -0,0 +1,97 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Corpora.Analysis; + +namespace SIL.Machine.Corpora +{ + // Determines the best strategy to take for each chapter + public class QuotationMarkUpdateFirstPass : UsfmStructureExtractor + { + private readonly QuoteConvention _sourceQuoteConvention; + private readonly QuoteConvention _targetQuoteConvention; + private readonly QuotationMarkFinder _quotationMarkFinder; + private readonly DepthBasedQuotationMarkResolver _quotationMarkResolver; + private readonly bool _willFallbackModeWork; + + public QuotationMarkUpdateFirstPass( + QuoteConvention sourceQuoteConvention, + QuoteConvention targetQuoteConvention + ) + { + _sourceQuoteConvention = sourceQuoteConvention; + _targetQuoteConvention = targetQuoteConvention; + _quotationMarkFinder = new QuotationMarkFinder( + new QuoteConventionSet(new List { sourceQuoteConvention, targetQuoteConvention }) + ); + 
_quotationMarkResolver = new DepthBasedQuotationMarkResolver( + new QuotationMarkUpdateResolutionSettings(sourceQuoteConvention, targetQuoteConvention) + ); + _willFallbackModeWork = CheckWhetherFallbackModeWillWork(sourceQuoteConvention, targetQuoteConvention); + } + + private bool CheckWhetherFallbackModeWillWork( + QuoteConvention sourceQuoteConvention, + QuoteConvention targetQuoteConvention + ) + { + var targetMarksBySourceMarks = new Dictionary>(); + foreach (int level in Enumerable.Range(1, sourceQuoteConvention.NumLevels)) //TODO level vs depth + { + string openingQuotationMark = sourceQuoteConvention.GetOpeningQuoteAtLevel(level); + if (!targetMarksBySourceMarks.TryGetValue(openingQuotationMark, out HashSet marks)) + { + marks = new HashSet(); + targetMarksBySourceMarks[openingQuotationMark] = marks; + } + if (level <= targetQuoteConvention.NumLevels) + { + marks.Add(targetQuoteConvention.GetClosingQuoteAtLevel(level)); + } + } + + return !targetMarksBySourceMarks.Keys.Any(sourceMark => targetMarksBySourceMarks[sourceMark].Count > 1); + } + + public List FindBestChapterStrategies() + { + var bestActionsByChapter = new List(); + foreach (Chapter chapter in GetChapters()) + { + bestActionsByChapter.Add(FindBestStrategyForChapter(chapter)); + } + return bestActionsByChapter; + } + + private QuotationMarkUpdateStrategy FindBestStrategyForChapter(Chapter chapter) + { + List quotationMarkMatches = + _quotationMarkFinder.FindAllPotentialQuotationMarksInChapter(chapter); + + _quotationMarkResolver.Reset(); + + // use ToList() to force evaluation of the generator + _quotationMarkResolver.ResolveQuotationMarks(quotationMarkMatches).ToList(); + + return ChooseBestStrategyBasedOnObservedIssues(_quotationMarkResolver.GetIssues()); + } + + private QuotationMarkUpdateStrategy ChooseBestStrategyBasedOnObservedIssues( + HashSet issues + ) //TODO type hinting + { + if (issues.Contains(QuotationMarkResolutionIssue.AmbiguousQuotationMark)) + return 
QuotationMarkUpdateStrategy.Skip; + + if ( + issues.Contains(QuotationMarkResolutionIssue.UnpairedQuotationMark) + || issues.Contains(QuotationMarkResolutionIssue.TooDeepNesting) + ) + { + if (_willFallbackModeWork) + return QuotationMarkUpdateStrategy.ApplyFallback; + return QuotationMarkUpdateStrategy.Skip; + } + return QuotationMarkUpdateStrategy.ApplyFull; + } + } +} diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs new file mode 100644 index 00000000..0973bcd8 --- /dev/null +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs @@ -0,0 +1,62 @@ +using System.Collections.Generic; +using System.Text.RegularExpressions; + +namespace SIL.Machine.Corpora.Analysis +{ + public class QuotationMarkUpdateResolutionSettings : IQuotationMarkResolutionSettings + { + private readonly QuoteConvention _sourceQuoteConvention; + private readonly QuoteConventionSet _quoteConventionSingletonSet; + private readonly QuoteConvention _targetQuoteConvention; + + public QuotationMarkUpdateResolutionSettings( + QuoteConvention sourceQuoteConvention, + QuoteConvention targetQuoteConvention + ) + { + _sourceQuoteConvention = sourceQuoteConvention; + _quoteConventionSingletonSet = new QuoteConventionSet(new List { sourceQuoteConvention }); //TODO also seems unnecessary to have both. 
+ _targetQuoteConvention = targetQuoteConvention; //TODO unused + } + + public bool AreMarksAValidPair(string openingMark, string closingMark) + { + return _quoteConventionSingletonSet.MarksAreAValidPair(openingMark, closingMark); + } + + public Regex GetClosingQuotationMarkRegex() + { + return _quoteConventionSingletonSet.ClosingQuotationMarkRegex; + } + + public Regex GetOpeningQuotationMarkRegex() + { + return _quoteConventionSingletonSet.OpeningQuotationMarkRegex; + } + + public HashSet GetPossibleDepths(string quotationMark, QuotationMarkDirection direction) + { + return _sourceQuoteConvention.GetPossibleDepths(quotationMark, direction); + } + + public bool IsValidClosingQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + return quotationMarkMatch.IsValidClosingQuotationMark(_quoteConventionSingletonSet); + } + + public bool IsValidOpeningQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + return quotationMarkMatch.IsValidOpeningQuotationMark(_quoteConventionSingletonSet); + } + + public bool MetadataMatchesQuotationMark(string quotationMark, int depth, QuotationMarkDirection direction) + { + return _sourceQuoteConvention.GetExpectedQuotationMark(depth, direction) == quotationMark; + } + + public bool ShouldRelyOnParagraphMarkers() + { + return false; + } + } +} diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateSettings.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateSettings.cs new file mode 100644 index 00000000..3394850f --- /dev/null +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateSettings.cs @@ -0,0 +1,28 @@ +using System.Collections.Generic; + +namespace SIL.Machine.Corpora +{ + public class QuotationMarkUpdateSettings + { + private readonly QuotationMarkUpdateStrategy _defaultChapterAction; + private readonly List _chapterActions; + + public QuotationMarkUpdateSettings( + QuotationMarkUpdateStrategy defaultChapterAction = QuotationMarkUpdateStrategy.ApplyFull, + List chapterActions = null + ) + { + _defaultChapterAction 
= defaultChapterAction;
+            _chapterActions = chapterActions ?? new List<QuotationMarkUpdateStrategy>();
+        }
+
+        public QuotationMarkUpdateStrategy GetActionForChapter(int chapterNumber)
+        {
+            if (chapterNumber >= 1 && chapterNumber <= _chapterActions.Count) // 1-based; below 1 would index before the list
+            {
+                return _chapterActions[chapterNumber - 1];
+            }
+            return _defaultChapterAction;
+        }
+    }
+}
diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateStrategy.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateStrategy.cs
new file mode 100644
index 00000000..e6ae10b0
--- /dev/null
+++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateStrategy.cs
@@ -0,0 +1,9 @@
+namespace SIL.Machine.Corpora
+{
+    public enum QuotationMarkUpdateStrategy
+    {
+        ApplyFull,
+        ApplyFallback,
+        Skip
+    }
+}
diff --git a/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs
new file mode 100644
index 00000000..d781c98d
--- /dev/null
+++ b/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs
@@ -0,0 +1,204 @@
+using System.Collections.Generic;
+using SIL.Machine.Corpora.Analysis;
+
+namespace SIL.Machine.Corpora
+{
+    /// <summary>
+    /// Update-block handler that finds quotation marks of the source convention in each block and
+    /// updates them to the target convention, using the per-chapter strategy from the settings.
+    /// </summary>
+    public class QuoteConventionChangingUsfmUpdateBlockHandler : IUsfmUpdateBlockHandler
+    {
+        private readonly QuoteConvention _sourceQuoteConvention;
+        private readonly QuoteConvention _targetQuoteConvention;
+        private readonly QuotationMarkUpdateSettings _settings;
+        private readonly QuotationMarkFinder _quotationMarkFinder;
+        private TextSegment.Builder _nextScriptureTextSegmentBuilder;
+        private readonly IQuotationMarkResolver _verseTextQuotationMarkResolver;
+        private readonly IQuotationMarkResolver _embedQuotationMarkResolver;
+        private readonly IQuotationMarkResolver _simpleQuotationMarkResolver;
+        private QuotationMarkUpdateStrategy _currentStrategy;
+        private int _currentChapterNumber;
+        private int _currentVerseNumber;
+
+        public QuoteConventionChangingUsfmUpdateBlockHandler(
+            QuoteConvention sourceQuoteConvention,
+            QuoteConvention targetQuoteConvention,
+            QuotationMarkUpdateSettings settings
+        )
+        {
+            _sourceQuoteConvention = sourceQuoteConvention;
+            _targetQuoteConvention = targetQuoteConvention;
+            _settings = settings;
+
+            _quotationMarkFinder = new QuotationMarkFinder(
+                new QuoteConventionSet(new List<QuoteConvention> { _sourceQuoteConvention })
+            );
+
+            _nextScriptureTextSegmentBuilder = new TextSegment.Builder();
+
+            IQuotationMarkResolutionSettings resolutionSettings = new QuotationMarkUpdateResolutionSettings(
+                sourceQuoteConvention,
+                targetQuoteConvention
+            );
+
+            // Each embed represents a separate context for quotation marks
+            // (i.e. you can't open a quote in one context and close it in another)
+            // so we need to keep track of the verse and embed contexts separately.
+            _verseTextQuotationMarkResolver = new DepthBasedQuotationMarkResolver(resolutionSettings);
+            _embedQuotationMarkResolver = new DepthBasedQuotationMarkResolver(resolutionSettings);
+            _simpleQuotationMarkResolver = new FallbackQuotationMarkResolver(resolutionSettings);
+
+            _currentStrategy = QuotationMarkUpdateStrategy.ApplyFull;
+            _currentChapterNumber = 0;
+            _currentVerseNumber = 0;
+        }
+
+        public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block)
+        {
+            CheckForChapterChange(block);
+            CheckForVerseChange(block);
+            if (_currentStrategy == QuotationMarkUpdateStrategy.Skip)
+                return block;
+            if (_currentStrategy == QuotationMarkUpdateStrategy.ApplyFallback)
+            {
+                return ApplyFallbackUpdating(block);
+            }
+            return ApplyStandardUpdating(block);
+        }
+
+        private UsfmUpdateBlock ApplyFallbackUpdating(UsfmUpdateBlock block)
+        {
+            foreach (UsfmUpdateBlockElement element in block.Elements) //TODO use Elements not _elements
+                ProcessScriptureElement(element, _simpleQuotationMarkResolver);
+            return block;
+        }
+
+        private UsfmUpdateBlock ApplyStandardUpdating(UsfmUpdateBlock block)
+        {
+            foreach (UsfmUpdateBlockElement element in block.Elements) //TODO same
+            {
+                if (element.Type == UsfmUpdateBlockElementType.Embed)
+                {
+                    _embedQuotationMarkResolver.Reset();
+                    ProcessScriptureElement(element, _embedQuotationMarkResolver);
+                }
+                else
+                {
+                    ProcessScriptureElement(element, _verseTextQuotationMarkResolver);
+                }
+            }
+            return block;
+        }
+
+        private void ProcessScriptureElement(
+            UsfmUpdateBlockElement element,
+            IQuotationMarkResolver quotationMarkResolver
+        )
+        {
+            List<TextSegment> textSegments = CreateTextSegments(element);
+            List<QuotationMarkStringMatch> quotationMarkMatches =
+                _quotationMarkFinder.FindAllPotentialQuotationMarksInTextSegments(textSegments);
+            foreach (
+                QuotationMarkMetadata resolvedQuotationMark in quotationMarkResolver.ResolveQuotationMarks(
+                    quotationMarkMatches
+                )
+            )
+            {
+                resolvedQuotationMark.UpdateQuotationMark(_targetQuoteConvention);
+            }
+        }
+
+        private List<TextSegment> CreateTextSegments(UsfmUpdateBlockElement element)
+        {
+            var textSegments = new List<TextSegment>();
+            foreach (UsfmToken token in element.GetTokens())
+            {
+                switch (token.Type)
+                {
+                    case UsfmTokenType.Verse:
+                        _nextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Verse);
+                        break;
+                    case UsfmTokenType.Paragraph:
+                        _nextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Paragraph);
+                        break;
+                    case UsfmTokenType.Character:
+                        _nextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Character);
+                        break;
+                    case UsfmTokenType.Note:
+                        _nextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed);
+                        break;
+                    case UsfmTokenType.Text:
+                        TextSegment textSegment = CreateTextSegment(token);
+                        if (textSegment != null)
+                            textSegments.Add(textSegment);
+                        break;
+                }
+            }
+            return SetPreviousAndNextForSegments(textSegments);
+        }
+
+        private TextSegment CreateTextSegment(UsfmToken token)
+        {
+            TextSegment textSegmentToReturn = null; //TODO cleaner
+            _nextScriptureTextSegmentBuilder.SetUsfmToken(token);
+            if (token.Text != null)
+            {
+                _nextScriptureTextSegmentBuilder.SetText(token.Text);
+                textSegmentToReturn = _nextScriptureTextSegmentBuilder.Build();
+            }
+            _nextScriptureTextSegmentBuilder = new TextSegment.Builder();
+            return textSegmentToReturn;
+        }
+
+        private List<TextSegment> SetPreviousAndNextForSegments(List<TextSegment> textSegments)
+        {
+            for (int i = 0; i < textSegments.Count; i++)
+            {
+                if (i > 0)
+                    textSegments[i].SetPreviousSegment(textSegments[i - 1]);
+                if (i < textSegments.Count - 1)
+                    textSegments[i].SetNextSegment(textSegments[i + 1]);
+            }
+            return textSegments;
+        }
+
+        private void CheckForChapterChange(UsfmUpdateBlock block)
+        {
+            foreach (ScriptureRef scriptureRef in block.Refs)
+            {
+                if (scriptureRef.ChapterNum != _currentChapterNumber)
+                {
+                    _currentChapterNumber = scriptureRef.ChapterNum;
+                    StartNewChapter(_currentChapterNumber); //TODO pass field in method?
+                }
+            }
+        }
+
+        private void StartNewChapter(int newChapterNum)
+        {
+            _currentStrategy = _settings.GetActionForChapter(newChapterNum);
+            _verseTextQuotationMarkResolver.Reset();
+            _nextScriptureTextSegmentBuilder = new TextSegment.Builder();
+            // A new chapter is recorded with the Chapter marker, as StartNewVerse records Verse.
+            _nextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Chapter);
+        }
+
+        private void CheckForVerseChange(UsfmUpdateBlock block)
+        {
+            foreach (ScriptureRef scriptureRef in block.Refs)
+            {
+                if (scriptureRef.ChapterNum == _currentChapterNumber && scriptureRef.VerseNum != _currentVerseNumber)
+                {
+                    _currentVerseNumber = scriptureRef.VerseNum;
+                    StartNewVerse(); //TODO same + unused
+                }
+            }
+        }
+
+        private void StartNewVerse()
+        {
+            _nextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Verse);
+        }
+    }
+}
From 8465042ae9a4ad92a6e1a5fbbfd159639f1915b3 Mon Sep 17 00:00:00 2001
From: Enkidu93
Date: Mon, 14 Jul 2025 09:03:34 -0400
Subject: [PATCH 05/28] Fix constructor

---
 .../QuotationDenormalizationUsfmUpdateBlockHandler.cs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/SIL.Machine/Corpora/QuotationDenormalizationUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/QuotationDenormalizationUsfmUpdateBlockHandler.cs
index 4b6114fa..1b3cb5ff 100644
--- a/src/SIL.Machine/Corpora/QuotationDenormalizationUsfmUpdateBlockHandler.cs
+++
b/src/SIL.Machine/Corpora/QuotationDenormalizationUsfmUpdateBlockHandler.cs @@ -9,10 +9,10 @@ public QuotationDenormalizationUsfmUpdateBlockHandler( QuoteConvention targetQuoteConvention, QuotationMarkUpdateSettings settings = null ) - { - if (settings == null) - settings = new QuotationMarkUpdateSettings(); //TODO pass conventions? - base(sourceQuoteConvention.Normalize(), targetQuoteConvention, settings); - } + : base( + sourceQuoteConvention.Normalize(), + targetQuoteConvention, + settings == null ? new QuotationMarkUpdateSettings() : null //TODO pass conventions? + ) { } } } From 11dbf251243ccb4fbc8324682ad1b09650d66e62 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 21 Jul 2025 19:26:06 -0400 Subject: [PATCH 06/28] Port some reviewer changes; port some tests --- .../Corpora/FallbackQuotationMarkResolver.cs | 52 +- .../Chapter.cs | 2 +- .../DepthBasedQuotationMarkResolver.cs | 364 +- .../IQuotationMarkResolutionSettings.cs | 2 +- .../IQuotationMarkResolver.cs | 4 +- .../PreliminaryQuotationMarkAnalyzer.cs} | 95 +- .../QuotationMarkDirection.cs | 2 +- .../QuotationMarkFinder.cs | 17 +- .../QuotationMarkMetadata.cs | 2 +- .../QuotationMarkResolutionIssue.cs | 2 +- .../QuotationMarkStringMatch.cs | 50 +- .../QuotationMarkTabulator.cs | 45 +- .../QuoteConvention.cs | 84 +- ...teConventionDetectionResolutionSettings.cs | 55 + .../QuoteConventionDetector.cs | 65 + .../QuoteConventionSet.cs | 104 +- .../StandardQuoteConventions.cs | 4 +- .../TextSegment.cs | 9 +- .../UsfmMarkerType.cs | 2 +- .../UsfmStructureExtractor.cs | 4 +- .../Verse.cs | 2 +- ...onDenormalizationUsfmUpdateBlockHandler.cs | 5 +- .../QuotationMarkDenormalizationFirstPass.cs | 14 + .../Corpora/QuotationMarkUpdateFirstPass.cs | 10 +- .../QuotationMarkUpdateResolutionSettings.cs | 6 +- ...onventionChangingUsfmUpdateBlockHandler.cs | 12 +- src/SIL.Machine/Corpora/UsfmToken.cs | 28 + src/SIL.Machine/SIL.Machine.csproj | 1 + .../DepthBasedQuotationMarkResolverTests.cs | 3464 +++++++++++++++++ 
.../Corpora/PunctuationAnalysis/temp.cs | 6 + 30 files changed, 4056 insertions(+), 456 deletions(-) rename src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/Chapter.cs (81%) rename src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/DepthBasedQuotationMarkResolver.cs (51%) rename src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/IQuotationMarkResolutionSettings.cs (93%) rename src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/IQuotationMarkResolver.cs (69%) rename src/SIL.Machine/Corpora/{Analysis/PreliminaryQuotationAnalyzer.cs => PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs} (83%) rename src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/QuotationMarkDirection.cs (63%) rename src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/QuotationMarkFinder.cs (74%) rename src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/QuotationMarkMetadata.cs (97%) rename src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/QuotationMarkResolutionIssue.cs (81%) rename src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/QuotationMarkStringMatch.cs (72%) rename src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/QuotationMarkTabulator.cs (65%) rename src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/QuoteConvention.cs (59%) create mode 100644 src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetectionResolutionSettings.cs create mode 100644 src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetector.cs rename src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/QuoteConventionSet.cs (71%) rename src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/StandardQuoteConventions.cs (98%) rename src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/TextSegment.cs (93%) rename src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/UsfmMarkerType.cs (76%) rename src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/UsfmStructureExtractor.cs (97%) rename 
src/SIL.Machine/Corpora/{Analysis => PunctuationAnalysis}/Verse.cs (92%) create mode 100644 src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/temp.cs diff --git a/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs b/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs index ce8afba4..7b9b4b4e 100644 --- a/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs +++ b/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs @@ -1,6 +1,6 @@ using System.Collections.Generic; using System.Linq; -using SIL.Machine.Corpora.Analysis; +using SIL.Machine.Corpora.PunctuationAnalysis; namespace SIL.Machine.Corpora { @@ -23,9 +23,11 @@ public void Reset() _issues.Clear(); } - public IEnumerable ResolveQuotationMarks(List quoteMatches) + public IEnumerable ResolveQuotationMarks( + List quotationMarkMatches + ) { - foreach (QuotationMarkStringMatch quoteMatch in quoteMatches) + foreach (QuotationMarkStringMatch quoteMatch in quotationMarkMatches) { foreach (QuotationMarkMetadata quotationMarkMetadata in ResolveQuotationMark(quoteMatch)) { @@ -34,26 +36,26 @@ public IEnumerable ResolveQuotationMarks(List ResolveQuotationMark(QuotationMarkStringMatch quoteMatch) + public IEnumerable ResolveQuotationMark(QuotationMarkStringMatch quotationMarkMatch) { - if (IsOpeningQuote(quoteMatch)) + if (IsOpeningQuotationMark(quotationMarkMatch)) { - QuotationMarkMetadata quote = ResolveOpeningMark(quoteMatch); - if (quote != null) + QuotationMarkMetadata quotationMark = ResolveOpeningMark(quotationMarkMatch); + if (quotationMark != null) { - yield return quote; + yield return quotationMark; } else { _issues.Add(QuotationMarkResolutionIssue.UnexpectedQuotationMark); } } - else if (IsClosingQuote(quoteMatch)) + else if (IsClosingQuotationMark(quotationMarkMatch)) { - QuotationMarkMetadata 
quote = ResolveClosingMark(quoteMatch); - if (quote != null) + QuotationMarkMetadata quotationMark = ResolveClosingMark(quotationMarkMatch); + if (quotationMark != null) { - yield return quote; + yield return quotationMark; } else { @@ -66,7 +68,7 @@ public IEnumerable ResolveQuotationMark(QuotationMarkStri } } - private bool IsOpeningQuote(QuotationMarkStringMatch match) + private bool IsOpeningQuotationMark(QuotationMarkStringMatch match) { if (_settings.IsValidOpeningQuotationMark(match) && _settings.IsValidClosingQuotationMark(match)) { @@ -95,7 +97,7 @@ private bool DoesMostRecentOpeningMarkImmediatelyPrecede(QuotationMarkStringMatc && _lastQuotationMark.EndIndex == match.StartIndex; } - private bool IsClosingQuote(QuotationMarkStringMatch match) + private bool IsClosingQuotationMark(QuotationMarkStringMatch match) { if (_settings.IsValidClosingQuotationMark(match) && _settings.IsValidClosingQuotationMark(match)) { @@ -110,30 +112,36 @@ private bool IsClosingQuote(QuotationMarkStringMatch match) return false; } - private QuotationMarkMetadata ResolveOpeningMark(QuotationMarkStringMatch quoteMatch) + private QuotationMarkMetadata ResolveOpeningMark(QuotationMarkStringMatch quotationMarkMatch) { HashSet possibleDepths = _settings.GetPossibleDepths( - quoteMatch.QuotationMark, + quotationMarkMatch.QuotationMark, QuotationMarkDirection.Opening ); if (possibleDepths.Count == 0) return null; - QuotationMarkMetadata quote = quoteMatch.Resolve(possibleDepths.Min(), QuotationMarkDirection.Opening); - _lastQuotationMark = quote; - return quote; + QuotationMarkMetadata quotationMark = quotationMarkMatch.Resolve( + possibleDepths.Min(), + QuotationMarkDirection.Opening + ); + _lastQuotationMark = quotationMark; + return quotationMark; } - private QuotationMarkMetadata ResolveClosingMark(QuotationMarkStringMatch quoteMatch) + private QuotationMarkMetadata ResolveClosingMark(QuotationMarkStringMatch quotationMarkMatch) { HashSet possibleDepths = _settings.GetPossibleDepths( 
- quoteMatch.QuotationMark, + quotationMarkMatch.QuotationMark, QuotationMarkDirection.Closing ); if (possibleDepths.Count == 0) return null; - QuotationMarkMetadata quote = quoteMatch.Resolve(possibleDepths.Min(), QuotationMarkDirection.Closing); + QuotationMarkMetadata quote = quotationMarkMatch.Resolve( + possibleDepths.Min(), + QuotationMarkDirection.Closing + ); _lastQuotationMark = quote; return quote; } diff --git a/src/SIL.Machine/Corpora/Analysis/Chapter.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/Chapter.cs similarity index 81% rename from src/SIL.Machine/Corpora/Analysis/Chapter.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/Chapter.cs index 4056d348..8e299d69 100644 --- a/src/SIL.Machine/Corpora/Analysis/Chapter.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/Chapter.cs @@ -1,6 +1,6 @@ using System.Collections.Generic; -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public class Chapter { diff --git a/src/SIL.Machine/Corpora/Analysis/DepthBasedQuotationMarkResolver.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs similarity index 51% rename from src/SIL.Machine/Corpora/Analysis/DepthBasedQuotationMarkResolver.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs index 87e0e26e..0ea0e807 100644 --- a/src/SIL.Machine/Corpora/Analysis/DepthBasedQuotationMarkResolver.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs @@ -3,12 +3,11 @@ using System.Linq; using System.Text.RegularExpressions; -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public class QuotationMarkResolverState { public Stack Quotations { get; private set; } - private int _currentDepth; public QuotationMarkResolverState() { @@ -18,40 +17,44 @@ public QuotationMarkResolverState() public void Reset() { Quotations = new Stack(); - _currentDepth = 0; } - public int CurrentDepth 
=> _currentDepth + 1; + public int CurrentDepth => Quotations.Count; - public bool HasOpenQuotationMark => _currentDepth > 0; + public bool HasOpenQuotationMark => CurrentDepth > 0; - public bool AreMoreThanNQuotesOpen(int n) => _currentDepth > n; + public bool AreMoreThanNQuotesOpen(int n) => CurrentDepth > n; - public QuotationMarkMetadata AddOpeningQuotationMark(QuotationMarkStringMatch quoteMatch) + public QuotationMarkMetadata AddOpeningQuotationMark(QuotationMarkStringMatch quotationMarkMatch) { - QuotationMarkMetadata quote = quoteMatch.Resolve(_currentDepth + 1, QuotationMarkDirection.Opening); - Quotations.Push(quote); - _currentDepth++; - return quote; + QuotationMarkMetadata quotationMark = quotationMarkMatch.Resolve( + CurrentDepth + 1, + QuotationMarkDirection.Opening + ); + Quotations.Push(quotationMark); + return quotationMark; } - public QuotationMarkMetadata AddClosingQuotationMark(QuotationMarkStringMatch quoteMatch) + public QuotationMarkMetadata AddClosingQuotationMark(QuotationMarkStringMatch quotationMarkMatch) { - QuotationMarkMetadata quote = quoteMatch.Resolve(_currentDepth, QuotationMarkDirection.Closing); + QuotationMarkMetadata quotationMark = quotationMarkMatch.Resolve( + CurrentDepth, + QuotationMarkDirection.Closing + ); Quotations.Pop(); - _currentDepth++; - return quote; + return quotationMark; } public string GetOpeningQuotationMarkAtDepth(int depth) { - if (depth > Quotations.Count) + if (depth > CurrentDepth) { throw new InvalidOperationException( - "GetOpeningQuotationMarkAtDepth() was called with a depth greater than the quotation stack size." + $"Opening quotation mark at depth ${depth} was requested from a quotation stack with depth ${CurrentDepth}." 
); } - return Quotations.ToArray()[depth - 1].QuotationMark; + // Stack is stored in reverse order + return Quotations.ToArray()[CurrentDepth - depth].QuotationMark; } public string GetDeepestOpeningQuotationMark() @@ -59,60 +62,57 @@ public string GetDeepestOpeningQuotationMark() if (!HasOpenQuotationMark) { throw new InvalidOperationException( - "GetDeepestOpeningQuotationMark() was called with a depth greater than the quotation stack size." + "The deepest opening quotation mark was requested from an empty quotation stack." ); } return Quotations.Peek().QuotationMark; } } - public enum QuotationContinuerStyle + public enum QuoteContinuerStyle { Undetermined, English, Spanish } - public class QuotationContinuerState + public class QuoteContinuerState { - private Stack _quotationContinuers; - public QuotationContinuerStyle ContinuerStyle { get; private set; } - public int CurrentDepth { get; private set; } + private Stack _quoteContinuerMarks; + public QuoteContinuerStyle ContinuerStyle { get; protected set; } + public int CurrentDepth => _quoteContinuerMarks.Count; - public QuotationContinuerState() + public QuoteContinuerState() { Reset(); } public void Reset() { - _quotationContinuers = new Stack(); - CurrentDepth = 0; - ContinuerStyle = QuotationContinuerStyle.Undetermined; + _quoteContinuerMarks = new Stack(); + ContinuerStyle = QuoteContinuerStyle.Undetermined; } public bool ContinuerHasBeenObserved() { - return _quotationContinuers.Count > 0; + return _quoteContinuerMarks.Count > 0; } - public QuotationMarkMetadata AddQuotationContinuer( - QuotationMarkStringMatch quoteMatch, + public QuotationMarkMetadata AddQuoteContinuer( + QuotationMarkStringMatch quotationMarkMatch, QuotationMarkResolverState quotationMarkResolverState, - QuotationContinuerStyle quotationContinuerStyle + QuoteContinuerStyle quoteContinuerStyle ) { - QuotationMarkMetadata quote = quoteMatch.Resolve( - _quotationContinuers.Count + 1, + QuotationMarkMetadata quote = 
quotationMarkMatch.Resolve( + _quoteContinuerMarks.Count + 1, QuotationMarkDirection.Opening ); - _quotationContinuers.Push(quote); - CurrentDepth++; - ContinuerStyle = quotationContinuerStyle; - if (_quotationContinuers.Count == quotationMarkResolverState.Quotations.Count) + _quoteContinuerMarks.Push(quote); + ContinuerStyle = quoteContinuerStyle; + if (_quoteContinuerMarks.Count == quotationMarkResolverState.Quotations.Count) { - _quotationContinuers.Clear(); - CurrentDepth = 0; + _quoteContinuerMarks.Clear(); } return quote; } @@ -123,123 +123,96 @@ public class QuotationMarkCategorizer private static readonly Regex ApostrophePattern = new Regex(@"[\'\u2019\u2018]", RegexOptions.Compiled); private readonly IQuotationMarkResolutionSettings _settings; private readonly QuotationMarkResolverState _quotationMarkResolverState; - private readonly QuotationContinuerState _quotationContinuerState; + private readonly QuoteContinuerState _quoteContinuerState; public QuotationMarkCategorizer( IQuotationMarkResolutionSettings quotationMarkResolutionSettings, QuotationMarkResolverState quotationMarkResolverState, - QuotationContinuerState quotationContinuerState + QuoteContinuerState quotationContinuerState ) { _settings = quotationMarkResolutionSettings; _quotationMarkResolverState = quotationMarkResolverState; - _quotationContinuerState = quotationContinuerState; + _quoteContinuerState = quotationContinuerState; } - public bool IsEnglishQuotationContinuer( - QuotationMarkStringMatch quoteMatch, + public bool IsEnglishQuoteContinuer( + QuotationMarkStringMatch quotationMarkMatch, QuotationMarkStringMatch previousMatch, QuotationMarkStringMatch nextMatch ) { - if (_quotationContinuerState.ContinuerStyle == QuotationContinuerStyle.Spanish) + if (_quoteContinuerState.ContinuerStyle == QuoteContinuerStyle.Spanish) return false; - if (!MeetsQuoteContinuerPrerequisites(quoteMatch)) + if (!MeetsQuoteContinuerPrerequisites(quotationMarkMatch)) return false; - - if 
(!_quotationContinuerState.ContinuerHasBeenObserved()) + if ( + quotationMarkMatch.QuotationMark + != _quotationMarkResolverState.GetOpeningQuotationMarkAtDepth(_quoteContinuerState.CurrentDepth + 1) + ) { - if (quoteMatch.StartIndex > 0) - return false; - if ( - quoteMatch.QuotationMark - != _quotationMarkResolverState.GetOpeningQuotationMarkAtDepth( - _quotationContinuerState.CurrentDepth + 1 - ) - ) - { + return false; + } + if (!_quoteContinuerState.ContinuerHasBeenObserved()) + { + if (quotationMarkMatch.StartIndex > 0) return false; - } + + // check the next quotation mark match, since quote continuers must appear consecutively if (_quotationMarkResolverState.AreMoreThanNQuotesOpen(1)) { - if (nextMatch == null || nextMatch.StartIndex != quoteMatch.EndIndex) + if (nextMatch == null || nextMatch.StartIndex != quotationMarkMatch.EndIndex) return false; } } - else - { - if ( - quoteMatch.QuotationMark - != _quotationMarkResolverState.GetOpeningQuotationMarkAtDepth( - _quotationContinuerState.CurrentDepth + 1 - ) - ) - { - return false; - } - } return true; } - public bool IsSpanishQuotationContinuer( - QuotationMarkStringMatch quoteMatch, - QuotationMarkStringMatch nextMatch, - QuotationMarkStringMatch previousMatch + public bool IsSpanishQuoteContinuer( + QuotationMarkStringMatch quotationMarkMatch, + QuotationMarkStringMatch previousMatch, + QuotationMarkStringMatch nextMatch ) { - if (_quotationContinuerState.ContinuerStyle == QuotationContinuerStyle.English) + if (_quoteContinuerState.ContinuerStyle == QuoteContinuerStyle.English) return false; - if (!MeetsQuoteContinuerPrerequisites(quoteMatch)) + if (!MeetsQuoteContinuerPrerequisites(quotationMarkMatch)) return false; - if (!_quotationContinuerState.ContinuerHasBeenObserved()) + if ( + !_settings.AreMarksAValidPair( + _quotationMarkResolverState.GetOpeningQuotationMarkAtDepth(_quoteContinuerState.CurrentDepth + 1), + quotationMarkMatch.QuotationMark + ) + ) + { + return false; + } + + if 
(!_quoteContinuerState.ContinuerHasBeenObserved()) { - if (quoteMatch.StartIndex > 0) + if (quotationMarkMatch.StartIndex > 0) return false; // this has only been observed with guillemets so far - if (quoteMatch.QuotationMark != "»") + if (quotationMarkMatch.QuotationMark != "»") return false; - if ( - !_settings.AreMarksAValidPair( - _quotationMarkResolverState.GetOpeningQuotationMarkAtDepth( - _quotationContinuerState.CurrentDepth + 1 - ), - quoteMatch.QuotationMark - ) - ) - { - return false; - } + // check the next quotation mark match, since quote continuers must appear consecutively if (_quotationMarkResolverState.AreMoreThanNQuotesOpen(1)) { - if (nextMatch == null || nextMatch.StartIndex != quoteMatch.EndIndex) + if (nextMatch == null || nextMatch.StartIndex != quotationMarkMatch.EndIndex) return false; } } - else - { - if ( - !_settings.AreMarksAValidPair( - _quotationMarkResolverState.GetOpeningQuotationMarkAtDepth( - _quotationContinuerState.CurrentDepth + 1 - ), - quoteMatch.QuotationMark - ) - ) - { - return false; - } - } return true; } - private bool MeetsQuoteContinuerPrerequisites(QuotationMarkStringMatch quoteMatch) + private bool MeetsQuoteContinuerPrerequisites(QuotationMarkStringMatch quotationMarkMatch) { if ( _settings.ShouldRelyOnParagraphMarkers() - && !quoteMatch.TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph) + && !quotationMarkMatch.TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph) ) { return false; @@ -249,52 +222,52 @@ private bool MeetsQuoteContinuerPrerequisites(QuotationMarkStringMatch quoteMatc return true; } - public bool IsOpeningQuote(QuotationMarkStringMatch match) + public bool IsOpeningQuotationMark(QuotationMarkStringMatch quotationMarkMatch) { - if (!_settings.IsValidOpeningQuotationMark(match)) + if (!_settings.IsValidOpeningQuotationMark(quotationMarkMatch)) return false; // if the quote is ambiguous, use whitespace as clue - if (_settings.IsValidClosingQuotationMark(match)) + if 
(_settings.IsValidClosingQuotationMark(quotationMarkMatch)) { return ( - match.HasLeadingWhitespace() - || MostRecentOpeningMarkImmediatelyPrecedes(match) - || match.HasQuoteIntroducerInLeadingSubstring() - ) && !(match.HasTrailingWhitespace() || match.HasTrailingPunctuation()); + quotationMarkMatch.HasLeadingWhitespace() + || MostRecentOpeningMarkImmediatelyPrecedes(quotationMarkMatch) + || quotationMarkMatch.HasQuoteIntroducerInLeadingSubstring() + ) && !(quotationMarkMatch.HasTrailingWhitespace() || quotationMarkMatch.HasTrailingPunctuation()); } return true; } - public bool IsClosingQuote(QuotationMarkStringMatch match) + public bool IsClosingQuotationMark(QuotationMarkStringMatch quotationMarkMatch) { - if (!_settings.IsValidClosingQuotationMark(match)) + if (!_settings.IsValidClosingQuotationMark(quotationMarkMatch)) return false; // if the quote is ambiguous, use whitespace as clue - if (_settings.IsValidOpeningQuotationMark(match)) + if (_settings.IsValidOpeningQuotationMark(quotationMarkMatch)) { return ( - match.HasTrailingWhitespace() - || match.HasTrailingPunctuation() - || match.IsAtEndOfSegment - || match.NextCharacterMatches(_settings.GetClosingQuotationMarkRegex()) - ) && !match.HasLeadingWhitespace(); + quotationMarkMatch.HasTrailingWhitespace() + || quotationMarkMatch.HasTrailingPunctuation() + || quotationMarkMatch.IsAtEndOfSegment + || quotationMarkMatch.NextCharacterMatches(_settings.GetClosingQuotationMarkRegex()) + ) && !quotationMarkMatch.HasLeadingWhitespace(); } return true; } - public bool IsMalformedOpeningQuote(QuotationMarkStringMatch match) + public bool IsMalformedOpeningQuotationMark(QuotationMarkStringMatch quotationMarkMatch) { - if (!_settings.IsValidOpeningQuotationMark(match)) + if (!_settings.IsValidOpeningQuotationMark(quotationMarkMatch)) return false; - if (match.HasQuoteIntroducerInLeadingSubstring()) + if (quotationMarkMatch.HasQuoteIntroducerInLeadingSubstring()) return true; if ( - match.HasLeadingWhitespace() - && 
match.HasTrailingWhitespace() + quotationMarkMatch.HasLeadingWhitespace() + && quotationMarkMatch.HasTrailingWhitespace() && !_quotationMarkResolverState.HasOpenQuotationMark ) { @@ -304,55 +277,56 @@ public bool IsMalformedOpeningQuote(QuotationMarkStringMatch match) return false; } - public bool IsMalformedClosingQuote(QuotationMarkStringMatch match) + public bool IsMalformedClosingQuotationMark(QuotationMarkStringMatch quotationMarkMatch) { - if (!_settings.IsValidClosingQuotationMark(match)) + if (!_settings.IsValidClosingQuotationMark(quotationMarkMatch)) return false; return ( ( - match.IsAtEndOfSegment - || !match.HasTrailingWhitespace() - || (match.HasLeadingWhitespace() && match.HasTrailingWhitespace()) + quotationMarkMatch.IsAtEndOfSegment + || !quotationMarkMatch.HasTrailingWhitespace() + || (quotationMarkMatch.HasLeadingWhitespace() && quotationMarkMatch.HasTrailingWhitespace()) ) && _quotationMarkResolverState.HasOpenQuotationMark && _settings.AreMarksAValidPair( _quotationMarkResolverState.GetDeepestOpeningQuotationMark(), - match.QuotationMark + quotationMarkMatch.QuotationMark ) ); } - public bool IsUnpairedClosingQuote(QuotationMarkStringMatch match) + public bool IsUnpairedClosingQuotationMark(QuotationMarkStringMatch quotationMarkMatch) { - if (!_settings.IsValidClosingQuotationMark(match)) + if (!_settings.IsValidClosingQuotationMark(quotationMarkMatch)) return false; if (_quotationMarkResolverState.HasOpenQuotationMark) return false; - return !match.HasLeadingWhitespace() && (match.IsAtEndOfSegment || match.HasTrailingWhitespace()); + return !quotationMarkMatch.HasLeadingWhitespace() + && (quotationMarkMatch.IsAtEndOfSegment || quotationMarkMatch.HasTrailingWhitespace()); } - private bool MostRecentOpeningMarkImmediatelyPrecedes(QuotationMarkStringMatch match) + private bool MostRecentOpeningMarkImmediatelyPrecedes(QuotationMarkStringMatch quotationMarkMatch) { if (!_quotationMarkResolverState.HasOpenQuotationMark) return false; - return 
_quotationMarkResolverState.GetDeepestOpeningQuotationMark() == match.PreviousCharacter; + return _quotationMarkResolverState.GetDeepestOpeningQuotationMark() == quotationMarkMatch.PreviousCharacter; } - public bool IsApostrophe(QuotationMarkStringMatch match, QuotationMarkStringMatch nextMatch) + public bool IsApostrophe(QuotationMarkStringMatch quotationMarkMatch, QuotationMarkStringMatch nextMatch) { - if (!match.QuotationMarkMatches(ApostrophePattern)) + if (!quotationMarkMatch.QuotationMarkMatches(ApostrophePattern)) return false; // Latin letters on both sides of punctuation mark if ( - match.PreviousCharacter != null - && match.HasLeadingLatinLetter() - && match.NextCharacter != null - && match.HasTrailingLatinLetter() + quotationMarkMatch.PreviousCharacter != null + && quotationMarkMatch.HasLeadingLatinLetter() + && quotationMarkMatch.NextCharacter != null + && quotationMarkMatch.HasTrailingLatinLetter() ) { return true; @@ -360,17 +334,17 @@ public bool IsApostrophe(QuotationMarkStringMatch match, QuotationMarkStringMatc // potential final s possessive (e.g. 
Moses') if ( - match.PreviousCharacterMatches(new Regex(@"s", RegexOptions.Compiled)) - && (match.HasTrailingWhitespace() || match.HasTrailingPunctuation()) + quotationMarkMatch.PreviousCharacterMatches(new Regex(@"s", RegexOptions.Compiled)) + && (quotationMarkMatch.HasTrailingWhitespace() || quotationMarkMatch.HasTrailingPunctuation()) ) { - // check whether it could be a closing quote + // check whether it could be a closing quotation mark if (!_quotationMarkResolverState.HasOpenQuotationMark) return true; if ( !_settings.AreMarksAValidPair( _quotationMarkResolverState.GetDeepestOpeningQuotationMark(), - match.QuotationMark + quotationMarkMatch.QuotationMark ) ) { @@ -388,13 +362,13 @@ public bool IsApostrophe(QuotationMarkStringMatch match, QuotationMarkStringMatc } } - // for languages that use apostrophes at teh start and end of words + // for languages that use apostrophes at teh start and end of words //TODO misspelled comment if ( - !_quotationMarkResolverState.HasOpenQuotationMark && match.QuotationMark == "'" + !_quotationMarkResolverState.HasOpenQuotationMark && quotationMarkMatch.QuotationMark == "'" || _quotationMarkResolverState.HasOpenQuotationMark && !_settings.AreMarksAValidPair( _quotationMarkResolverState.GetDeepestOpeningQuotationMark(), - match.QuotationMark + quotationMarkMatch.QuotationMark ) ) { @@ -409,7 +383,7 @@ public class DepthBasedQuotationMarkResolver : IQuotationMarkResolver { private readonly IQuotationMarkResolutionSettings _settings; private readonly QuotationMarkResolverState _quotationMarkResolverState; - private readonly QuotationContinuerState _quotationContinuerState; + private readonly QuoteContinuerState _quoteContinuerState; private readonly QuotationMarkCategorizer _quotationMarkCategorizer; private readonly HashSet _issues; @@ -417,11 +391,11 @@ public DepthBasedQuotationMarkResolver(IQuotationMarkResolutionSettings settings { _settings = settings; _quotationMarkResolverState = new QuotationMarkResolverState(); - 
_quotationContinuerState = new QuotationContinuerState(); + _quoteContinuerState = new QuoteContinuerState(); _quotationMarkCategorizer = new QuotationMarkCategorizer( _settings, _quotationMarkResolverState, - _quotationContinuerState + _quoteContinuerState ); _issues = new HashSet(); } @@ -429,35 +403,41 @@ public DepthBasedQuotationMarkResolver(IQuotationMarkResolutionSettings settings public void Reset() { _quotationMarkResolverState.Reset(); - _quotationContinuerState.Reset(); + _quoteContinuerState.Reset(); _issues.Clear(); } - public IEnumerable ResolveQuotationMarks(List quoteMatches) + public IEnumerable ResolveQuotationMarks( + List quotationMarkMatches + ) { - foreach ((int quoteIndex, QuotationMarkStringMatch quoteMatch) in quoteMatches.Select((q, i) => (i, q))) + foreach ( + (int index, QuotationMarkStringMatch quotationMarkMatch) in quotationMarkMatches.Select( + (q, i) => (i, q) + ) + ) { - QuotationMarkStringMatch previousMark = quoteIndex == 0 ? null : quoteMatches[quoteIndex - 1]; + QuotationMarkStringMatch previousMark = index == 0 ? null : quotationMarkMatches[index - 1]; QuotationMarkStringMatch nextMark = - quoteIndex == quoteMatches.Count - 1 ? null : quoteMatches[quoteIndex + 1]; - foreach (QuotationMarkMetadata q in ResolveQuotationMark(quoteMatch, previousMark, nextMark)) + index == quotationMarkMatches.Count - 1 ? 
null : quotationMarkMatches[index + 1]; + foreach (QuotationMarkMetadata q in ResolveQuotationMark(quotationMarkMatch, previousMark, nextMark)) yield return q; - if (_quotationMarkResolverState.HasOpenQuotationMark) - _issues.Add(QuotationMarkResolutionIssue.UnpairedQuotationMark); } + if (_quotationMarkResolverState.HasOpenQuotationMark) + _issues.Add(QuotationMarkResolutionIssue.UnpairedQuotationMark); } public IEnumerable ResolveQuotationMark( - QuotationMarkStringMatch quoteMatch, + QuotationMarkStringMatch quotationMarkMatch, QuotationMarkStringMatch previousMatch, QuotationMarkStringMatch nextMatch ) { - if (_quotationMarkCategorizer.IsOpeningQuote(quoteMatch)) + if (_quotationMarkCategorizer.IsOpeningQuotationMark(quotationMarkMatch)) { - if (_quotationMarkCategorizer.IsEnglishQuotationContinuer(quoteMatch, previousMatch, nextMatch)) + if (_quotationMarkCategorizer.IsEnglishQuoteContinuer(quotationMarkMatch, previousMatch, nextMatch)) { - yield return ProcessQuotationContinuer(quoteMatch, QuotationContinuerStyle.English); + yield return ProcessQuoteContinuer(quotationMarkMatch, QuoteContinuerStyle.English); } else { @@ -467,15 +447,15 @@ QuotationMarkStringMatch nextMatch yield break; } - yield return ProcessOpeningMark(quoteMatch); + yield return ProcessOpeningMark(quotationMarkMatch); } } - else if (_quotationMarkCategorizer.IsApostrophe(quoteMatch, nextMatch)) { } - else if (_quotationMarkCategorizer.IsClosingQuote(quoteMatch)) + else if (_quotationMarkCategorizer.IsApostrophe(quotationMarkMatch, nextMatch)) { } + else if (_quotationMarkCategorizer.IsClosingQuotationMark(quotationMarkMatch)) { - if (_quotationMarkCategorizer.IsSpanishQuotationContinuer(quoteMatch, previousMatch, nextMatch)) + if (_quotationMarkCategorizer.IsSpanishQuoteContinuer(quotationMarkMatch, previousMatch, nextMatch)) { - yield return ProcessQuotationContinuer(quoteMatch, QuotationContinuerStyle.Spanish); + yield return ProcessQuoteContinuer(quotationMarkMatch, 
QuoteContinuerStyle.Spanish); } else if (!_quotationMarkResolverState.HasOpenQuotationMark) { @@ -484,18 +464,18 @@ QuotationMarkStringMatch nextMatch } else { - yield return ProcessClosingMark(quoteMatch); + yield return ProcessClosingMark(quotationMarkMatch); } } - else if (_quotationMarkCategorizer.IsMalformedClosingQuote(quoteMatch)) + else if (_quotationMarkCategorizer.IsMalformedClosingQuotationMark(quotationMarkMatch)) { - yield return ProcessClosingMark(quoteMatch); + yield return ProcessClosingMark(quotationMarkMatch); } - else if (_quotationMarkCategorizer.IsMalformedOpeningQuote(quoteMatch)) + else if (_quotationMarkCategorizer.IsMalformedOpeningQuotationMark(quotationMarkMatch)) { - yield return ProcessOpeningMark(quoteMatch); + yield return ProcessOpeningMark(quotationMarkMatch); } - else if (_quotationMarkCategorizer.IsUnpairedClosingQuote(quoteMatch)) + else if (_quotationMarkCategorizer.IsUnpairedClosingQuotationMark(quotationMarkMatch)) { _issues.Add(QuotationMarkResolutionIssue.UnpairedQuotationMark); } @@ -505,13 +485,13 @@ QuotationMarkStringMatch nextMatch } } - private QuotationMarkMetadata ProcessQuotationContinuer( - QuotationMarkStringMatch quoteMatch, - QuotationContinuerStyle continuerStyle + private QuotationMarkMetadata ProcessQuoteContinuer( + QuotationMarkStringMatch quotationMarkMatch, + QuoteContinuerStyle continuerStyle ) { - return _quotationContinuerState.AddQuotationContinuer( - quoteMatch, + return _quoteContinuerState.AddQuoteContinuer( + quotationMarkMatch, _quotationMarkResolverState, continuerStyle ); @@ -522,34 +502,34 @@ private bool IsDepthTooGreat() return _quotationMarkResolverState.AreMoreThanNQuotesOpen(3); } - private QuotationMarkMetadata ProcessOpeningMark(QuotationMarkStringMatch quoteMatch) + private QuotationMarkMetadata ProcessOpeningMark(QuotationMarkStringMatch quotationMarkMatch) { if ( !_settings.MetadataMatchesQuotationMark( - quoteMatch.QuotationMark, - _quotationMarkResolverState.CurrentDepth, + 
quotationMarkMatch.QuotationMark, + _quotationMarkResolverState.CurrentDepth + 1, QuotationMarkDirection.Opening ) ) { _issues.Add(QuotationMarkResolutionIssue.IncompatibleQuotationMark); } - return _quotationMarkResolverState.AddOpeningQuotationMark(quoteMatch); + return _quotationMarkResolverState.AddOpeningQuotationMark(quotationMarkMatch); } - private QuotationMarkMetadata ProcessClosingMark(QuotationMarkStringMatch quoteMatch) + private QuotationMarkMetadata ProcessClosingMark(QuotationMarkStringMatch quotationMarkMatch) { if ( !_settings.MetadataMatchesQuotationMark( - quoteMatch.QuotationMark, - _quotationMarkResolverState.CurrentDepth - 1, + quotationMarkMatch.QuotationMark, + _quotationMarkResolverState.CurrentDepth, QuotationMarkDirection.Closing ) ) { _issues.Add(QuotationMarkResolutionIssue.IncompatibleQuotationMark); } - return _quotationMarkResolverState.AddClosingQuotationMark(quoteMatch); + return _quotationMarkResolverState.AddClosingQuotationMark(quotationMarkMatch); } public HashSet GetIssues() diff --git a/src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolutionSettings.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/IQuotationMarkResolutionSettings.cs similarity index 93% rename from src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolutionSettings.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/IQuotationMarkResolutionSettings.cs index c7135aa8..19064149 100644 --- a/src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolutionSettings.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/IQuotationMarkResolutionSettings.cs @@ -1,7 +1,7 @@ using System.Collections.Generic; using System.Text.RegularExpressions; -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public interface IQuotationMarkResolutionSettings { diff --git a/src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolver.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/IQuotationMarkResolver.cs similarity index 69% rename from 
src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolver.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/IQuotationMarkResolver.cs index 1d6ebc1b..c7112c33 100644 --- a/src/SIL.Machine/Corpora/Analysis/IQuotationMarkResolver.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/IQuotationMarkResolver.cs @@ -1,10 +1,10 @@ using System.Collections.Generic; -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public interface IQuotationMarkResolver { - IEnumerable ResolveQuotationMarks(List quoteMatches); + IEnumerable ResolveQuotationMarks(List quotationMarkMatches); void Reset(); HashSet GetIssues(); } diff --git a/src/SIL.Machine/Corpora/Analysis/PreliminaryQuotationAnalyzer.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs similarity index 83% rename from src/SIL.Machine/Corpora/Analysis/PreliminaryQuotationAnalyzer.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs index 0a7f80eb..17125f1c 100644 --- a/src/SIL.Machine/Corpora/Analysis/PreliminaryQuotationAnalyzer.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs @@ -2,8 +2,9 @@ using System.Collections.Generic; using System.Linq; using System.Text.RegularExpressions; +using SIL.Extensions; -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public class ApostropheProportionStatistics { @@ -41,9 +42,12 @@ public bool IsApostropheProportionGreaterThan(double threshold) public class QuotationMarkWordPositions { + private static readonly double MaximumProportionForRarity = 0.1; + private static readonly double MaximumProportionDifferenceThreshold = 0.3; private Dictionary _wordInitialOccurrences; private Dictionary _midWordOccurrences; private Dictionary _wordFinalOccurrences; + private Dictionary _totalOccurrences; public QuotationMarkWordPositions() { @@ -55,33 +59,25 @@ public void Reset() _wordInitialOccurrences 
= new Dictionary(); _midWordOccurrences = new Dictionary(); _wordFinalOccurrences = new Dictionary(); + _totalOccurrences = new Dictionary(); } public void CountWordInitialApostrophe(string quotationMark) { - if (!_wordInitialOccurrences.ContainsKey(quotationMark)) - { - _wordInitialOccurrences[quotationMark] = 0; - } - _wordInitialOccurrences[quotationMark]++; + _wordInitialOccurrences.UpdateValue(quotationMark, () => 0, i => i + 1); + _totalOccurrences.UpdateValue(quotationMark, () => 0, i => i + 1); } public void CountMidWordApostrophe(string quotationMark) { - if (!_midWordOccurrences.ContainsKey(quotationMark)) - { - _midWordOccurrences[quotationMark] = 0; - } - _midWordOccurrences[quotationMark]++; + _midWordOccurrences.UpdateValue(quotationMark, () => 0, i => i + 1); + _totalOccurrences.UpdateValue(quotationMark, () => 0, i => i + 1); } public void CountWordFinalApostrophe(string quotationMark) { - if (!_wordFinalOccurrences.ContainsKey(quotationMark)) - { - _wordFinalOccurrences[quotationMark] = 0; - } - _wordFinalOccurrences[quotationMark]++; + _wordFinalOccurrences.UpdateValue(quotationMark, () => 0, i => i + 1); + _totalOccurrences.UpdateValue(quotationMark, () => 0, i => i + 1); } private int GetWordInitialOccurrences(string quotationMark) @@ -110,14 +106,14 @@ public bool IsMarkRarelyInitial(string quotationMark) { int numInitialMarks = GetWordInitialOccurrences(quotationMark); int numTotalMarks = GetTotalOccurrences(quotationMark); - return numTotalMarks > 0 && (numInitialMarks / numTotalMarks) < 0.1; + return numTotalMarks > 0 && (numInitialMarks / numTotalMarks) < MaximumProportionForRarity; } public bool IsMarkRarelyFinal(string quotationMark) { int numFinalMarks = GetWordFinalOccurrences(quotationMark); int numTotalMarks = GetTotalOccurrences(quotationMark); - return numTotalMarks > 0 && (numFinalMarks / numTotalMarks) < 0.1; + return numTotalMarks > 0 && (numFinalMarks / numTotalMarks) < MaximumProportionForRarity; } public bool 
AreInitialAndFinalRatesSimilar(string quotationMark) @@ -125,19 +121,24 @@ public bool AreInitialAndFinalRatesSimilar(string quotationMark) int numInitialMarks = GetWordInitialOccurrences(quotationMark); int numFinalMarks = GetWordFinalOccurrences(quotationMark); int numTotalMarks = GetTotalOccurrences(quotationMark); - return numTotalMarks > 0 && (Math.Abs(numInitialMarks - numFinalMarks) / numTotalMarks) < 0.3; + return numTotalMarks > 0 + && (Math.Abs(numInitialMarks - numFinalMarks) / numTotalMarks) < MaximumProportionDifferenceThreshold; } public bool IsMarkCommonlyMidWord(string quotationMark) { int numMidWordMarks = GetMidWordOccurrences(quotationMark); int numTotalMarks = GetTotalOccurrences(quotationMark); - return numTotalMarks > 0 && (numMidWordMarks / numTotalMarks) > 0.3; + return numTotalMarks > 0 && (numMidWordMarks / numTotalMarks) > MaximumProportionDifferenceThreshold; } } public class QuotationMarkSequences { + private static readonly int SoleOccurrenceMinimumCount = 5; + private static readonly int MuchMoreCommonMinimumRatio = 10; + private static readonly double MaximumProportionDifferenceThreshold = 0.2; + private Dictionary _earlierQuotationMarkCounts; private Dictionary _laterQuotationMarkCounts; @@ -152,22 +153,14 @@ public void Reset() _laterQuotationMarkCounts = new Dictionary(); } - public void RecordEarlierQuotationMark(string quotationMark) + public void CountEarlierQuotationMark(string quotationMark) { - if (!_earlierQuotationMarkCounts.ContainsKey(quotationMark)) - { - _earlierQuotationMarkCounts[quotationMark] = 0; - } - _earlierQuotationMarkCounts[quotationMark] += 1; + _earlierQuotationMarkCounts.UpdateValue(quotationMark, () => 0, i => i + 1); } - public void RecordLaterQuotationMark(string quotationMark) + public void CountLaterQuotationMark(string quotationMark) { - if (!_laterQuotationMarkCounts.ContainsKey(quotationMark)) - { - _laterQuotationMarkCounts[quotationMark] = 0; - } - _laterQuotationMarkCounts[quotationMark] += 1; 
+ _laterQuotationMarkCounts.UpdateValue(quotationMark, () => 0, i => i + 1); } private int GetEarlierOccurrences(string quotationMark) @@ -184,30 +177,31 @@ public bool IsMarkMuchMoreCommonEarlier(string quotationMark) { int numEarlyOccurrences = GetEarlierOccurrences(quotationMark); int numLateOccurrences = GetLaterOccurrences(quotationMark); - return (numLateOccurrences == 0 && numEarlyOccurrences > 5) - || numEarlyOccurrences > numLateOccurrences * 10; + return (numLateOccurrences == 0 && numEarlyOccurrences > SoleOccurrenceMinimumCount) + || numEarlyOccurrences > numLateOccurrences * MuchMoreCommonMinimumRatio; } public bool IsMarkMuchMoreCommonLater(string quotationMark) { int numEarlyOccurrences = GetEarlierOccurrences(quotationMark); int numLateOccurrences = GetLaterOccurrences(quotationMark); - return (numEarlyOccurrences == 0 && numLateOccurrences > 5) - || numLateOccurrences > numEarlyOccurrences * 10; + return (numEarlyOccurrences == 0 && numLateOccurrences > SoleOccurrenceMinimumCount) + || numLateOccurrences > numEarlyOccurrences * MuchMoreCommonMinimumRatio; } - public bool IsMarkCommonEarlyAndLate(string quotationMark) + public bool AreEarlyAndLateMarkRatesSimilar(string quotationMark) { int numEarlyOccurrences = GetEarlierOccurrences(quotationMark); int numLateOccurrences = GetLaterOccurrences(quotationMark); return numEarlyOccurrences > 0 - && (Math.Abs(numLateOccurrences - numEarlyOccurrences) / numEarlyOccurrences) < 0.2; + && (Math.Abs(numLateOccurrences - numEarlyOccurrences) / numEarlyOccurrences) + < MaximumProportionDifferenceThreshold; } } public class QuotationMarkGrouper { - private readonly QuoteConventionSet _quoteConventionSet; + private readonly QuoteConventionSet _quoteConventions; private Dictionary> _groupedQuotationMarks; public QuotationMarkGrouper( @@ -215,7 +209,7 @@ public QuotationMarkGrouper( QuoteConventionSet quoteConventionSet ) { - _quoteConventionSet = quoteConventionSet; + _quoteConventions = quoteConventionSet; 
GroupQuotationMarks(quotationMarks); } @@ -237,7 +231,7 @@ private void GroupQuotationMarks(List quotationMarks) // handle cases of identical opening/closing marks if ( matches1.Count == 2 - && _quoteConventionSet.IsQuotationMarkDirectionAmbiguous(mark1) + && _quoteConventions.IsQuotationMarkDirectionAmbiguous(mark1) && !HasDistinctPairedQuotationMark(mark1) ) { @@ -258,7 +252,7 @@ private void GroupQuotationMarks(List quotationMarks) { if ( matches2.Count == 1 - && _quoteConventionSet.MarksAreAValidPair(mark1, mark2) + && _quoteConventions.MarksAreAValidPair(mark1, mark2) && matches1[0].Precedes(matches2[0]) ) { @@ -270,7 +264,7 @@ private void GroupQuotationMarks(List quotationMarks) public bool HasDistinctPairedQuotationMark(string quotationMark) { - return _quoteConventionSet + return _quoteConventions .GetPossiblePairedQuotationMarks(quotationMark) .Any(m => m != quotationMark && _groupedQuotationMarks.ContainsKey(m)); } @@ -278,6 +272,7 @@ public bool HasDistinctPairedQuotationMark(string quotationMark) public class PreliminaryApostropheAnalyzer { + private static readonly double MaximumApostropheProportion = 0.02; private static readonly Regex ApostrophePattern = new Regex(@"[\'\u2019]", RegexOptions.Compiled); private readonly ApostropheProportionStatistics _apostropheProportionStatistics; private readonly QuotationMarkWordPositions _wordPositionStatistics; @@ -370,7 +365,7 @@ public bool IsApostropheOnly(string mark) return true; } - if (_apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.02)) + if (_apostropheProportionStatistics.IsApostropheProportionGreaterThan(MaximumApostropheProportion)) { return true; } @@ -379,13 +374,13 @@ public bool IsApostropheOnly(string mark) } } - public class PreliminaryQuotationAnalyzer + public class PreliminaryQuotationMarkAnalyzer { private readonly QuoteConventionSet _quoteConventions; private readonly PreliminaryApostropheAnalyzer _apostropheAnalyzer; private readonly QuotationMarkSequences 
_quotationMarkSequences; - public PreliminaryQuotationAnalyzer(QuoteConventionSet quoteConventions) + public PreliminaryQuotationMarkAnalyzer(QuoteConventionSet quoteConventions) { _quoteConventions = quoteConventions; _apostropheAnalyzer = new PreliminaryApostropheAnalyzer(); @@ -426,8 +421,8 @@ private void AnalyzeQuotationMarkSequence(List quotati var quotationMarkGrouper = new QuotationMarkGrouper(quotationMarks, _quoteConventions); foreach ((string earlierMark, string laterMark) in quotationMarkGrouper.GetQuotationMarkPairs()) { - _quotationMarkSequences.RecordEarlierQuotationMark(earlierMark); - _quotationMarkSequences.RecordLaterQuotationMark(laterMark); + _quotationMarkSequences.CountEarlierQuotationMark(earlierMark); + _quotationMarkSequences.CountLaterQuotationMark(laterMark); } } @@ -455,7 +450,7 @@ private bool IsOpeningQuotationMark(string quotationMark) if (_quotationMarkSequences.IsMarkMuchMoreCommonEarlier(quotationMark)) return true; if ( - _quotationMarkSequences.IsMarkCommonEarlyAndLate(quotationMark) + _quotationMarkSequences.AreEarlyAndLateMarkRatesSimilar(quotationMark) && _quoteConventions.IsQuotationMarkDirectionAmbiguous(quotationMark) ) { @@ -480,7 +475,7 @@ private bool IsClosingQuotationMark(string quotationMark) if (_quotationMarkSequences.IsMarkMuchMoreCommonLater(quotationMark)) return true; if ( - _quotationMarkSequences.IsMarkCommonEarlyAndLate(quotationMark) + _quotationMarkSequences.AreEarlyAndLateMarkRatesSimilar(quotationMark) && _quoteConventions.IsQuotationMarkDirectionAmbiguous(quotationMark) ) { diff --git a/src/SIL.Machine/Corpora/Analysis/QuotationMarkDirection.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkDirection.cs similarity index 63% rename from src/SIL.Machine/Corpora/Analysis/QuotationMarkDirection.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkDirection.cs index 80dd93ac..974955a7 100644 --- a/src/SIL.Machine/Corpora/Analysis/QuotationMarkDirection.cs +++ 
b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkDirection.cs @@ -1,4 +1,4 @@ -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public enum QuotationMarkDirection { diff --git a/src/SIL.Machine/Corpora/Analysis/QuotationMarkFinder.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkFinder.cs similarity index 74% rename from src/SIL.Machine/Corpora/Analysis/QuotationMarkFinder.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkFinder.cs index 6d62510b..6dc80ef0 100644 --- a/src/SIL.Machine/Corpora/Analysis/QuotationMarkFinder.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkFinder.cs @@ -2,16 +2,19 @@ using System.Linq; using System.Text.RegularExpressions; -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public class QuotationMarkFinder { - private static readonly Regex QuotePattern = new Regex(@"(\p{Pi}|\p{Pf}|<<|>>|<|>)", RegexOptions.Compiled); - private readonly QuoteConventionSet _quoteConventionSet; + private static readonly Regex QuotationMarkPattern = new Regex( + @"(\p{Pi}|\p{Pf}|<<|>>|<|>)", + RegexOptions.Compiled + ); + private readonly QuoteConventionSet _quoteConventions; public QuotationMarkFinder(QuoteConventionSet quoteConventionSet) { - _quoteConventionSet = quoteConventionSet; + _quoteConventions = quoteConventionSet; } public List FindAllPotentialQuotationMarksInChapter(Chapter chapter) @@ -36,12 +39,12 @@ List textSegments public List FindAllPotentialQuotationMarksInTextSegment(TextSegment textSegment) { - return QuotePattern + return QuotationMarkPattern .Matches(textSegment.Text) .Cast() .Where(match => - _quoteConventionSet.IsValidOpeningQuotationMark(match.Groups[0].Value) - || _quoteConventionSet.IsValidClosingQuotationMark(match.Groups[0].Value) + _quoteConventions.IsValidOpeningQuotationMark(match.Groups[0].Value) + || _quoteConventions.IsValidClosingQuotationMark(match.Groups[0].Value) ) 
.Select(m => new QuotationMarkStringMatch( textSegment, diff --git a/src/SIL.Machine/Corpora/Analysis/QuotationMarkMetadata.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkMetadata.cs similarity index 97% rename from src/SIL.Machine/Corpora/Analysis/QuotationMarkMetadata.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkMetadata.cs index 8f62f174..b58ec1e1 100644 --- a/src/SIL.Machine/Corpora/Analysis/QuotationMarkMetadata.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkMetadata.cs @@ -1,4 +1,4 @@ -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public class QuotationMarkMetadata { diff --git a/src/SIL.Machine/Corpora/Analysis/QuotationMarkResolutionIssue.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkResolutionIssue.cs similarity index 81% rename from src/SIL.Machine/Corpora/Analysis/QuotationMarkResolutionIssue.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkResolutionIssue.cs index 4536e2d0..1f06a56d 100644 --- a/src/SIL.Machine/Corpora/Analysis/QuotationMarkResolutionIssue.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkResolutionIssue.cs @@ -1,4 +1,4 @@ -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public enum QuotationMarkResolutionIssue { diff --git a/src/SIL.Machine/Corpora/Analysis/QuotationMarkStringMatch.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkStringMatch.cs similarity index 72% rename from src/SIL.Machine/Corpora/Analysis/QuotationMarkStringMatch.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkStringMatch.cs index eea8376a..90fe296e 100644 --- a/src/SIL.Machine/Corpora/Analysis/QuotationMarkStringMatch.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkStringMatch.cs @@ -1,12 +1,15 @@ using System; +using System.Globalization; using System.Text.RegularExpressions; +using System.Unicode; -namespace 
SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public class QuotationMarkStringMatch { private static readonly Regex LetterPattern = new Regex(@"[\p{L}\uD838[\uDE00-\uDE8F]]", RegexOptions.Compiled); - private static readonly Regex LatinLetterPattern = new Regex(@"^\p{IsBasicLatin}$", RegexOptions.Compiled); + + // No LatinLetterPattern because C# does not support it. Using UnicodeInfo to mirror machine.py private static readonly Regex WhitespacePattern = new Regex(@"[\s~]", RegexOptions.Compiled); private static readonly Regex PunctuationPattern = new Regex(@"[\.,;\?!\)\]\-—۔،؛]", RegexOptions.Compiled); private static readonly Regex QuoteIntroducerPattern = new Regex(@"[:,]\s*$", RegexOptions.Compiled); @@ -22,13 +25,14 @@ public QuotationMarkStringMatch(TextSegment textSegment, int startIndex, int end EndIndex = endIndex; } - public string QuotationMark => TextSegment.Text.Substring(StartIndex, EndIndex - StartIndex); + public string QuotationMark => + new StringInfo(TextSegment.Text).SubstringByTextElements(StartIndex, EndIndex - StartIndex); - public bool IsValidOpeningQuotationMark(QuoteConventionSet quoteConventionSet) => - quoteConventionSet.IsValidOpeningQuotationMark(QuotationMark); + public bool IsValidOpeningQuotationMark(QuoteConventionSet quoteConventions) => + quoteConventions.IsValidOpeningQuotationMark(QuotationMark); - public bool IsValidClosingQuotationMark(QuoteConventionSet quoteConventionSet) => - quoteConventionSet.IsValidClosingQuotationMark(QuotationMark); + public bool IsValidClosingQuotationMark(QuoteConventionSet quoteConventions) => + quoteConventions.IsValidClosingQuotationMark(QuotationMark); public bool QuotationMarkMatches(Regex regexPattern) => regexPattern.IsMatch(QuotationMark); @@ -47,11 +51,14 @@ public string PreviousCharacter TextSegment previousSegment = TextSegment.PreviousSegment; if (previousSegment != null && !TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph)) { - return 
previousSegment.Text[previousSegment.Text.Length - 1].ToString(); + return new StringInfo(previousSegment.Text).SubstringByTextElements( + previousSegment.Text.Length - 1, + 1 + ); } return null; } - return TextSegment.Text[StartIndex - 1].ToString(); + return new StringInfo(TextSegment.Text).SubstringByTextElements(StartIndex - 1, 1); } } @@ -64,11 +71,11 @@ public string NextCharacter TextSegment nextSegment = TextSegment.NextSegment; if (nextSegment != null && !TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph)) { - return nextSegment.Text[0].ToString(); + return new StringInfo(nextSegment.Text).SubstringByTextElements(0, 1); } return null; } - return TextSegment.Text[EndIndex].ToString(); + return new StringInfo(TextSegment.Text).SubstringByTextElements(EndIndex, 1); } } @@ -138,17 +145,34 @@ public bool HasLetterInTrailingSubstring() public bool HasLeadingLatinLetter() { - return PreviousCharacterMatches(LatinLetterPattern); + return PreviousCharacter != null && IsLatinScript(PreviousCharacter); } public bool HasTrailingLatinLetter() { - return NextCharacterMatches(LatinLetterPattern); + return NextCharacter != null && IsLatinScript(NextCharacter); } public bool HasQuoteIntroducerInLeadingSubstring() { return LeadingSubstringMatches(QuoteIntroducerPattern); } + + private bool IsLatinScript(string characterString) + { + string latinScriptAttribute = "LATIN"; + if (characterString.Length == 1) + { + return UnicodeInfo.GetName(characterString[0]).Contains(latinScriptAttribute); + } + else if (char.IsSurrogatePair(characterString[0], characterString[1])) + { + //Get true unicode value + int combinedCharacterValue = + (((int)characterString[0] - 0xD800) * 0x400) + ((int)characterString[1] - 0xDC00) + 0x10000; + return UnicodeInfo.GetName(combinedCharacterValue).Contains(latinScriptAttribute); + } + return false; + } } } diff --git a/src/SIL.Machine/Corpora/Analysis/QuotationMarkTabulator.cs 
b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs similarity index 65% rename from src/SIL.Machine/Corpora/Analysis/QuotationMarkTabulator.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs index 073759c4..68e63912 100644 --- a/src/SIL.Machine/Corpora/Analysis/QuotationMarkTabulator.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs @@ -2,39 +2,35 @@ using System.Collections.Generic; using SIL.Extensions; -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public class QuotationMarkCounts { - private readonly Dictionary _stringCounts; + private readonly Dictionary _quotationMarkCounter; public int TotalCount { get; private set; } public QuotationMarkCounts() { - _stringCounts = new Dictionary(); + _quotationMarkCounter = new Dictionary(); TotalCount = 0; } public void CountQuotationMark(string quotationMark) { - if (!_stringCounts.ContainsKey(quotationMark)) - { - _stringCounts[quotationMark] = 0; - } - _stringCounts[quotationMark]++; + _quotationMarkCounter.UpdateValue(quotationMark, () => 0, i => i + 1); TotalCount++; } public (string BestString, int BestStringCount, int TotalStringCount) FindBestQuotationMarkProportion() { - string bestString = _stringCounts.MaxBy(kvp => kvp.Value).Key; - return (bestString, _stringCounts[bestString], TotalCount); + string bestString = _quotationMarkCounter.MaxBy(kvp => kvp.Value).Key; + return (bestString, _quotationMarkCounter[bestString], TotalCount); } public int CalculateNumDifferences(string expectedQuotationMark) { - if (!_stringCounts.TryGetValue(expectedQuotationMark, out int count)) + if (!_quotationMarkCounter.TryGetValue(expectedQuotationMark, out int count)) { return TotalCount; } @@ -67,26 +63,17 @@ private void CountQuotationMark(QuotationMarkMetadata quote) { (int Depth, QuotationMarkDirection Direction) key = (quote.Depth, quote.Direction); string quotationMark = quote.QuotationMark; - if 
(!_quotationCountsByDepthAndDirection.ContainsKey(key)) - { - _quotationCountsByDepthAndDirection[key] = new QuotationMarkCounts(); - } - _quotationCountsByDepthAndDirection[key].CountQuotationMark(quotationMark); + _quotationCountsByDepthAndDirection.UpdateValue( + key, + () => new QuotationMarkCounts(), + counts => + { + counts.CountQuotationMark(quotationMark); + return counts; + } + ); } - // Used in print function - // private bool DepthAndDirectionObserved(int depth, QuotationMarkDirection direction) => - // _quotationCountsByDepthAndDirection.ContainsKey((depth, direction)); - - // private ( - // string BestQuotationMark, - // int BestQuotationMarkCount, - // int TotalQuotationMarkCount - // ) FindMostCommonQuotationMarkWithDepthAndDirection(int depth, QuotationMarkDirection direction) - // { - // return _quotationCountsByDepthAndDirection[(depth, direction)].FindBestQuotationMarkProportion(); - // } - public double CalculateSimilarity(QuoteConvention quoteConvention) { double numDifferences = 0.0; diff --git a/src/SIL.Machine/Corpora/Analysis/QuoteConvention.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConvention.cs similarity index 59% rename from src/SIL.Machine/Corpora/Analysis/QuoteConvention.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConvention.cs index 68404e4f..a5d6c640 100644 --- a/src/SIL.Machine/Corpora/Analysis/QuoteConvention.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConvention.cs @@ -1,9 +1,7 @@ -using System; using System.Collections.Generic; using System.Linq; -using System.Text; -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public class SingleLevelQuoteConvention { @@ -22,24 +20,27 @@ public class SingleLevelQuoteConvention { "\u300c", '"' }, { "\u300d", '"' } }; - public string OpeningQuote { get; } - public string ClosingQuote { get; } + public string OpeningQuotationMark { get; } + public string ClosingQuotationMark { get; } - public 
SingleLevelQuoteConvention(string openingQuote, string closingQuote) + public SingleLevelQuoteConvention(string openingQuotationMark, string closingQuotationMark) { - OpeningQuote = openingQuote; - ClosingQuote = closingQuote; + OpeningQuotationMark = openingQuotationMark; + ClosingQuotationMark = closingQuotationMark; } public SingleLevelQuoteConvention Normalize() { - string normalizedOpeningQuote = QuoteNormalizationMap.TryGetValue(OpeningQuote, out char quote) + string normalizedOpeningQuotationMark = QuoteNormalizationMap.TryGetValue( + OpeningQuotationMark, + out char quote + ) ? quote.ToString() - : OpeningQuote; - string normalizedClosingQuote = QuoteNormalizationMap.TryGetValue(ClosingQuote, out quote) + : OpeningQuotationMark; + string normalizedClosingQuotationMark = QuoteNormalizationMap.TryGetValue(ClosingQuotationMark, out quote) ? quote.ToString() - : ClosingQuote; - return new SingleLevelQuoteConvention(normalizedOpeningQuote, normalizedClosingQuote); + : ClosingQuotationMark; + return new SingleLevelQuoteConvention(normalizedOpeningQuotationMark, normalizedClosingQuotationMark); } } @@ -57,14 +58,14 @@ public QuoteConvention(string name, List levels) public int NumLevels => Levels.Count; - public string GetOpeningQuoteAtLevel(int level) + public string GetOpeningQuotationMarkAtLevel(int level) { - return Levels[level - 1].OpeningQuote; + return Levels[level - 1].OpeningQuotationMark; } - public string GetClosingQuoteAtLevel(int level) + public string GetClosingQuotationMarkAtLevel(int level) { - return Levels[level - 1].ClosingQuote; + return Levels[level - 1].ClosingQuotationMark; } public string GetExpectedQuotationMark(int depth, QuotationMarkDirection direction) @@ -72,15 +73,15 @@ public string GetExpectedQuotationMark(int depth, QuotationMarkDirection directi if (depth > NumLevels || depth < 1) return ""; return direction == QuotationMarkDirection.Opening - ? GetOpeningQuoteAtLevel(depth) - : GetClosingQuoteAtLevel(depth); + ? 
GetOpeningQuotationMarkAtLevel(depth) + : GetClosingQuotationMarkAtLevel(depth); } private bool IncludesOpeningQuotationMark(string openingQuotationMark) { foreach (SingleLevelQuoteConvention level in Levels) { - if (level.OpeningQuote == openingQuotationMark) + if (level.OpeningQuotationMark == openingQuotationMark) return true; } return false; @@ -90,7 +91,7 @@ private bool IncludesClosingQuotationMark(string closingQuotationMark) { foreach (SingleLevelQuoteConvention level in Levels) { - if (level.ClosingQuote == closingQuotationMark) + if (level.ClosingQuotationMark == closingQuotationMark) return true; } return false; @@ -101,9 +102,9 @@ public HashSet GetPossibleDepths(string quotationMark, QuotationMarkDirecti var depths = new HashSet(); foreach ((int depth, SingleLevelQuoteConvention level) in Levels.Select((l, i) => (i + 1, l))) { - if (direction == QuotationMarkDirection.Opening && level.OpeningQuote == quotationMark) + if (direction == QuotationMarkDirection.Opening && level.OpeningQuotationMark == quotationMark) depths.Add(depth); - else if (direction == QuotationMarkDirection.Closing && level.ClosingQuote == quotationMark) + else if (direction == QuotationMarkDirection.Closing && level.ClosingQuotationMark == quotationMark) depths.Add(depth); } return depths; @@ -126,9 +127,9 @@ List closingQuotationMarks } // we require the first-level quotes to have been observed - if (!openingQuotationMarks.Contains(GetOpeningQuoteAtLevel(1))) + if (!openingQuotationMarks.Contains(GetOpeningQuotationMarkAtLevel(1))) return false; - if (!closingQuotationMarks.Contains(GetClosingQuoteAtLevel(1))) + if (!closingQuotationMarks.Contains(GetClosingQuotationMarkAtLevel(1))) return false; return true; } @@ -137,38 +138,5 @@ public QuoteConvention Normalize() { return new QuoteConvention(Name + "_normalized", Levels.Select(l => l.Normalize()).ToList()); } - - public void PrintSummary() - { - Console.WriteLine(GetSummaryMessage()); - } - - private string GetSummaryMessage() - 
{ - var summary = new StringBuilder(Name + "\n"); - foreach ((int level, SingleLevelQuoteConvention convention) in Levels.Select((l, i) => (i, l))) - { - string ordinalName = GetOrdinalName(level + 1); - summary.Append($"{convention.OpeningQuote}{ordinalName}-level quote{convention.ClosingQuote}\n"); - } - return summary.ToString(); - } - - private string GetOrdinalName(int level) - { - switch (level) - { - case 1: - return "First"; - case 2: - return "Second"; - case 3: - return "Third"; - case 4: - return "Fourth"; - default: - return level.ToString() + "th"; - } - } } } diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetectionResolutionSettings.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetectionResolutionSettings.cs new file mode 100644 index 00000000..05583408 --- /dev/null +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetectionResolutionSettings.cs @@ -0,0 +1,55 @@ +using System.Collections.Generic; +using System.Text.RegularExpressions; + +namespace SIL.Machine.Corpora.PunctuationAnalysis +{ + public class QuoteConventionDetectionResolutionSettings : IQuotationMarkResolutionSettings + { + private readonly QuoteConventionSet _quoteConventions; + + public QuoteConventionDetectionResolutionSettings(QuoteConventionSet quoteConventions) + { + _quoteConventions = quoteConventions; + } + + public bool AreMarksAValidPair(string openingMark, string closingMark) + { + return _quoteConventions.MarksAreAValidPair(openingMark, closingMark); + } + + public Regex GetClosingQuotationMarkRegex() + { + return _quoteConventions.ClosingQuotationMarkRegex; + } + + public Regex GetOpeningQuotationMarkRegex() + { + return _quoteConventions.OpeningQuotationMarkRegex; + } + + public HashSet GetPossibleDepths(string quotationMark, QuotationMarkDirection direction) + { + return _quoteConventions.GetPossibleDepths(quotationMark, direction); + } + + public bool IsValidClosingQuotationMark(QuotationMarkStringMatch 
quotationMarkMatch) + { + return quotationMarkMatch.IsValidClosingQuotationMark(_quoteConventions); + } + + public bool IsValidOpeningQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + return quotationMarkMatch.IsValidOpeningQuotationMark(_quoteConventions); + } + + public bool MetadataMatchesQuotationMark(string quotationMark, int depth, QuotationMarkDirection direction) + { + return _quoteConventions.MetadataMatchesQuotationMark(quotationMark, depth, direction); + } + + public bool ShouldRelyOnParagraphMarkers() + { + return true; + } + } +} diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetector.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetector.cs new file mode 100644 index 00000000..dcef862b --- /dev/null +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetector.cs @@ -0,0 +1,65 @@ +using System.Collections.Generic; +using System.Linq; + +namespace SIL.Machine.Corpora.PunctuationAnalysis +{ + public class QuoteConventionAnalysis + { + public QuoteConvention BestQuoteConvention { get; set; } + public double BestQuoteConventionScore { get; set; } + + public QuoteConventionAnalysis(QuoteConvention bestQuoteConvention, double bestQuoteConventionScore) + { + BestQuoteConvention = bestQuoteConvention; + BestQuoteConventionScore = bestQuoteConventionScore; + } + } + + public class QuoteConventionDetector : UsfmStructureExtractor + { + private readonly QuotationMarkTabulator _quotationMarkTabulator; + + public QuoteConventionDetector() + : base() + { + _quotationMarkTabulator = new QuotationMarkTabulator(); + } + + private void CountQuotationMarksInChapters(List chapters) + { + QuoteConventionSet possibleQuoteConvetions = new PreliminaryQuotationMarkAnalyzer( + StandardQuoteConventions.QuoteConventions + ).NarrowDownPossibleQuoteConventions(chapters); + + foreach (Chapter chapter in chapters) + CountQuotationMarksInChapter(chapter, possibleQuoteConvetions); + } + + private void 
CountQuotationMarksInChapter(Chapter chapter, QuoteConventionSet possibleQuoteConventions) + { + List quotationMarkMatches = new QuotationMarkFinder( + possibleQuoteConventions + ).FindAllPotentialQuotationMarksInChapter(chapter); + + List resolvedQuotationMarks = new DepthBasedQuotationMarkResolver( + new QuoteConventionDetectionResolutionSettings(possibleQuoteConventions) + ) + .ResolveQuotationMarks(quotationMarkMatches) + .ToList(); + + _quotationMarkTabulator.Tabulate(resolvedQuotationMarks); + } + + public QuoteConventionAnalysis DetectQuotationConvention() + { + CountQuotationMarksInChapters(GetChapters()); + + (QuoteConvention bestQuoteConvention, double score) = + StandardQuoteConventions.QuoteConventions.FindMostSimilarConvention(_quotationMarkTabulator); + + if (score > 0 && bestQuoteConvention != null) + return new QuoteConventionAnalysis(bestQuoteConvention, score); + return null; + } + } +} diff --git a/src/SIL.Machine/Corpora/Analysis/QuoteConventionSet.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionSet.cs similarity index 71% rename from src/SIL.Machine/Corpora/Analysis/QuoteConventionSet.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionSet.cs index 22521ee1..922ec873 100644 --- a/src/SIL.Machine/Corpora/Analysis/QuoteConventionSet.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionSet.cs @@ -1,9 +1,10 @@ using System.Collections.Generic; +using System.Collections.Immutable; using System.Linq; using System.Text.RegularExpressions; using SIL.Extensions; -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public class QuoteConventionSet { @@ -19,7 +20,7 @@ public class QuoteConventionSet public QuoteConventionSet(List conventions) { Conventions = conventions; - CreateQuoteRegexes(); + CreateQuotationMarkRegexes(); CreateQuotationMarkPairMap(); } @@ -36,55 +37,42 @@ public override int GetHashCode() return hashCode * 31 + Conventions.GetHashCode(); } - 
private void CreateQuoteRegexes() + private void CreateQuotationMarkRegexes() { + OpeningQuotationMarkRegex = new Regex(@"", RegexOptions.Compiled); + ClosingQuotationMarkRegex = new Regex(@"", RegexOptions.Compiled); + AllQuotationMarkRegex = new Regex(@"", RegexOptions.Compiled); + var openingQuotationMarks = new HashSet(); var closingQuotationMarks = new HashSet(); - var allQuotationMarks = new HashSet(); - if (Conventions.Count > 0) + foreach (QuoteConvention convention in Conventions) { - foreach (QuoteConvention convention in Conventions) - { - for (int level = 1; level < convention.NumLevels + 1; level++) - { - string openingQuote = convention.GetOpeningQuoteAtLevel(level); - string closingQuote = convention.GetClosingQuoteAtLevel(level); - openingQuotationMarks.Add(openingQuote); - closingQuotationMarks.Add(closingQuote); - allQuotationMarks.Add(openingQuote); - allQuotationMarks.Add(closingQuote); - } - } - - if (allQuotationMarks.Count > 0) + for (int level = 1; level < convention.NumLevels + 1; level++) { - OpeningQuotationMarkRegex = new Regex( - @"[" + string.Join("", openingQuotationMarks.OrderBy(q => q)) + "]", - RegexOptions.Compiled - ); - ClosingQuotationMarkRegex = new Regex( - @"[" + string.Join("", closingQuotationMarks.OrderBy(q => q)) + "]", - RegexOptions.Compiled - ); - AllQuotationMarkRegex = new Regex( - @"[" + string.Join("", allQuotationMarks.OrderBy(q => q)) + "]", - RegexOptions.Compiled - ); + string openingQuote = convention.GetOpeningQuotationMarkAtLevel(level); + string closingQuote = convention.GetClosingQuotationMarkAtLevel(level); + openingQuotationMarks.Add(openingQuote); + closingQuotationMarks.Add(closingQuote); } } - if (openingQuotationMarks.Count == 0) - { - OpeningQuotationMarkRegex = new Regex(@"", RegexOptions.Compiled); - } - if (closingQuotationMarks.Count == 0) - { - ClosingQuotationMarkRegex = new Regex(@"", RegexOptions.Compiled); - } - if (allQuotationMarks.Count == 0) + var allQuotationMarks = 
openingQuotationMarks.Union(closingQuotationMarks).ToImmutableHashSet(); + + if (allQuotationMarks.Count > 0) { - AllQuotationMarkRegex = new Regex(@"", RegexOptions.Compiled); + OpeningQuotationMarkRegex = new Regex( + @"[" + string.Join("", openingQuotationMarks.OrderBy(q => q)) + "]", + RegexOptions.Compiled + ); + ClosingQuotationMarkRegex = new Regex( + @"[" + string.Join("", closingQuotationMarks.OrderBy(q => q)) + "]", + RegexOptions.Compiled + ); + AllQuotationMarkRegex = new Regex( + @"[" + string.Join("", allQuotationMarks.OrderBy(q => q)) + "]", + RegexOptions.Compiled + ); } } @@ -96,20 +84,30 @@ private void CreateQuotationMarkPairMap() { for (int level = 1; level < convention.NumLevels + 1; level++) { - string openingQuote = convention.GetOpeningQuoteAtLevel(level); - string closingQuote = convention.GetClosingQuoteAtLevel(level); - if (!closingMarksByOpeningMark.ContainsKey(openingQuote)) - { - closingMarksByOpeningMark[openingQuote] = new HashSet(); - } - closingMarksByOpeningMark[openingQuote].Add(closingQuote); - if (!openingMarksByClosingMark.ContainsKey(closingQuote)) - { - openingMarksByClosingMark[closingQuote] = new HashSet(); - } - closingMarksByOpeningMark[closingQuote].Add(openingQuote); + string openingQuote = convention.GetOpeningQuotationMarkAtLevel(level); + string closingQuote = convention.GetClosingQuotationMarkAtLevel(level); + closingMarksByOpeningMark.UpdateValue( + openingQuote, + () => new HashSet(), + set => + { + set.Add(closingQuote); + return set; + } + ); + openingMarksByClosingMark.UpdateValue( + closingQuote, + () => new HashSet(), + set => + { + set.Add(openingQuote); + return set; + } + ); } } + ClosingMarksByOpeningMark = closingMarksByOpeningMark; + OpeningMarksByClosingMark = openingMarksByClosingMark; } public QuoteConvention GetQuoteConventionByName(string name) diff --git a/src/SIL.Machine/Corpora/Analysis/StandardQuoteConventions.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/StandardQuoteConventions.cs 
similarity index 98% rename from src/SIL.Machine/Corpora/Analysis/StandardQuoteConventions.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/StandardQuoteConventions.cs index 710ad309..c3e3f974 100644 --- a/src/SIL.Machine/Corpora/Analysis/StandardQuoteConventions.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/StandardQuoteConventions.cs @@ -1,10 +1,10 @@ using System.Collections.Generic; -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public class StandardQuoteConventions { - public static QuoteConventionSet QuoteConventionSet = new QuoteConventionSet( + public static QuoteConventionSet QuoteConventions = new QuoteConventionSet( new List { new QuoteConvention( diff --git a/src/SIL.Machine/Corpora/Analysis/TextSegment.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/TextSegment.cs similarity index 93% rename from src/SIL.Machine/Corpora/Analysis/TextSegment.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/TextSegment.cs index 7aa89a79..10a0ad46 100644 --- a/src/SIL.Machine/Corpora/Analysis/TextSegment.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/TextSegment.cs @@ -1,6 +1,6 @@ using System.Collections.Generic; -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public class TextSegment { @@ -34,7 +34,10 @@ public override bool Equals(object obj) return _text.Equals(t._text) && _indexInVerse.Equals(t._indexInVerse) && _numSegmentsInVerse.Equals(t._numSegmentsInVerse) - && _usfmToken.Equals(t._usfmToken) + && ( + (_usfmToken == null && t._usfmToken == null) + || (_usfmToken != null && t._usfmToken != null && _usfmToken.Equals(t._usfmToken)) + ) && _immediatePrecedingMarker.Equals(t._immediatePrecedingMarker); } @@ -60,7 +63,7 @@ public override int GetHashCode() public string SubstringBefore(int index) { - return _text.Substring(0, _text.Length - index); + return _text.Substring(0, index); } public string SubstringAfter(int index) diff --git 
a/src/SIL.Machine/Corpora/Analysis/UsfmMarkerType.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmMarkerType.cs similarity index 76% rename from src/SIL.Machine/Corpora/Analysis/UsfmMarkerType.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmMarkerType.cs index 9ef2b55a..f3fe1742 100644 --- a/src/SIL.Machine/Corpora/Analysis/UsfmMarkerType.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmMarkerType.cs @@ -1,4 +1,4 @@ -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public enum UsfmMarkerType { diff --git a/src/SIL.Machine/Corpora/Analysis/UsfmStructureExtractor.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmStructureExtractor.cs similarity index 97% rename from src/SIL.Machine/Corpora/Analysis/UsfmStructureExtractor.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmStructureExtractor.cs index e19ad271..2a76664f 100644 --- a/src/SIL.Machine/Corpora/Analysis/UsfmStructureExtractor.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmStructureExtractor.cs @@ -1,6 +1,6 @@ using System.Collections.Generic; -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public class UsfmStructureExtractor : IUsfmParserHandler { @@ -107,7 +107,7 @@ public void Text(UsfmParserState state, string text) _nextTextSegmentBuilder.SetText(text); TextSegment textSegment = _nextTextSegmentBuilder.Build(); // don't look past verse boundaries, to enable identical functionality in the - // online one-verse-at-a-time (QuotationDenormalizationScriptureUpdateBlockHandler) + // online one-verse-at-a-time (QuotationMarkDenormalizationScriptureUpdateBlockHandler) // and offline whole-book-at-once settings (QuoteConventionDetector) if (_textSegments.Count > 0 && !textSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse)) { diff --git a/src/SIL.Machine/Corpora/Analysis/Verse.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/Verse.cs similarity index 92% rename from 
src/SIL.Machine/Corpora/Analysis/Verse.cs rename to src/SIL.Machine/Corpora/PunctuationAnalysis/Verse.cs index b630faaa..86730b6c 100644 --- a/src/SIL.Machine/Corpora/Analysis/Verse.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/Verse.cs @@ -1,7 +1,7 @@ using System.Collections.Generic; using System.Linq; -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public class Verse { diff --git a/src/SIL.Machine/Corpora/QuotationDenormalizationUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/QuotationDenormalizationUsfmUpdateBlockHandler.cs index 1b3cb5ff..31b4b95d 100644 --- a/src/SIL.Machine/Corpora/QuotationDenormalizationUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/Corpora/QuotationDenormalizationUsfmUpdateBlockHandler.cs @@ -1,9 +1,10 @@ -using SIL.Machine.Corpora.Analysis; +using SIL.Machine.Corpora.PunctuationAnalysis; namespace SIL.Machine.Corpora { public class QuotationDenormalizationUsfmUpdateBlockHandler : QuoteConventionChangingUsfmUpdateBlockHandler { + // This is a convenience class so that users don't have to know to normalize the source quote convention public QuotationDenormalizationUsfmUpdateBlockHandler( QuoteConvention sourceQuoteConvention, QuoteConvention targetQuoteConvention, @@ -12,7 +13,7 @@ public QuotationDenormalizationUsfmUpdateBlockHandler( : base( sourceQuoteConvention.Normalize(), targetQuoteConvention, - settings == null ? new QuotationMarkUpdateSettings() : null //TODO pass conventions? + settings ??
new QuotationMarkUpdateSettings() ) { } } } diff --git a/src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs b/src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs new file mode 100644 index 00000000..9a8c0050 --- /dev/null +++ b/src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs @@ -0,0 +1,14 @@ +using SIL.Machine.Corpora.PunctuationAnalysis; + +namespace SIL.Machine.Corpora +{ + // This is a convenience class so that users don't have to know to normalize the source quote convention + public class QuotationMarkDenormalizationFirstPass : QuotationMarkUpdateFirstPass + { + public QuotationMarkDenormalizationFirstPass( + QuoteConvention sourceQuoteConvention, + QuoteConvention targetQuoteConvention + ) + : base(sourceQuoteConvention.Normalize(), targetQuoteConvention) { } + } +} diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs index 034024be..96482599 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs @@ -1,6 +1,6 @@ using System.Collections.Generic; using System.Linq; -using SIL.Machine.Corpora.Analysis; +using SIL.Machine.Corpora.PunctuationAnalysis; namespace SIL.Machine.Corpora { @@ -35,9 +35,9 @@ QuoteConvention targetQuoteConvention ) { var targetMarksBySourceMarks = new Dictionary>(); - foreach (int level in Enumerable.Range(1, sourceQuoteConvention.NumLevels)) //TODO level vs depth + foreach (int level in Enumerable.Range(1, sourceQuoteConvention.NumLevels)) { - string openingQuotationMark = sourceQuoteConvention.GetOpeningQuoteAtLevel(level); + string openingQuotationMark = sourceQuoteConvention.GetOpeningQuotationMarkAtLevel(level); if (!targetMarksBySourceMarks.TryGetValue(openingQuotationMark, out HashSet marks)) { marks = new HashSet(); @@ -45,7 +45,7 @@ QuoteConvention targetQuoteConvention } if (level <= targetQuoteConvention.NumLevels) { -
marks.Add(targetQuoteConvention.GetClosingQuoteAtLevel(level)); + marks.Add(targetQuoteConvention.GetClosingQuotationMarkAtLevel(level)); } } @@ -77,7 +77,7 @@ private QuotationMarkUpdateStrategy FindBestStrategyForChapter(Chapter chapter) private QuotationMarkUpdateStrategy ChooseBestStrategyBasedOnObservedIssues( HashSet issues - ) //TODO type hinting + ) { if (issues.Contains(QuotationMarkResolutionIssue.AmbiguousQuotationMark)) return QuotationMarkUpdateStrategy.Skip; diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs index 0973bcd8..6713c56e 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs @@ -1,7 +1,7 @@ using System.Collections.Generic; using System.Text.RegularExpressions; -namespace SIL.Machine.Corpora.Analysis +namespace SIL.Machine.Corpora.PunctuationAnalysis { public class QuotationMarkUpdateResolutionSettings : IQuotationMarkResolutionSettings { @@ -15,8 +15,8 @@ QuoteConvention targetQuoteConvention ) { _sourceQuoteConvention = sourceQuoteConvention; - _quoteConventionSingletonSet = new QuoteConventionSet(new List { sourceQuoteConvention }); //TODO also seems unnecessary to have both. 
- _targetQuoteConvention = targetQuoteConvention; //TODO unused + _quoteConventionSingletonSet = new QuoteConventionSet(new List { sourceQuoteConvention }); + _targetQuoteConvention = targetQuoteConvention; } public bool AreMarksAValidPair(string openingMark, string closingMark) diff --git a/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs index d781c98d..ef4599b1 100644 --- a/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs @@ -1,5 +1,5 @@ using System.Collections.Generic; -using SIL.Machine.Corpora.Analysis; +using SIL.Machine.Corpora.PunctuationAnalysis; namespace SIL.Machine.Corpora { @@ -65,14 +65,14 @@ public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block) private UsfmUpdateBlock ApplyFallbackUpdating(UsfmUpdateBlock block) { - foreach (UsfmUpdateBlockElement element in block.Elements) //TODO use Elements not _elements + foreach (UsfmUpdateBlockElement element in block.Elements) ProcessScriptureElement(element, _simpleQuotationMarkResolver); return block; } private UsfmUpdateBlock ApplyStandardUpdating(UsfmUpdateBlock block) { - foreach (UsfmUpdateBlockElement element in block.Elements) //TODO same + foreach (UsfmUpdateBlockElement element in block.Elements) { if (element.Type == UsfmUpdateBlockElementType.Embed) { @@ -136,7 +136,7 @@ private List CreateTextSegments(UsfmUpdateBlockElement element) private TextSegment CreateTextSegment(UsfmToken token) { - TextSegment textSegmentToReturn = null; //TODO cleaner + TextSegment textSegmentToReturn = null; _nextScriptureTextSegmentBuilder.SetUsfmToken(token); if (token.Text != null) { @@ -166,7 +166,7 @@ private void CheckForChapterChange(UsfmUpdateBlock block) if (scriptureRef.ChapterNum != _currentChapterNumber) { _currentChapterNumber = scriptureRef.ChapterNum; - StartNewChapter(_currentChapterNumber); //TODO 
pass field in method? + StartNewChapter(_currentChapterNumber); } } } @@ -186,7 +186,7 @@ private void CheckForVerseChange(UsfmUpdateBlock block) if (scriptureRef.ChapterNum == _currentChapterNumber && scriptureRef.VerseNum != _currentVerseNumber) { _currentVerseNumber = scriptureRef.VerseNum; - StartNewVerse(); //TODO same + unused + StartNewVerse(); } } } diff --git a/src/SIL.Machine/Corpora/UsfmToken.cs b/src/SIL.Machine/Corpora/UsfmToken.cs index c0b105b9..2bc97322 100644 --- a/src/SIL.Machine/Corpora/UsfmToken.cs +++ b/src/SIL.Machine/Corpora/UsfmToken.cs @@ -64,6 +64,34 @@ public string NestlessMarker get { return Marker != null && Marker[0] == '+' ? Marker.Substring(1) : Marker; } } + public override bool Equals(object obj) + { + if (obj is UsfmToken other) + { + return Type == other.Type + && Marker == other.Marker + && Text == other.Text + && EndMarker == other.EndMarker + && Data == other.Data + && LineNumber == other.LineNumber + && ColumnNumber == other.ColumnNumber; + } + return false; + } + + public override int GetHashCode() + { + int hashCode = 23; + hashCode = hashCode * 31 + Type.GetHashCode(); + hashCode = hashCode * 31 + (Marker?.GetHashCode() ?? 0); + hashCode = hashCode * 31 + (Text?.GetHashCode() ?? 0); + hashCode = hashCode * 31 + (EndMarker?.GetHashCode() ?? 0); + hashCode = hashCode * 31 + (Data?.GetHashCode() ?? 
0); + hashCode = hashCode * 31 + LineNumber.GetHashCode(); + hashCode = hashCode * 31 + ColumnNumber.GetHashCode(); + return hashCode; + } + public string GetAttribute(string name) { if (Attributes == null || Attributes.Count == 0) diff --git a/src/SIL.Machine/SIL.Machine.csproj b/src/SIL.Machine/SIL.Machine.csproj index 6a7cfbcd..91a88c5d 100644 --- a/src/SIL.Machine/SIL.Machine.csproj +++ b/src/SIL.Machine/SIL.Machine.csproj @@ -43,6 +43,7 @@ + diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs new file mode 100644 index 00000000..eea2f897 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs @@ -0,0 +1,3464 @@ +using System.Unicode; +using NUnit.Framework; + +namespace SIL.Machine.Corpora.PunctuationAnalysis; + +[TestFixture] +public class DepthBasedQuotationMarkResolverTests +{ + [Test] + public void CurrentDepthQuotationMarkResolverState() + { + var state = new QuotationMarkResolverState(); + Assert.That(state.CurrentDepth, Is.EqualTo(0)); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.That(state.CurrentDepth, Is.EqualTo(1)); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.That(state.CurrentDepth, Is.EqualTo(2)); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ); + Assert.That(state.CurrentDepth, Is.EqualTo(1)); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ); + Assert.That(state.CurrentDepth, Is.EqualTo(0)); + } + + [Test] + public void HasOpenQuotationMark() + { + var state = new QuotationMarkResolverState(); + 
Assert.IsFalse(state.HasOpenQuotationMark); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsTrue(state.HasOpenQuotationMark); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.IsTrue(state.HasOpenQuotationMark); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ); + Assert.IsTrue(state.HasOpenQuotationMark); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ); + Assert.IsFalse(state.HasOpenQuotationMark); + } + + [Test] + public void AreMoreThanNQuotesOpen() + { + var state = new QuotationMarkResolverState(); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(1)); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(2)); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(1)); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(2)); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.IsTrue(state.AreMoreThanNQuotesOpen(1)); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(2)); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(1)); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(2)); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(1)); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(2)); + } + + [Test] + public void GetOpeningQuotationMarkAtDepth() + { + var state = new QuotationMarkResolverState(); + 
Assert.Throws(() => state.GetOpeningQuotationMarkAtDepth(1)); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.That(state.GetOpeningQuotationMarkAtDepth(1), Is.EqualTo("\u201c")); + Assert.Throws(() => state.GetOpeningQuotationMarkAtDepth(2)); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.That(state.GetOpeningQuotationMarkAtDepth(1), Is.EqualTo("\u201c")); + Assert.That(state.GetOpeningQuotationMarkAtDepth(2), Is.EqualTo("\u2018")); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ); + Assert.That(state.GetOpeningQuotationMarkAtDepth(1), Is.EqualTo("\u201c")); + Assert.Throws(() => state.GetOpeningQuotationMarkAtDepth(2)); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ); + Assert.Throws(() => state.GetOpeningQuotationMarkAtDepth(1)); + } + + [Test] + public void GetDeepestOpeningMark() + { + var state = new QuotationMarkResolverState(); + Assert.Throws(() => state.GetDeepestOpeningQuotationMark()); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.That(state.GetDeepestOpeningQuotationMark(), Is.EqualTo("\u201c")); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.That(state.GetDeepestOpeningQuotationMark(), Is.EqualTo("\u2018")); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ); + Assert.That(state.GetDeepestOpeningQuotationMark(), Is.EqualTo("\u201c")); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) 
+ ); + Assert.Throws(() => state.GetDeepestOpeningQuotationMark()); + } + + [Test] + public void GetCurrentDepthQuotationContinuerState() + { + var resolverState = new QuotationMarkResolverState(); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + + var continuerState = new TestQuoteContinuerState(); + Assert.That(continuerState.CurrentDepth, Is.EqualTo(0)); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.That(continuerState.CurrentDepth, Is.EqualTo(1)); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.That(continuerState.CurrentDepth, Is.EqualTo(2)); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.That(continuerState.CurrentDepth, Is.EqualTo(0)); + } + + [Test] + public void HasContinuerBeenObserved() + { + var resolverState = new QuotationMarkResolverState(); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + + var continuerState = new 
TestQuoteContinuerState(); + Assert.IsFalse(continuerState.ContinuerHasBeenObserved()); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.IsTrue(continuerState.ContinuerHasBeenObserved()); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.IsTrue(continuerState.ContinuerHasBeenObserved()); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.IsFalse(continuerState.ContinuerHasBeenObserved()); + } + + [Test] + public void GetContinuerStyle() + { + var resolverState = new QuotationMarkResolverState(); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + + var continuerState = new TestQuoteContinuerState(); + Assert.That(continuerState.InternalContinuerStyle, Is.EqualTo(QuoteContinuerStyle.Undetermined)); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.That(continuerState.InternalContinuerStyle, Is.EqualTo(QuoteContinuerStyle.English)); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.Spanish + ); + Assert.That(continuerState.InternalContinuerStyle, 
Is.EqualTo(QuoteContinuerStyle.Spanish)); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.That(continuerState.InternalContinuerStyle, Is.EqualTo(QuoteContinuerStyle.English)); + } + + [Test] + public void AddQuotationContinuer() + { + var resolverState = new QuotationMarkResolverState(); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + + var continuerState = new TestQuoteContinuerState(); + + var result1 = continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.That( + result1, + Is.EqualTo( + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().SetText("\u201c").Build(), + 0, + 1 + ) + ) + ); + + var result2 = continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.Spanish + ); + Assert.That( + result2, + Is.EqualTo( + new QuotationMarkMetadata( + "\u2018", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().SetText("\u2018").Build(), + 0, + 1 + ) + ) + ); + Assert.That(continuerState.InternalContinuerStyle, Is.EqualTo(QuoteContinuerStyle.Spanish)); + + var result3 = continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.That( + result3, + 
Is.EqualTo( + new QuotationMarkMetadata( + "\u201c", + 3, + QuotationMarkDirection.Opening, + new TextSegment.Builder().SetText("\u201c").Build(), + 0, + 1 + ) + ) + ); + } + + [Test] + public void IsEnglishQuotationContinuer() + { + var standardEnglish = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english"); + Assert.IsNotNull(standardEnglish); + + var settings = new QuoteConventionDetectionResolutionSettings(new QuoteConventionSet([standardEnglish])); + var resolverState = new QuotationMarkResolverState(); + var continuerState = new TestQuoteContinuerState(); + var categorizer = new QuotationMarkCategorizer(settings, resolverState, continuerState); + + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + + // Should always be false if the continuer style is Spanish + continuerState.InternalContinuerStyle = QuoteContinuerStyle.English; + Assert.IsTrue( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + + continuerState.InternalContinuerStyle = QuoteContinuerStyle.Spanish; + Assert.IsFalse( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + continuerState.InternalContinuerStyle = QuoteContinuerStyle.English; + + // Should be false if there's no preceding paragraph marker (and the settings say to rely on markers) + Assert.IsFalse( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201ctest").Build(), 0, 1), + null, + null + ) + ); + + Assert.IsTrue( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + 
.SetText("\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + + var categorizerForDenorm = new QuotationMarkCategorizer( + new QuotationMarkUpdateResolutionSettings(standardEnglish, standardEnglish), + resolverState, + continuerState + ); + Assert.IsTrue( + categorizerForDenorm.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201ctest").Build(), 0, 1), + null, + null + ) + ); + + // Should be false if there are no open quotation marks + var emptyState = new QuotationMarkResolverState(); + var emptyCategorizer = new QuotationMarkCategorizer(settings, emptyState, continuerState); + Assert.IsFalse( + emptyCategorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + + // Should be false if the starting index of the quotation mark is greater than 0 + Assert.IsFalse( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText(" \u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + null, + null + ) + ); + + // Should be false if the mark does not match the already opened mark + Assert.IsFalse( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u2018test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + + // If there are multiple open quotes, the next quote continuer must follow immediately after the current one + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.IsFalse( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201ctest") + 
.AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + Assert.IsTrue( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u2018test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u2018test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ) + ) + ); + Assert.IsTrue( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ) + ) + ); + + // When there are multiple open quotes, the continuer must match the deepest observed mark + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u2018test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + resolverState, + QuoteContinuerStyle.English + ); + + Assert.IsFalse( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + null, + null + ) + ); + Assert.IsTrue( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u2018test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + null, + null + ) + ); + + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsTrue( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + 
.SetText("\u201c\u2018\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + null, + null + ) + ); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u2018\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + resolverState, + QuoteContinuerStyle.English + ); + + Assert.IsFalse( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u2018\u2018test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 2, + 3 + ), + null, + null + ) + ); + Assert.IsTrue( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u2018\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 2, + 3 + ), + null, + null + ) + ); + } + + [Test] + public void IsSpanishQuotationContinuer() + { + var westernEuropeanQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + "western_european" + ); + Assert.IsNotNull(westernEuropeanQuoteConvention); + + var settings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([westernEuropeanQuoteConvention]) + ); + var resolverState = new QuotationMarkResolverState(); + var continuerState = new TestQuoteContinuerState(); + var categorizer = new QuotationMarkCategorizer(settings, resolverState, continuerState); + + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00ab").Build(), 0, 1) + ); + + // Should always be false if the continuer style is English + continuerState.InternalContinuerStyle = QuoteContinuerStyle.Spanish; + Assert.IsTrue( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + + 
continuerState.InternalContinuerStyle = QuoteContinuerStyle.English; + Assert.IsFalse( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + continuerState.InternalContinuerStyle = QuoteContinuerStyle.Spanish; + + // Should be false if there's no preceding paragraph marker (and the settings say to rely on markers) + Assert.IsFalse( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bbtest").Build(), 0, 1), + null, + null + ) + ); + + Assert.IsTrue( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + + var categorizerForDenorm = new QuotationMarkCategorizer( + new QuotationMarkUpdateResolutionSettings(westernEuropeanQuoteConvention, westernEuropeanQuoteConvention), + resolverState, + continuerState + ); + Assert.IsTrue( + categorizerForDenorm.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bbtest").Build(), 0, 1), + null, + null + ) + ); + + // Should be false if there are no open quotation marks + var emptyState = new QuotationMarkResolverState(); + var emptyCategorizer = new QuotationMarkCategorizer(settings, emptyState, continuerState); + Assert.IsFalse( + emptyCategorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + + // Should be false if the starting index of the quotation mark is greater than 0 + Assert.IsFalse( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText(" \u00bbtest") + 
.AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + null, + null + ) + ); + + // Should be false if the mark does not match the already opened mark + Assert.IsFalse( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201dtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + + // If there are multiple open quotes, the next quote continuer must follow immediately after the current one + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsFalse( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + Assert.IsTrue( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201dtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201dtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ) + ) + ); + Assert.IsTrue( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u00bbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u00bbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ) + ) + ); + + // When there are multiple open quotes, the continuer must match the deepest observed mark + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201dtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), 
+ resolverState, + QuoteContinuerStyle.Spanish + ); + + Assert.IsFalse( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201cbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + null, + null + ) + ); + Assert.IsTrue( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201dtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + null, + null + ) + ); + + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.IsTrue( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201d\u2019test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + null, + null + ) + ); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201d\u2019test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + resolverState, + QuoteContinuerStyle.Spanish + ); + + Assert.IsFalse( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201d\u201dtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 2, + 3 + ), + null, + null + ) + ); + Assert.IsTrue( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201d\u2019test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 2, + 3 + ), + null, + null + ) + ); + } + + [Test] + public void IsOpeningQuote() + { + var centralEuropeanQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("central_european") + ); + Assert.IsNotNull(centralEuropeanQuoteConvention); + var centralEuropeanResolverSettings = new 
QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([centralEuropeanQuoteConvention]) + ); + var quotationMarkResolverState = new QuotationMarkResolverState(); + var quotationContinuerState = new QuoteContinuerState(); + var centralEuropeanQuotationMarkCategorizer = new QuotationMarkCategorizer( + centralEuropeanResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var britishEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("british_english") + ); + Assert.IsNotNull(britishEnglishQuoteConvention); + var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([britishEnglishQuoteConvention]) + ); + var britishEnglishQuotationMarkCategorizer = new QuotationMarkCategorizer( + britishEnglishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var standardSwedishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_swedish") + ); + Assert.IsNotNull(standardSwedishQuoteConvention); + var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardSwedishQuoteConvention]) + ); + var standardSwedishQuotationMarkCategorizer = new QuotationMarkCategorizer( + standardSwedishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var threeConventionsResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet( + [centralEuropeanQuoteConvention, britishEnglishQuoteConvention, standardSwedishQuoteConvention] + ) + ); + var threeConventionsQuotationMarkCategorizer = new QuotationMarkCategorizer( + threeConventionsResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + // It should only accept valid opening marks under the quote convention + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new 
QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201e").Build(), 1, 2) + ) + ); + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201a").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u00ab").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \"").Build(), 1, 2) + ) + ); + + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201e").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201a").Build(), 1, 2) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c").Build(), 1, 2) + ) + ); + Assert.IsTrue( + 
britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u00ab").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \"").Build(), 1, 2) + ) + ); + + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201e").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201a").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018").Build(), 1, 2) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d").Build(), 1, 2) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019").Build(), 1, 2) + ) + ); + 
Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u00ab").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \"").Build(), 1, 2) + ) + ); + + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201e").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201a").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u00ab").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \"").Build(), 1, 2) + ) + ); + + // Leading whitespace is not necessary for unambiguous opening quotes + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new 
QuotationMarkStringMatch(new TextSegment.Builder().SetText("text\u201e").Build(), 4, 5) + ) + ); + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("text\u201a").Build(), 4, 5) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("text\u201c").Build(), 4, 5) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("text\u2018").Build(), 4, 5) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("text\u201e").Build(), 4, 5) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("text\u201a").Build(), 4, 5) + ) + ); + + // An ambiguous quotation mark (opening/closing) is recognized as opening if it has a quote introducer beforehand + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(",\u201d").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(":\u2019").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new 
TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(",\u201c").Build(), 1, 2) + ) + ); + + // An ambiguous quotation mark (opening/closing) is recognized as opening if preceded by another opening mark + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); //TODO 0,1 not 1,2? + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u2019").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201c").Build(), 1, 2) + ) + ); + + // An ambiguous quotation mark (opening/closing) is not recognized as opening if it has trailing whitespace or punctuation + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d.").Build(), 1, 2) + ) + ); + Assert.IsFalse( + 
standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(",\u201d ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u2019 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u2019?").Build(), 1, 2) + ) + ); + } + + [Test] + public void IsClosingQuote() + { + var centralEuropeanQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("central_european") + ); + Assert.IsNotNull(centralEuropeanQuoteConvention); + var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([centralEuropeanQuoteConvention]) + ); + var quotationMarkResolverState = new QuotationMarkResolverState(); + var quotationContinuerState = new QuoteContinuerState(); + var centralEuropeanQuotationMarkCategorizer = new QuotationMarkCategorizer( + centralEuropeanResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var britishEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("british_english") + ); + Assert.IsNotNull(britishEnglishQuoteConvention); + var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([britishEnglishQuoteConvention]) + ); + var britishEnglishQuotationMarkCategorizer = new QuotationMarkCategorizer( + britishEnglishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var standardSwedishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_swedish") + ); + Assert.IsNotNull(standardSwedishQuoteConvention); + var standardSwedishResolverSettings = new 
QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardSwedishQuoteConvention]) + ); + var standardSwedishQuotationMarkCategorizer = new QuotationMarkCategorizer( + standardSwedishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var standardFrenchQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_french") + ); + Assert.IsNotNull(standardFrenchQuoteConvention); + var standardFrenchResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardFrenchQuoteConvention]) + ); + var standardFrenchQuotationMarkCategorizer = new QuotationMarkCategorizer( + standardFrenchResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var threeConventionsResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet( + [centralEuropeanQuoteConvention, britishEnglishQuoteConvention, standardSwedishQuoteConvention] + ) + ); + var threeConventionsQuotationMarkCategorizer = new QuotationMarkCategorizer( + threeConventionsResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + // It should only accept valid closing marks under the quote convention + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018 ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201e ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201a ").Build(), 0, 1) + ) + ); 
+ Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019 ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\" ").Build(), 0, 1) + ) + ); + + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018 ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019 ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\" ").Build(), 0, 1) + ) + ); + + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c ").Build(), 0, 
1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018 ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019 ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\" ").Build(), 0, 1) + ) + ); + + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018 ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019 ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new 
TextSegment.Builder().SetText("\" ").Build(), 0, 1) + ) + ); + + // Trailing whitespace is not necessary for unambiguous closing quotes + Assert.IsTrue( + standardFrenchQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bbtext").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardFrenchQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u203atext").Build(), 0, 1) + ) + ); + + // An ambiguous quotation mark (opening/closing) is recognized as closing if + // followed by whitespace, punctuation or the end of the segment + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201dtext").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019text").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019?").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201ctext").Build(), 0, 1) + ) + ); + Assert.IsTrue( + 
threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c?").Build(), 0, 1) + ) + ); + + // An ambiguous quotation mark (opening/closing) is not recognized as closing if + // it has leading whitespace + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\t\u201c?").Build(), 1, 2) + ) + ); + } + + [Test] + public void IsMalformedOpeningQuote() + { + var centralEuropeanQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("central_european") + ); + Assert.IsNotNull(centralEuropeanQuoteConvention); + var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([centralEuropeanQuoteConvention]) + ); + var quotationMarkResolverState = new QuotationMarkResolverState(); + var quotationContinuerState = new QuoteContinuerState(); + var centralEuropeanQuotationMarkCategorizer = new QuotationMarkCategorizer( + centralEuropeanResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var britishEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("british_english") + ); + Assert.IsNotNull(britishEnglishQuoteConvention); + var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([britishEnglishQuoteConvention]) + ); + var britishEnglishQuotationMarkCategorizer = new QuotationMarkCategorizer( + britishEnglishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var standardSwedishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_swedish") + ); + 
Assert.IsNotNull(standardSwedishQuoteConvention); + var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardSwedishQuoteConvention]) + ); + var standardSwedishQuotationMarkCategorizer = new QuotationMarkCategorizer( + standardSwedishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var threeConventionsResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet( + [centralEuropeanQuoteConvention, britishEnglishQuoteConvention, standardSwedishQuoteConvention] + ) + ); + var threeConventionsQuotationMarkCategorizer = new QuotationMarkCategorizer( + threeConventionsResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + // It should only accept valid opening marks under the quote convention + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201e ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201a ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019 ").Build(), 
1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u00ab ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \" ").Build(), 1, 2) + ) + ); + + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201e ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201a ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u00ab ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \" ").Build(), 1, 2) + ) + ); + + Assert.IsFalse( + 
standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201e ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201a ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018 ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u00ab ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \" ").Build(), 1, 2) + ) + ); + + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201e ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201a ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + 
threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018 ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u00ab ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \" ").Build(), 1, 2) + ) + ); + + // Should return true if there is a leading quote introducer + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(",\u201d ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019 ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(":\u2019 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + 
threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(",\u201c ").Build(), 1, 2) + ) + ); + + // Should return false unless the mark has leading and trailing whitespace + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d").Build(), 1, 2) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d ").Build(), 1, 2) + ) + ); + + // Should return false if there is already an open quotation mark on the stack + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" 
\u201d ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c ").Build(), 1, 2) + ) + ); + } + + [Test] + public void IsMalformedClosingQuote() + { + var centralEuropeanQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("central_european") + ); + Assert.IsNotNull(centralEuropeanQuoteConvention); + var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([centralEuropeanQuoteConvention]) + ); + var quotationMarkResolverState = new QuotationMarkResolverState(); + var quotationContinuerState = new QuoteContinuerState(); + var centralEuropeanQuotationMarkCategorizer = new QuotationMarkCategorizer( + centralEuropeanResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var britishEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("british_english") + ); + Assert.IsNotNull(britishEnglishQuoteConvention); + var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([britishEnglishQuoteConvention]) + ); + var britishEnglishQuotationMarkCategorizer = new QuotationMarkCategorizer( + britishEnglishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var standardSwedishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_swedish") + ); + Assert.IsNotNull(standardSwedishQuoteConvention); + var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardSwedishQuoteConvention]) + ); + var 
standardSwedishQuotationMarkCategorizer = new QuotationMarkCategorizer( + standardSwedishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var threeConventionsResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet( + [centralEuropeanQuoteConvention, britishEnglishQuoteConvention, standardSwedishQuoteConvention] + ) + ); + var threeConventionsQuotationMarkCategorizer = new QuotationMarkCategorizer( + threeConventionsResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + // It should only accept valid closing marks under the quote convention + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201e").Build(), 0, 1) + ); + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201e").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201a").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + 
centralEuropeanQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ) + ); + + quotationMarkResolverState.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ) + ); + + quotationMarkResolverState.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ); + 
quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ) + ); + + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new 
QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ) + ); + + // Returns true if it's at the end of the segment + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + + // Returns true if it does not have trailing whitespace + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d-").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201dtext").Build(), 0, 1) + ) + ); + + // Returns true if it has trailing and leading whitespace + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d ").Build(), 1, 2) + ) + ); + + // Requires there to be an open quotation mark on the stack + quotationMarkResolverState.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d 
").Build(), 0, 1) + ) + ); + + // Requires the quotation mark on the stack to be a valid pair with the + // observed quotation mark + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + } + + [Test] + public void IsUnpairedClosingQuote() + { + var centralEuropeanQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("central_european") + ); + Assert.IsNotNull(centralEuropeanQuoteConvention); + var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([centralEuropeanQuoteConvention]) + ); + var quotationMarkResolverState = new QuotationMarkResolverState(); + var quotationContinuerState = new QuoteContinuerState(); + var centralEuropeanQuotationMarkCategorizer = new QuotationMarkCategorizer( + 
centralEuropeanResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var britishEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("british_english") + ); + Assert.IsNotNull(britishEnglishQuoteConvention); + var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([britishEnglishQuoteConvention]) + ); + var britishEnglishQuotationMarkCategorizer = new QuotationMarkCategorizer( + britishEnglishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var standardSwedishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_swedish") + ); + Assert.IsNotNull(standardSwedishQuoteConvention); + var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardSwedishQuoteConvention]) + ); + var standardSwedishQuotationMarkCategorizer = new QuotationMarkCategorizer( + standardSwedishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var threeConventionsResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet( + [centralEuropeanQuoteConvention, britishEnglishQuoteConvention, standardSwedishQuoteConvention] + ) + ); + var threeConventionsQuotationMarkCategorizer = new QuotationMarkCategorizer( + threeConventionsResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + // It should only accept valid closing marks under the quote convention + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsFalse( + 
centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201e").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201a").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ) + ); + + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new 
QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ) + ); + + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ) + ); + + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + 
Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ) + ); + + // There must not be an opening quotation mark on the stack + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + 
threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + + // There must not be leading whitespace + quotationMarkResolverState.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\t\u2019").Build(), 1, 2) + ) + ); + + // The quotation mark must be either at the end of the segment + // or have trailing whitespace + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d?").Build(), 0, 1) + ) + ); + } + 
+ [Test] + public void IsApostrophe() + { + var standardEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var quotationMarkResolverState = new QuotationMarkResolverState(); + var quotationContinuerState = new QuoteContinuerState(); + var standardEnglishQuotationMarkCategorizer = new QuotationMarkCategorizer( + standardEnglishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var typewriterEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("typewriter_english") + ); + Assert.IsNotNull(typewriterEnglishQuoteConvention); + var typewriterEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + var typewriterEnglishQuotationMarkCategorizer = new QuotationMarkCategorizer( + typewriterEnglishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + // The quotation mark must make for a plausible apostrophe + Assert.IsTrue( + typewriterEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a'b").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + typewriterEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u2019b").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + typewriterEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u2018b").Build(), 1, 2), + null + ) + ); + Assert.IsFalse( + typewriterEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u201cb").Build(), 1, 2), + null + 
) + ); + Assert.IsFalse( + typewriterEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\"b").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a'b").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u2019b").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u2018b").Build(), 1, 2), + null + ) + ); + Assert.IsFalse( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u201cb").Build(), 1, 2), + null + ) + ); + Assert.IsFalse( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\"b").Build(), 1, 2), + null + ) + ); + + // Returns true if the mark has Latin letters on both sides + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u2019Ƅ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("ǡ\u2019b").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("ᴀ\u2019B").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new 
TextSegment.Builder().SetText("𝼀\u2019Ꝙ").Build(), 1, 2), + null + ) + ); + // ℵ (U+2135 ALEF SYMBOL) is not a Latin letter + + Assert.IsFalse( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u2019ℵ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + typewriterEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u2019ℵ").Build(), 1, 2), + null + ) + ); + + // Recognizes s possessives (e.g. Moses') + quotationMarkResolverState.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("s\u2019 ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("Moses\u2019 ").Build(), 5, 6), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("s\u2019?").Build(), 1, 2), + null + ) + ); + Assert.IsFalse( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("s\u20195").Build(), 1, 2), + null + ) + ); + + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("s\u2019 ").Build(), 1, 2), + null + ) + ); + + quotationMarkResolverState.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ); + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new
TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("s\u2019 ").Build(), 1, 2), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word\u2019").Build(), 4, 5) + ) + ); + Assert.IsFalse( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("s\u2019 ").Build(), 1, 2), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word\u201d").Build(), 4, 5) + ) + ); + + // the straight quote should always be an apostrophe if it's not a valid quotation mark + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("5'ℵ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" ' ").Build(), 1, 2), + null + ) + ); + + // the straight quote should be an apostrophe if there's nothing on the quotation mark stack + quotationMarkResolverState.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("5'ℵ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" ' ").Build(), 1, 2), + null + ) + ); + + // any matching mark should be an apostrophe if it doesn't pair with the + // deepest opening quotation mark on the stack + // (opening/closing quotation marks will have been detected before calling this) + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); 
+ Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("5'ℵ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" ' ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("5\u2018ℵ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018 ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("5\u2019ℵ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019 ").Build(), 1, 2), + null + ) + ); + } + + [Test] + public void DepthBasedQuotationMarkResolverReset() + { + var standardEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201cThis is a quote").Build(), 0, 1)] + ) + .ToList(); + + Assert.That( + standardEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.UnpairedQuotationMark]) + ); + + standardEnglishQuotationMarkResolver.Reset(); + 
Assert.That(standardEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("This is a quote\u2019").Build(), + 15, + 16 + ) + ] + ) + .ToList(); + + Assert.That( + standardEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.UnpairedQuotationMark]) + ); + } + + [Test] + public void BasicQuotationMarkRecognition() + { + var standardEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + var textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote\u2019\u201d").Build(); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 11, 12), + new QuotationMarkStringMatch(textSegment, 17, 18), + new QuotationMarkStringMatch(textSegment, 18, 19), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, textSegment, 11, 12), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, textSegment, 17, 18), + new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegment, 18, 19), + ] + ) + ); + Assert.That(standardEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void ResolutionOnlyOfPassedMatches() + { + var standardEnglishQuoteConvention = ( + 
StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + var textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote\u2019\u201d").Build(); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks([new QuotationMarkStringMatch(textSegment, 0, 1),]) + .SequenceEqual( + [new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment, 0, 1),] + ) + ); + Assert.That( + standardEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.UnpairedQuotationMark]) + ); + + textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote\u2019\u201d").Build(); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks([new QuotationMarkStringMatch(textSegment, 17, 18),]) + .Count(), + Is.EqualTo(0) + ); + Assert.That( + standardEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.UnpairedQuotationMark]) + ); + } + + [Test] + public void ResolutionAcrossSegments() + { + var standardEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + var textSegment1 = new TextSegment.Builder().SetText("\u201cThis is a ").Build(); + var textSegment2 = new 
TextSegment.Builder().SetText("\u2018quote\u2019\u201d").Build(); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment1, 0, 1), + new QuotationMarkStringMatch(textSegment2, 0, 1), + new QuotationMarkStringMatch(textSegment2, 6, 7), + new QuotationMarkStringMatch(textSegment2, 7, 8), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment1, 0, 1), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, textSegment2, 0, 1), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, textSegment2, 6, 7), + new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegment2, 7, 8), + ] + ) + ); + Assert.That(standardEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void ResolutionWithApostrophes() + { + var standardEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + var textSegment = ( + new TextSegment.Builder() + .SetText("\u201cThis\u2019 is a \u2018quote\u2019\u201d") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 5, 6), + new QuotationMarkStringMatch(textSegment, 12, 13), + new QuotationMarkStringMatch(textSegment, 18, 19), + new QuotationMarkStringMatch(textSegment, 19, 20), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201c", 1, 
QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, textSegment, 12, 13), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, textSegment, 18, 19), + new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegment, 19, 20), + ] + ) + ); + Assert.That(standardEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + + var typewriterEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("typewriter_english") + ); + Assert.IsNotNull(typewriterEnglishQuoteConvention); + var typewriterEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + var typewriterEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver( + typewriterEnglishResolverSettings + ); + + textSegment = new TextSegment.Builder() + .SetText("\"This' is a 'quote'\"") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(); + Assert.That( + typewriterEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 5, 6), + new QuotationMarkStringMatch(textSegment, 12, 13), + new QuotationMarkStringMatch(textSegment, 18, 19), + new QuotationMarkStringMatch(textSegment, 19, 20), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\"", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("'", 2, QuotationMarkDirection.Opening, textSegment, 12, 13), + new QuotationMarkMetadata("'", 2, QuotationMarkDirection.Closing, textSegment, 18, 19), + new QuotationMarkMetadata("\"", 1, QuotationMarkDirection.Closing, textSegment, 19, 20), + ] + ) + ); + Assert.That(typewriterEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void EnglishQuoteContinuers() + { + var standardEnglishQuoteConvention = ( +
StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + var textSegment1 = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote").Build(); + var textSegment2 = ( + new TextSegment.Builder() + .SetText("\u201c\u2018This is the rest\u2019 of it\u201d") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment1, 0, 1), + new QuotationMarkStringMatch(textSegment1, 11, 12), + new QuotationMarkStringMatch(textSegment2, 0, 1), + new QuotationMarkStringMatch(textSegment2, 1, 2), + new QuotationMarkStringMatch(textSegment2, 18, 19), + new QuotationMarkStringMatch(textSegment2, 25, 26), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment1, 0, 1), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, textSegment1, 11, 12), + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment2, 0, 1), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, textSegment2, 1, 2), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, textSegment2, 18, 19), + new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegment2, 25, 26), + ] + ) + ); + Assert.That(standardEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void SpanishQuoteContinuers() + { + var westernEuropeanQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("western_european") + ); + 
Assert.IsNotNull(westernEuropeanQuoteConvention); + var westernEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([westernEuropeanQuoteConvention]) + ); + var westernEuropeanQuotationMarkResolver = new DepthBasedQuotationMarkResolver(westernEuropeanResolverSettings); + + var textSegment1 = new TextSegment.Builder().SetText("\u00abThis is a \u201cquote").Build(); + var textSegment2 = ( + new TextSegment.Builder() + .SetText("\u00bb\u201dThis is the rest\u201d of it\u00bb") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ); + Assert.That( + westernEuropeanQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment1, 0, 1), + new QuotationMarkStringMatch(textSegment1, 11, 12), + new QuotationMarkStringMatch(textSegment2, 0, 1), + new QuotationMarkStringMatch(textSegment2, 1, 2), + new QuotationMarkStringMatch(textSegment2, 18, 19), + new QuotationMarkStringMatch(textSegment2, 25, 26), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, textSegment1, 0, 1), + new QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.Opening, textSegment1, 11, 12), + new QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Opening, textSegment2, 0, 1), + new QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.Opening, textSegment2, 1, 2), + new QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.Closing, textSegment2, 18, 19), + new QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Closing, textSegment2, 25, 26), + ] + ) + ); + Assert.That(westernEuropeanQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void MalformedQuotationMarks() + { + var standardEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new 
QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + var textSegment1 = new TextSegment.Builder().SetText("\u201c This is a,\u2018 quote").Build(); + var textSegment2 = ( + new TextSegment.Builder() + .SetText("This is the rest \u2019 of it \u201d") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment1, 0, 1), + new QuotationMarkStringMatch(textSegment1, 12, 13), + new QuotationMarkStringMatch(textSegment2, 17, 18), + new QuotationMarkStringMatch(textSegment2, 25, 26), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment1, 0, 1), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, textSegment1, 12, 13), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, textSegment2, 17, 18), + new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegment2, 25, 26), + ] + ) + ); + Assert.That(standardEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void UnpairedQuotationMarkIssue() + { + var standardEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + var textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote\u2019").Build(); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ 
+ new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 11, 12), + new QuotationMarkStringMatch(textSegment, 17, 18), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, textSegment, 11, 12), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, textSegment, 17, 18), + ] + ) + ); + Assert.That( + standardEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.UnpairedQuotationMark]) + ); + + textSegment = new TextSegment.Builder().SetText("another quote\u201d").Build(); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks([new QuotationMarkStringMatch(textSegment, 13, 14),]) + .SequenceEqual( + [new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegment, 13, 14),] + ) + ); + Assert.That( + standardEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.UnpairedQuotationMark]) + ); + } + + [Test] + public void TooDeepNestingIssue() + { + var standardEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + var textSegment = new TextSegment.Builder() + .SetText("\u201cThis \u2018is \u201ca \u2018quote \u201cnested too deeply") + .Build(); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 6, 7), + new QuotationMarkStringMatch(textSegment, 10, 11), 
+ new QuotationMarkStringMatch(textSegment, 13, 14), + new QuotationMarkStringMatch(textSegment, 20, 21), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, textSegment, 6, 7), + new QuotationMarkMetadata("\u201c", 3, QuotationMarkDirection.Opening, textSegment, 10, 11), + new QuotationMarkMetadata("\u2018", 4, QuotationMarkDirection.Opening, textSegment, 13, 14), + // The fifth opening mark (index 20) is deliberately absent: this test expects it to be omitted and TooDeepNesting to be reported. + ] + ) + ); + Assert.That( + standardEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual( + [QuotationMarkResolutionIssue.TooDeepNesting, QuotationMarkResolutionIssue.UnpairedQuotationMark,] + ) + ); + } + + [Test] + public void IncompatibleQuotationMarkIssue() + { + var standardEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + var textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u201cquote\u201d\u201d").Build(); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 11, 12), + new QuotationMarkStringMatch(textSegment, 17, 18), + new QuotationMarkStringMatch(textSegment, 18, 19), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.Opening, textSegment, 11, 12), + new
QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.Closing, textSegment, 17, 18), + new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegment, 18, 19), + ] + ) + ); + Assert.That( + standardEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.IncompatibleQuotationMark]) + ); + } + + [Test] + public void AmbiguousQuotationMarkIssue() + { + var typewriterEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("typewriter_english") + ); + Assert.IsNotNull(typewriterEnglishQuoteConvention); + var typewriterEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + var typewriterEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver( + typewriterEnglishResolverSettings + ); + + var textSegment = new TextSegment.Builder().SetText("This\"is an ambiguous quotation mark").Build(); + Assert.That( + typewriterEnglishQuotationMarkResolver + .ResolveQuotationMarks([new QuotationMarkStringMatch(textSegment, 4, 5),]) + .Count(), + Is.EqualTo(0) + ); + Assert.That( + typewriterEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.AmbiguousQuotationMark]) + ); + + typewriterEnglishQuotationMarkResolver.Reset(); + textSegment = new TextSegment.Builder().SetText("\u201cThis is an ambiguous quotation mark").Build(); + Assert.That( + typewriterEnglishQuotationMarkResolver + .ResolveQuotationMarks([new QuotationMarkStringMatch(textSegment, 0, 1)]) + .Count(), + Is.EqualTo(0) + ); + Assert.That( + typewriterEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.AmbiguousQuotationMark]) + ); + } + + [Test] + public void TypewriterEnglishQuotationMarkRecognition() + { + var typewriterEnglishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("typewriter_english") + ); + 
Assert.IsNotNull(typewriterEnglishQuoteConvention); + var typewriterEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + var typewriterEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver( + typewriterEnglishResolverSettings + ); + + var textSegment = ( + new TextSegment.Builder() + .SetText("\"This is a 'quote'\"") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ); + Assert.That( + typewriterEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 11, 12), + new QuotationMarkStringMatch(textSegment, 17, 18), + new QuotationMarkStringMatch(textSegment, 18, 19), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\"", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("'", 2, QuotationMarkDirection.Opening, textSegment, 11, 12), + new QuotationMarkMetadata("'", 2, QuotationMarkDirection.Closing, textSegment, 17, 18), + new QuotationMarkMetadata("\"", 1, QuotationMarkDirection.Closing, textSegment, 18, 19), + ] + ) + ); + Assert.That(typewriterEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void TypewriterFrenchMarkRecognition() + { + var typewriterFrenchQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("typewriter_french") + ); + Assert.IsNotNull(typewriterFrenchQuoteConvention); + var typewriterFrenchResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([typewriterFrenchQuoteConvention]) + ); + var typewriterFrenchQuotationMarkResolver = new DepthBasedQuotationMarkResolver( + typewriterFrenchResolverSettings + ); + + var textSegment = new TextSegment.Builder().SetText("<<This is a <quote>>>").Build(); + Assert.That( + typewriterFrenchQuotationMarkResolver + .ResolveQuotationMarks( + [ + new
QuotationMarkStringMatch(textSegment, 0, 2), + new QuotationMarkStringMatch(textSegment, 12, 13), + new QuotationMarkStringMatch(textSegment, 18, 19), + new QuotationMarkStringMatch(textSegment, 19, 21), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("<<", 1, QuotationMarkDirection.Opening, textSegment, 0, 2), + new QuotationMarkMetadata("<", 2, QuotationMarkDirection.Opening, textSegment, 12, 13), + new QuotationMarkMetadata(">", 2, QuotationMarkDirection.Closing, textSegment, 18, 19), + new QuotationMarkMetadata(">>", 1, QuotationMarkDirection.Closing, textSegment, 19, 21), + ] + ) + ); + Assert.That(typewriterFrenchQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void CentralEuropeanQuotationMarkRecognition() + { + var centralEuropeanQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("central_european") + ); + Assert.IsNotNull(centralEuropeanQuoteConvention); + var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([centralEuropeanQuoteConvention]) + ); + var centralEuropeanQuotationMarkResolver = new DepthBasedQuotationMarkResolver(centralEuropeanResolverSettings); + + var textSegment = ( + new TextSegment.Builder() + .SetText("\u201eThis is a \u201aquote\u2018\u201c") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ); + Assert.That( + centralEuropeanQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 11, 12), + new QuotationMarkStringMatch(textSegment, 17, 18), + new QuotationMarkStringMatch(textSegment, 18, 19), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201e", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("\u201a", 2, QuotationMarkDirection.Opening, textSegment, 11, 12), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Closing, textSegment, 17, 
18), + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Closing, textSegment, 18, 19), + ] + ) + ); + Assert.That(centralEuropeanQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void StandardSwedishQuotationMarkRecognition() + { + var standardSwedishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_swedish") + ); + Assert.IsNotNull(standardSwedishQuoteConvention); + var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardSwedishQuoteConvention]) + ); + var standardSwedishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardSwedishResolverSettings); + + var textSegment = ( + new TextSegment.Builder() + .SetText("\u201dThis is a \u2019quote\u2019\u201d") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ); + Assert.That( + standardSwedishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 11, 12), + new QuotationMarkStringMatch(textSegment, 17, 18), + new QuotationMarkStringMatch(textSegment, 18, 19), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Opening, textSegment, 11, 12), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, textSegment, 17, 18), + new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegment, 18, 19), + ] + ) + ); + Assert.That(standardSwedishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void MultipleConventionsQuotationMarkRecognition() + { + var typewriterFrenchQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + "typewriter_french" + ); + + Assert.IsNotNull(typewriterFrenchQuoteConvention); + + var 
centralEuropeanQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("central_european") + ); + Assert.IsNotNull(centralEuropeanQuoteConvention); + + var standardSwedishQuoteConvention = ( + StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_swedish") + ); + Assert.IsNotNull(standardSwedishQuoteConvention); + var multipleConventionsResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet( + [typewriterFrenchQuoteConvention, centralEuropeanQuoteConvention, standardSwedishQuoteConvention] + ) + ); + var multipleConventionsQuotationMarkResolver = new DepthBasedQuotationMarkResolver( + multipleConventionsResolverSettings + ); + + var textSegment = ( + new TextSegment.Builder() + .SetText("\u201eThis is a \u2019quote>\u201c") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ); + Assert.That( + multipleConventionsQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 11, 12), + new QuotationMarkStringMatch(textSegment, 17, 18), + new QuotationMarkStringMatch(textSegment, 18, 19), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201e", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Opening, textSegment, 11, 12), + new QuotationMarkMetadata(">", 2, QuotationMarkDirection.Closing, textSegment, 17, 18), + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Closing, textSegment, 18, 19), + ] + ) + ); + Assert.That(multipleConventionsQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + private class TestQuoteContinuerState : QuoteContinuerState + { + public QuoteContinuerStyle InternalContinuerStyle + { + get => ContinuerStyle; + set => ContinuerStyle = value; + } + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/temp.cs 
b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/temp.cs new file mode 100644 index 00000000..b84ef0f2 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/temp.cs @@ -0,0 +1,6 @@ +using NUnit.Framework; + +namespace SIL.Machine.Corpora.PunctuationAnalysis; + +[TestFixture] +public class DepthBasedQuotationMarkResolverTestsTemp { } From 1d49973cdf8fbba99ec65645ae3f026c97794621 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 28 Jul 2025 12:46:29 -0400 Subject: [PATCH 07/28] Passing tests + complete porting --- .../DepthBasedQuotationMarkResolver.cs | 108 +- .../PreliminaryQuotationMarkAnalyzer.cs | 14 +- .../QuotationMarkFinder.cs | 40 +- .../QuotationMarkStringMatch.cs | 62 +- .../QuotationMarkTabulator.cs | 13 +- .../PunctuationAnalysis/QuoteConvention.cs | 36 +- .../PunctuationAnalysis/QuoteConventionSet.cs | 8 +- .../PunctuationAnalysis/TextSegment.cs | 122 +- .../UsfmStructureExtractor.cs | 12 +- .../Corpora/PunctuationAnalysis/Verse.cs | 4 +- ...kDenormalizationUsfmUpdateBlockHandler.cs} | 6 +- .../Corpora/QuotationMarkUpdateFirstPass.cs | 20 +- ...onventionChangingUsfmUpdateBlockHandler.cs | 28 +- src/SIL.Machine/Corpora/UsfmParserState.cs | 2 +- .../PunctuationAnalysis/ChapterTests.cs | 33 + .../PreliminaryQuotationMarkAnalyzerTests.cs | 1174 ++++++++++ .../QuotationMarkFinderTests.cs | 427 ++++ .../QuotationMarkMetadataTests.cs | 81 + .../QuotationMarkResolverTests.cs | 52 + .../QuotationMarkStringMatchTests.cs | 746 +++++++ .../QuotationMarkTabulatorTests.cs | 215 ++ .../QuoteConventionDetectorTests.cs | 372 ++++ .../QuoteConventionSetTests.cs | 1923 +++++++++++++++++ .../QuoteConventionTests.cs | 442 ++++ .../PunctuationAnalysis/TextSegmentTests.cs | 305 +++ .../UsfmStructureExtractorTests.cs | 496 +++++ .../Corpora/PunctuationAnalysis/VerseTests.cs | 57 + .../Corpora/PunctuationAnalysis/temp.cs | 6 - .../Corpora/QuotationDenormalizationTests.cs | 60 + ...ormalizationUsfmBlockUpdateHandlerTests.cs | 495 +++++ 
.../QuotationMarkUpdateFirstPassTests.cs | 730 +++++++ ...tionChangingUsfmBlockUpdateHandlerTests.cs | 839 +++++++ 32 files changed, 8711 insertions(+), 217 deletions(-) rename src/SIL.Machine/Corpora/{QuotationDenormalizationUsfmUpdateBlockHandler.cs => QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs} (66%) create mode 100644 tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/ChapterTests.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkFinderTests.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkMetadataTests.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkResolverTests.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkStringMatchTests.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkTabulatorTests.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionDetectorTests.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionSetTests.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionTests.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/TextSegmentTests.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/UsfmStructureExtractorTests.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/VerseTests.cs delete mode 100644 tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/temp.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs create mode 100644 
tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs index 0ea0e807..8401c927 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs @@ -78,9 +78,9 @@ public enum QuoteContinuerStyle public class QuoteContinuerState { - private Stack _quoteContinuerMarks; + public Stack QuoteContinuerMarks { get; private set; } public QuoteContinuerStyle ContinuerStyle { get; protected set; } - public int CurrentDepth => _quoteContinuerMarks.Count; + public int CurrentDepth => QuoteContinuerMarks.Count; public QuoteContinuerState() { @@ -89,13 +89,13 @@ public QuoteContinuerState() public void Reset() { - _quoteContinuerMarks = new Stack(); + QuoteContinuerMarks = new Stack(); ContinuerStyle = QuoteContinuerStyle.Undetermined; } public bool ContinuerHasBeenObserved() { - return _quoteContinuerMarks.Count > 0; + return QuoteContinuerMarks.Count > 0; } public QuotationMarkMetadata AddQuoteContinuer( @@ -105,14 +105,14 @@ QuoteContinuerStyle quoteContinuerStyle ) { QuotationMarkMetadata quote = quotationMarkMatch.Resolve( - _quoteContinuerMarks.Count + 1, + QuoteContinuerMarks.Count + 1, QuotationMarkDirection.Opening ); - _quoteContinuerMarks.Push(quote); + QuoteContinuerMarks.Push(quote); ContinuerStyle = quoteContinuerStyle; - if (_quoteContinuerMarks.Count == quotationMarkResolverState.Quotations.Count) + if (QuoteContinuerMarks.Count == quotationMarkResolverState.Quotations.Count) { - _quoteContinuerMarks.Clear(); + QuoteContinuerMarks.Clear(); } return quote; } @@ -381,33 +381,33 @@ public bool IsApostrophe(QuotationMarkStringMatch quotationMarkMatch, QuotationM public class DepthBasedQuotationMarkResolver : IQuotationMarkResolver { - private readonly 
IQuotationMarkResolutionSettings _settings; - private readonly QuotationMarkResolverState _quotationMarkResolverState; - private readonly QuoteContinuerState _quoteContinuerState; - private readonly QuotationMarkCategorizer _quotationMarkCategorizer; - private readonly HashSet _issues; + public readonly IQuotationMarkResolutionSettings Settings; + public readonly QuotationMarkResolverState QuotationMarkResolverState; + public readonly QuoteContinuerState QuoteContinuerState; + public readonly QuotationMarkCategorizer QuotationMarkCategorizer; + protected readonly HashSet Issues; public DepthBasedQuotationMarkResolver(IQuotationMarkResolutionSettings settings) { - _settings = settings; - _quotationMarkResolverState = new QuotationMarkResolverState(); - _quoteContinuerState = new QuoteContinuerState(); - _quotationMarkCategorizer = new QuotationMarkCategorizer( - _settings, - _quotationMarkResolverState, - _quoteContinuerState + Settings = settings; + QuotationMarkResolverState = new QuotationMarkResolverState(); + QuoteContinuerState = new QuoteContinuerState(); + QuotationMarkCategorizer = new QuotationMarkCategorizer( + Settings, + QuotationMarkResolverState, + QuoteContinuerState ); - _issues = new HashSet(); + Issues = new HashSet(); } - public void Reset() + public virtual void Reset() { - _quotationMarkResolverState.Reset(); - _quoteContinuerState.Reset(); - _issues.Clear(); + QuotationMarkResolverState.Reset(); + QuoteContinuerState.Reset(); + Issues.Clear(); } - public IEnumerable ResolveQuotationMarks( + public virtual IEnumerable ResolveQuotationMarks( List quotationMarkMatches ) { @@ -423,8 +423,8 @@ List quotationMarkMatches foreach (QuotationMarkMetadata q in ResolveQuotationMark(quotationMarkMatch, previousMark, nextMark)) yield return q; } - if (_quotationMarkResolverState.HasOpenQuotationMark) - _issues.Add(QuotationMarkResolutionIssue.UnpairedQuotationMark); + if (QuotationMarkResolverState.HasOpenQuotationMark) + 
Issues.Add(QuotationMarkResolutionIssue.UnpairedQuotationMark); } public IEnumerable ResolveQuotationMark( @@ -433,9 +433,9 @@ public IEnumerable ResolveQuotationMark( QuotationMarkStringMatch nextMatch ) { - if (_quotationMarkCategorizer.IsOpeningQuotationMark(quotationMarkMatch)) + if (QuotationMarkCategorizer.IsOpeningQuotationMark(quotationMarkMatch)) { - if (_quotationMarkCategorizer.IsEnglishQuoteContinuer(quotationMarkMatch, previousMatch, nextMatch)) + if (QuotationMarkCategorizer.IsEnglishQuoteContinuer(quotationMarkMatch, previousMatch, nextMatch)) { yield return ProcessQuoteContinuer(quotationMarkMatch, QuoteContinuerStyle.English); } @@ -443,23 +443,23 @@ QuotationMarkStringMatch nextMatch { if (IsDepthTooGreat()) { - _issues.Add(QuotationMarkResolutionIssue.TooDeepNesting); + Issues.Add(QuotationMarkResolutionIssue.TooDeepNesting); yield break; } yield return ProcessOpeningMark(quotationMarkMatch); } } - else if (_quotationMarkCategorizer.IsApostrophe(quotationMarkMatch, nextMatch)) { } - else if (_quotationMarkCategorizer.IsClosingQuotationMark(quotationMarkMatch)) + else if (QuotationMarkCategorizer.IsApostrophe(quotationMarkMatch, nextMatch)) { } + else if (QuotationMarkCategorizer.IsClosingQuotationMark(quotationMarkMatch)) { - if (_quotationMarkCategorizer.IsSpanishQuoteContinuer(quotationMarkMatch, previousMatch, nextMatch)) + if (QuotationMarkCategorizer.IsSpanishQuoteContinuer(quotationMarkMatch, previousMatch, nextMatch)) { yield return ProcessQuoteContinuer(quotationMarkMatch, QuoteContinuerStyle.Spanish); } - else if (!_quotationMarkResolverState.HasOpenQuotationMark) + else if (!QuotationMarkResolverState.HasOpenQuotationMark) { - _issues.Add(QuotationMarkResolutionIssue.UnpairedQuotationMark); + Issues.Add(QuotationMarkResolutionIssue.UnpairedQuotationMark); yield break; } else @@ -467,21 +467,21 @@ QuotationMarkStringMatch nextMatch yield return ProcessClosingMark(quotationMarkMatch); } } - else if 
(_quotationMarkCategorizer.IsMalformedClosingQuotationMark(quotationMarkMatch)) + else if (QuotationMarkCategorizer.IsMalformedClosingQuotationMark(quotationMarkMatch)) { yield return ProcessClosingMark(quotationMarkMatch); } - else if (_quotationMarkCategorizer.IsMalformedOpeningQuotationMark(quotationMarkMatch)) + else if (QuotationMarkCategorizer.IsMalformedOpeningQuotationMark(quotationMarkMatch)) { yield return ProcessOpeningMark(quotationMarkMatch); } - else if (_quotationMarkCategorizer.IsUnpairedClosingQuotationMark(quotationMarkMatch)) + else if (QuotationMarkCategorizer.IsUnpairedClosingQuotationMark(quotationMarkMatch)) { - _issues.Add(QuotationMarkResolutionIssue.UnpairedQuotationMark); + Issues.Add(QuotationMarkResolutionIssue.UnpairedQuotationMark); } else { - _issues.Add(QuotationMarkResolutionIssue.AmbiguousQuotationMark); + Issues.Add(QuotationMarkResolutionIssue.AmbiguousQuotationMark); } } @@ -490,51 +490,51 @@ private QuotationMarkMetadata ProcessQuoteContinuer( QuoteContinuerStyle continuerStyle ) { - return _quoteContinuerState.AddQuoteContinuer( + return QuoteContinuerState.AddQuoteContinuer( quotationMarkMatch, - _quotationMarkResolverState, + QuotationMarkResolverState, continuerStyle ); } private bool IsDepthTooGreat() { - return _quotationMarkResolverState.AreMoreThanNQuotesOpen(3); + return QuotationMarkResolverState.AreMoreThanNQuotesOpen(3); } private QuotationMarkMetadata ProcessOpeningMark(QuotationMarkStringMatch quotationMarkMatch) { if ( - !_settings.MetadataMatchesQuotationMark( + !Settings.MetadataMatchesQuotationMark( quotationMarkMatch.QuotationMark, - _quotationMarkResolverState.CurrentDepth + 1, + QuotationMarkResolverState.CurrentDepth + 1, QuotationMarkDirection.Opening ) ) { - _issues.Add(QuotationMarkResolutionIssue.IncompatibleQuotationMark); + Issues.Add(QuotationMarkResolutionIssue.IncompatibleQuotationMark); } - return _quotationMarkResolverState.AddOpeningQuotationMark(quotationMarkMatch); + return 
QuotationMarkResolverState.AddOpeningQuotationMark(quotationMarkMatch); } private QuotationMarkMetadata ProcessClosingMark(QuotationMarkStringMatch quotationMarkMatch) { if ( - !_settings.MetadataMatchesQuotationMark( + !Settings.MetadataMatchesQuotationMark( quotationMarkMatch.QuotationMark, - _quotationMarkResolverState.CurrentDepth, + QuotationMarkResolverState.CurrentDepth, QuotationMarkDirection.Closing ) ) { - _issues.Add(QuotationMarkResolutionIssue.IncompatibleQuotationMark); + Issues.Add(QuotationMarkResolutionIssue.IncompatibleQuotationMark); } - return _quotationMarkResolverState.AddClosingQuotationMark(quotationMarkMatch); + return QuotationMarkResolverState.AddClosingQuotationMark(quotationMarkMatch); } - public HashSet GetIssues() + public virtual HashSet GetIssues() { - return _issues; + return Issues; } } } diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs index 17125f1c..bf3ea5fe 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs @@ -36,7 +36,7 @@ public bool IsApostropheProportionGreaterThan(double threshold) { if (_numCharacters == 0) return false; - return (_numApostrophes / _numCharacters) > threshold; + return ((double)_numApostrophes / _numCharacters) > threshold; } } @@ -106,14 +106,14 @@ public bool IsMarkRarelyInitial(string quotationMark) { int numInitialMarks = GetWordInitialOccurrences(quotationMark); int numTotalMarks = GetTotalOccurrences(quotationMark); - return numTotalMarks > 0 && (numInitialMarks / numTotalMarks) < MaximumProportionForRarity; + return numTotalMarks > 0 && ((double)numInitialMarks / numTotalMarks) < MaximumProportionForRarity; } public bool IsMarkRarelyFinal(string quotationMark) { int numFinalMarks = GetWordFinalOccurrences(quotationMark); int numTotalMarks = 
GetTotalOccurrences(quotationMark); - return numTotalMarks > 0 && (numFinalMarks / numTotalMarks) < MaximumProportionForRarity; + return numTotalMarks > 0 && ((double)numFinalMarks / numTotalMarks) < MaximumProportionForRarity; } public bool AreInitialAndFinalRatesSimilar(string quotationMark) @@ -122,14 +122,16 @@ public bool AreInitialAndFinalRatesSimilar(string quotationMark) int numFinalMarks = GetWordFinalOccurrences(quotationMark); int numTotalMarks = GetTotalOccurrences(quotationMark); return numTotalMarks > 0 - && (Math.Abs(numInitialMarks - numFinalMarks) / numTotalMarks) < MaximumProportionDifferenceThreshold; + && ((double)Math.Abs(numInitialMarks - numFinalMarks) / numTotalMarks) + < MaximumProportionDifferenceThreshold; } public bool IsMarkCommonlyMidWord(string quotationMark) { int numMidWordMarks = GetMidWordOccurrences(quotationMark); int numTotalMarks = GetTotalOccurrences(quotationMark); - return numTotalMarks > 0 && (numMidWordMarks / numTotalMarks) > MaximumProportionDifferenceThreshold; + return numTotalMarks > 0 + && ((double)numMidWordMarks / numTotalMarks) > MaximumProportionDifferenceThreshold; } } @@ -194,7 +196,7 @@ public bool AreEarlyAndLateMarkRatesSimilar(string quotationMark) int numEarlyOccurrences = GetEarlierOccurrences(quotationMark); int numLateOccurrences = GetLaterOccurrences(quotationMark); return numEarlyOccurrences > 0 - && (Math.Abs(numLateOccurrences - numEarlyOccurrences) / numEarlyOccurrences) + && ((double)Math.Abs(numLateOccurrences - numEarlyOccurrences) / numEarlyOccurrences) < MaximumProportionDifferenceThreshold; } } diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkFinder.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkFinder.cs index 6dc80ef0..e4ddd74b 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkFinder.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkFinder.cs @@ -1,4 +1,5 @@ using System.Collections.Generic; +using 
System.Globalization; using System.Linq; using System.Text.RegularExpressions; @@ -6,15 +7,12 @@ namespace SIL.Machine.Corpora.PunctuationAnalysis { public class QuotationMarkFinder { - private static readonly Regex QuotationMarkPattern = new Regex( - @"(\p{Pi}|\p{Pf}|<<|>>|<|>)", - RegexOptions.Compiled - ); + private static readonly Regex TypewriterGuillemetsPattern = new Regex(@"(<<|>>|<|>)", RegexOptions.Compiled); private readonly QuoteConventionSet _quoteConventions; - public QuotationMarkFinder(QuoteConventionSet quoteConventionSet) + public QuotationMarkFinder(QuoteConventionSet quoteConventions) { - _quoteConventions = quoteConventionSet; + _quoteConventions = quoteConventions; } public List FindAllPotentialQuotationMarksInChapter(Chapter chapter) @@ -30,7 +28,7 @@ public List FindAllPotentialQuotationMarksInVerse(Vers return FindAllPotentialQuotationMarksInTextSegments(verse.TextSegments); } - public List FindAllPotentialQuotationMarksInTextSegments( + public virtual List FindAllPotentialQuotationMarksInTextSegments( List textSegments ) { @@ -39,7 +37,28 @@ List textSegments public List FindAllPotentialQuotationMarksInTextSegment(TextSegment textSegment) { - return QuotationMarkPattern + TextElementEnumerator charactersEnumerator = StringInfo.GetTextElementEnumerator(textSegment.Text); + int index = 0; + List quotationMarkStringMatches = new List(); + while (charactersEnumerator.MoveNext()) + { + string currentCharacterString = charactersEnumerator.Current.ToString(); + if ( + ( + QuotationMarkStringMatch.HasUnicodeProperty(currentCharacterString, "QUOTATION MARK") + || QuotationMarkStringMatch.HasUnicodeProperty(currentCharacterString, "APOSTROPHE") + ) + && ( + _quoteConventions.IsValidOpeningQuotationMark(charactersEnumerator.Current.ToString()) + || _quoteConventions.IsValidClosingQuotationMark(charactersEnumerator.Current.ToString()) + ) + ) + { + quotationMarkStringMatches.Add(new QuotationMarkStringMatch(textSegment, index, index + 1)); + } + 
index++; + } + List typewriterGuillemetMatches = TypewriterGuillemetsPattern .Matches(textSegment.Text) .Cast() .Where(match => @@ -52,6 +71,11 @@ public List FindAllPotentialQuotationMarksInTextSegmen m.Groups[0].Index + m.Groups[0].Length )) .ToList(); + + return quotationMarkStringMatches + .Concat(typewriterGuillemetMatches) + .OrderBy(match => match.StartIndex) + .ToList(); } } } diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkStringMatch.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkStringMatch.cs index 90fe296e..117c1f01 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkStringMatch.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkStringMatch.cs @@ -7,9 +7,7 @@ namespace SIL.Machine.Corpora.PunctuationAnalysis { public class QuotationMarkStringMatch { - private static readonly Regex LetterPattern = new Regex(@"[\p{L}\uD838[\uDE00-\uDE8F]]", RegexOptions.Compiled); - - // No LatinLetterPattern because C# does not support it. Using UnicodeInfo to mirror machine.py + // No LatinLetterPattern or LetterPattern because C# does not support it in the same way as Python. 
Using UnicodeInfo to mirror machine.py private static readonly Regex WhitespacePattern = new Regex(@"[\s~]", RegexOptions.Compiled); private static readonly Regex PunctuationPattern = new Regex(@"[\.,;\?!\)\]\-—۔،؛]", RegexOptions.Compiled); private static readonly Regex QuoteIntroducerPattern = new Regex(@"[:,]\s*$", RegexOptions.Compiled); @@ -25,6 +23,24 @@ public QuotationMarkStringMatch(TextSegment textSegment, int startIndex, int end EndIndex = endIndex; } + public override bool Equals(object obj) + { + if (!(obj is QuotationMarkStringMatch other)) + return false; + return TextSegment.Equals(other.TextSegment) + && StartIndex == other.StartIndex + && EndIndex == other.EndIndex; + } + + public override int GetHashCode() + { + int code = 23; + code = code * 31 + TextSegment.GetHashCode(); + code = code * 31 + StartIndex.GetHashCode(); + code = code * 31 + EndIndex.GetHashCode(); + return code; + } + public string QuotationMark => new StringInfo(TextSegment.Text).SubstringByTextElements(StartIndex, EndIndex - StartIndex); @@ -135,12 +151,31 @@ public bool HasTrailingPunctuation() public bool HasLetterInLeadingSubstring() { - return LeadingSubstringMatches(LetterPattern); + string leadingSubstring = TextSegment.SubstringBefore(StartIndex); + if (leadingSubstring.Length == 0) + return false; + + TextElementEnumerator charactersEnumerator = StringInfo.GetTextElementEnumerator(leadingSubstring); + while (charactersEnumerator.MoveNext()) + { + if (!IsLetter(charactersEnumerator.Current.ToString())) + return false; + } + return true; } public bool HasLetterInTrailingSubstring() { - return TrailingSubstringMatches(LetterPattern); + string trailingSubstring = TextSegment.SubstringAfter(EndIndex); + if (trailingSubstring.Length == 0) + return false; + TextElementEnumerator charactersEnumerator = StringInfo.GetTextElementEnumerator(trailingSubstring); + while (charactersEnumerator.MoveNext()) + { + if (!IsLetter(charactersEnumerator.Current.ToString())) + return false; + 
} + return true; } public bool HasLeadingLatinLetter() @@ -158,21 +193,30 @@ public bool HasQuoteIntroducerInLeadingSubstring() return LeadingSubstringMatches(QuoteIntroducerPattern); } - private bool IsLatinScript(string characterString) + public static bool HasUnicodeProperty(string characterString, string attribute) { - string latinScriptAttribute = "LATIN"; if (characterString.Length == 1) { - return UnicodeInfo.GetName(characterString[0]).Contains(latinScriptAttribute); + return UnicodeInfo.GetName(characterString[0]).Contains(attribute); } else if (char.IsSurrogatePair(characterString[0], characterString[1])) { //Get true unicode value int combinedCharacterValue = (((int)characterString[0] - 0xD800) * 0x400) + ((int)characterString[1] - 0xDC00) + 0x10000; - return UnicodeInfo.GetName(combinedCharacterValue).Contains(latinScriptAttribute); + return UnicodeInfo.GetName(combinedCharacterValue).Contains(attribute); } return false; } + + private bool IsLatinScript(string characterString) + { + return HasUnicodeProperty(characterString, "LATIN"); + } + + private bool IsLetter(string characterString) + { + return HasUnicodeProperty(characterString, "LETTER"); + } } } diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs index 68e63912..dc059bb3 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs @@ -76,25 +76,24 @@ private void CountQuotationMark(QuotationMarkMetadata quote) public double CalculateSimilarity(QuoteConvention quoteConvention) { - double numDifferences = 0.0; - double numTotalQuotationMarks = 0.0; + double weightedDifference = 0.0; + double totalWeight = 0.0; foreach ((int depth, QuotationMarkDirection direction) in _quotationCountsByDepthAndDirection.Keys) { string expectedQuotationMark = quoteConvention.GetExpectedQuotationMark(depth, direction); 
// give higher weight to shallower depths, since deeper marks are more likely to be mistakes - numDifferences += ( + weightedDifference += ( _quotationCountsByDepthAndDirection[(depth, direction)] .CalculateNumDifferences(expectedQuotationMark) * Math.Pow(2, -depth) ); - numTotalQuotationMarks += - _quotationCountsByDepthAndDirection[(depth, direction)].TotalCount * Math.Pow(2, -depth); + totalWeight += _quotationCountsByDepthAndDirection[(depth, direction)].TotalCount * Math.Pow(2, -depth); } - if (numTotalQuotationMarks == 0.0) + if (totalWeight == 0.0) { return 0.0; } - return 1 - (numDifferences / numTotalQuotationMarks); + return 1 - (weightedDifference / totalWeight); } } } diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConvention.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConvention.cs index a5d6c640..e31520e3 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConvention.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConvention.cs @@ -7,7 +7,7 @@ public class SingleLevelQuoteConvention { public static readonly IReadOnlyDictionary QuoteNormalizationMap = new Dictionary() { - { "\u00ab", '\'' }, + { "\u00ab", '\"' }, { "\u00bb", '"' }, { "\u2018", '\'' }, { "\u2019", '\'' }, @@ -48,24 +48,24 @@ public class QuoteConvention { public string Name { get; } - public IReadOnlyList Levels { get; } + public IReadOnlyList LevelConventions { get; } public QuoteConvention(string name, List levels) { Name = name; - Levels = levels; + LevelConventions = levels; } - public int NumLevels => Levels.Count; + public int NumLevels => LevelConventions.Count; - public string GetOpeningQuotationMarkAtLevel(int level) + public string GetOpeningQuotationMarkAtDepth(int depth) { - return Levels[level - 1].OpeningQuotationMark; + return LevelConventions[depth - 1].OpeningQuotationMark; } - public string GetClosingQuotationMarkAtLevel(int level) + public string GetClosingQuotationMarkAtDepth(int depth) { - return Levels[level - 
1].ClosingQuotationMark; + return LevelConventions[depth - 1].ClosingQuotationMark; } public string GetExpectedQuotationMark(int depth, QuotationMarkDirection direction) @@ -73,13 +73,13 @@ public string GetExpectedQuotationMark(int depth, QuotationMarkDirection directi if (depth > NumLevels || depth < 1) return ""; return direction == QuotationMarkDirection.Opening - ? GetOpeningQuotationMarkAtLevel(depth) - : GetClosingQuotationMarkAtLevel(depth); + ? GetOpeningQuotationMarkAtDepth(depth) + : GetClosingQuotationMarkAtDepth(depth); } - private bool IncludesOpeningQuotationMark(string openingQuotationMark) + public bool IncludesOpeningQuotationMark(string openingQuotationMark) { - foreach (SingleLevelQuoteConvention level in Levels) + foreach (SingleLevelQuoteConvention level in LevelConventions) { if (level.OpeningQuotationMark == openingQuotationMark) return true; @@ -87,9 +87,9 @@ private bool IncludesOpeningQuotationMark(string openingQuotationMark) return false; } - private bool IncludesClosingQuotationMark(string closingQuotationMark) + public bool IncludesClosingQuotationMark(string closingQuotationMark) { - foreach (SingleLevelQuoteConvention level in Levels) + foreach (SingleLevelQuoteConvention level in LevelConventions) { if (level.ClosingQuotationMark == closingQuotationMark) return true; @@ -100,7 +100,7 @@ private bool IncludesClosingQuotationMark(string closingQuotationMark) public HashSet GetPossibleDepths(string quotationMark, QuotationMarkDirection direction) { var depths = new HashSet(); - foreach ((int depth, SingleLevelQuoteConvention level) in Levels.Select((l, i) => (i + 1, l))) + foreach ((int depth, SingleLevelQuoteConvention level) in LevelConventions.Select((l, i) => (i + 1, l))) { if (direction == QuotationMarkDirection.Opening && level.OpeningQuotationMark == quotationMark) depths.Add(depth); @@ -127,16 +127,16 @@ List closingQuotationMarks } // we require the first-level quotes to have been observed - if 
(!openingQuotationMarks.Contains(GetOpeningQuotationMarkAtLevel(1))) + if (!openingQuotationMarks.Contains(GetOpeningQuotationMarkAtDepth(1))) return false; - if (!closingQuotationMarks.Contains(GetClosingQuotationMarkAtLevel(1))) + if (!closingQuotationMarks.Contains(GetClosingQuotationMarkAtDepth(1))) return false; return true; } public QuoteConvention Normalize() { - return new QuoteConvention(Name + "_normalized", Levels.Select(l => l.Normalize()).ToList()); + return new QuoteConvention(Name + "_normalized", LevelConventions.Select(l => l.Normalize()).ToList()); } } } diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionSet.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionSet.cs index 922ec873..ebec0b34 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionSet.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionSet.cs @@ -50,8 +50,8 @@ private void CreateQuotationMarkRegexes() { for (int level = 1; level < convention.NumLevels + 1; level++) { - string openingQuote = convention.GetOpeningQuotationMarkAtLevel(level); - string closingQuote = convention.GetClosingQuotationMarkAtLevel(level); + string openingQuote = convention.GetOpeningQuotationMarkAtDepth(level); + string closingQuote = convention.GetClosingQuotationMarkAtDepth(level); openingQuotationMarks.Add(openingQuote); closingQuotationMarks.Add(closingQuote); } @@ -84,8 +84,8 @@ private void CreateQuotationMarkPairMap() { for (int level = 1; level < convention.NumLevels + 1; level++) { - string openingQuote = convention.GetOpeningQuotationMarkAtLevel(level); - string closingQuote = convention.GetClosingQuotationMarkAtLevel(level); + string openingQuote = convention.GetOpeningQuotationMarkAtDepth(level); + string closingQuote = convention.GetClosingQuotationMarkAtDepth(level); closingMarksByOpeningMark.UpdateValue( openingQuote, () => new HashSet(), diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/TextSegment.cs 
b/src/SIL.Machine/Corpora/PunctuationAnalysis/TextSegment.cs index 10a0ad46..be35bca6 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/TextSegment.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/TextSegment.cs @@ -4,25 +4,37 @@ namespace SIL.Machine.Corpora.PunctuationAnalysis { public class TextSegment { - private string _text; - private UsfmMarkerType _immediatePrecedingMarker; - private readonly HashSet _markersInPrecedingContext; - private TextSegment _previousSegment; - private TextSegment _nextSegment; - private int _indexInVerse; - private int _numSegmentsInVerse; - private UsfmToken _usfmToken; + public string Text { get; private set; } + public UsfmMarkerType ImmediatePrecedingMarker { get; private set; } + public HashSet MarkersInPrecedingContext { get; private set; } + public TextSegment PreviousSegment { get; set; } + public TextSegment NextSegment { get; set; } + public int IndexInVerse { get; set; } + public int NumSegmentsInVerse { get; set; } + public UsfmToken UsfmToken { get; private set; } public TextSegment() { - _text = ""; - _immediatePrecedingMarker = UsfmMarkerType.NoMarker; - _markersInPrecedingContext = new HashSet(); - _previousSegment = null; - _nextSegment = null; - _indexInVerse = 0; - _numSegmentsInVerse = 0; - _usfmToken = null; + Text = ""; + ImmediatePrecedingMarker = UsfmMarkerType.NoMarker; + MarkersInPrecedingContext = new HashSet(); + PreviousSegment = null; + NextSegment = null; + IndexInVerse = 0; + NumSegmentsInVerse = 0; + UsfmToken = null; + } + + public TextSegment(string text) + { + Text = text; + ImmediatePrecedingMarker = UsfmMarkerType.NoMarker; + MarkersInPrecedingContext = new HashSet(); + PreviousSegment = null; + NextSegment = null; + IndexInVerse = 0; + NumSegmentsInVerse = 0; + UsfmToken = null; } public override bool Equals(object obj) @@ -31,90 +43,62 @@ public override bool Equals(object obj) { return false; } - return _text.Equals(t._text) - && _indexInVerse.Equals(t._indexInVerse) - && 
_numSegmentsInVerse.Equals(t._numSegmentsInVerse) + return Text.Equals(t.Text) + && IndexInVerse.Equals(t.IndexInVerse) + && NumSegmentsInVerse.Equals(t.NumSegmentsInVerse) && ( - (_usfmToken == null && t._usfmToken == null) - || (_usfmToken != null && t._usfmToken != null && _usfmToken.Equals(t._usfmToken)) + (UsfmToken == null && t.UsfmToken == null) + || (UsfmToken != null && t.UsfmToken != null && UsfmToken.Equals(t.UsfmToken)) ) - && _immediatePrecedingMarker.Equals(t._immediatePrecedingMarker); + && ImmediatePrecedingMarker.Equals(t.ImmediatePrecedingMarker); } public override int GetHashCode() { int hashCode = 23; - hashCode = hashCode * 31 + _text.GetHashCode(); - hashCode = hashCode * 31 + _indexInVerse.GetHashCode(); - hashCode = hashCode * 31 + _numSegmentsInVerse.GetHashCode(); - hashCode = hashCode * 31 + _usfmToken.GetHashCode(); - return hashCode * 31 + _immediatePrecedingMarker.GetHashCode(); + hashCode = hashCode * 31 + Text.GetHashCode(); + hashCode = hashCode * 31 + IndexInVerse.GetHashCode(); + hashCode = hashCode * 31 + NumSegmentsInVerse.GetHashCode(); + hashCode = hashCode * 31 + UsfmToken.GetHashCode(); + return hashCode * 31 + ImmediatePrecedingMarker.GetHashCode(); } - public string Text => _text; - - public TextSegment PreviousSegment => _previousSegment; - - public TextSegment NextSegment => _nextSegment; - - public int IndexInVerse => _indexInVerse; - - public int Length => _text.Length; + public int Length => Text.Length; public string SubstringBefore(int index) { - return _text.Substring(0, index); + return Text.Substring(0, index); } public string SubstringAfter(int index) { - return _text.Substring(index); + return Text.Substring(index); } public bool MarkerIsInPrecedingContext(UsfmMarkerType marker) { - return _markersInPrecedingContext.Contains(marker); + return MarkersInPrecedingContext.Contains(marker); } public bool IsFirstSegmentInVerse() { - return _indexInVerse == 0; + return IndexInVerse == 0; } public bool 
IsLastSegmentInVerse() { - return _indexInVerse == _numSegmentsInVerse - 1; + return IndexInVerse == NumSegmentsInVerse - 1; } public void ReplaceSubstring(int startIndex, int endIndex, string replacement) { - _text = SubstringBefore(startIndex) + replacement + SubstringAfter(endIndex); - if (_usfmToken != null) + Text = SubstringBefore(startIndex) + replacement + SubstringAfter(endIndex); + if (UsfmToken != null) { - _usfmToken.Text = _text; + UsfmToken.Text = Text; } } - public void SetPreviousSegment(TextSegment previousSegment) - { - _previousSegment = previousSegment; - } - - public void SetNextSegment(TextSegment nextSegment) - { - _nextSegment = nextSegment; - } - - public void SetIndexInVerse(int indexInVerse) - { - _indexInVerse = indexInVerse; - } - - public void SetNumSegmentsInVerse(int numSegmentsInVerse) - { - _numSegmentsInVerse = numSegmentsInVerse; - } - public class Builder { private readonly TextSegment _textSegment; @@ -126,26 +110,26 @@ public Builder() public Builder SetPreviousSegment(TextSegment previousSegment) { - _textSegment._previousSegment = previousSegment; + _textSegment.PreviousSegment = previousSegment; return this; } public Builder AddPrecedingMarker(UsfmMarkerType marker) { - _textSegment._immediatePrecedingMarker = marker; - _textSegment._markersInPrecedingContext.Add(marker); + _textSegment.ImmediatePrecedingMarker = marker; + _textSegment.MarkersInPrecedingContext.Add(marker); return this; } public Builder SetUsfmToken(UsfmToken token) { - _textSegment._usfmToken = token; + _textSegment.UsfmToken = token; return this; } public Builder SetText(string text) { - _textSegment._text = text; + _textSegment.Text = text; return this; } diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmStructureExtractor.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmStructureExtractor.cs index 2a76664f..5e743dc7 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmStructureExtractor.cs +++ 
b/src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmStructureExtractor.cs @@ -15,7 +15,7 @@ public UsfmStructureExtractor() public void Chapter(UsfmParserState state, string number, string marker, string altNumber, string pubNumber) { - _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Character); + _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Chapter); } public void EndBook(UsfmParserState state, string marker) { } @@ -111,8 +111,8 @@ public void Text(UsfmParserState state, string text) // and offline whole-book-at-once settings (QuoteConventionDetector) if (_textSegments.Count > 0 && !textSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse)) { - _textSegments[_textSegments.Count - 1].SetNextSegment(textSegment); - textSegment.SetPreviousSegment(_textSegments[_textSegments.Count - 1]); + _textSegments[_textSegments.Count - 1].NextSegment = textSegment; + textSegment.PreviousSegment = _textSegments[_textSegments.Count - 1]; } _textSegments.Add(textSegment); } @@ -139,7 +139,7 @@ public List GetChapters() { currentChapterVerses.Add(new Verse(currentVerseSegments)); } - currentVerseSegments.Clear(); + currentVerseSegments = new List(); } if (textSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Chapter)) { @@ -147,9 +147,9 @@ public List GetChapters() { chapters.Add(new Chapter(currentChapterVerses)); } - currentChapterVerses.Clear(); + currentChapterVerses = new List(); } - currentChapterVerses.Clear(); + currentVerseSegments.Add(textSegment); } if (currentVerseSegments.Count > 0) { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/Verse.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/Verse.cs index 86730b6c..40f63776 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/Verse.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/Verse.cs @@ -17,8 +17,8 @@ private void IndexTextSegments() { foreach ((int index, TextSegment textSegment) in TextSegments.Select((t, i) => (i, t))) { - textSegment.SetIndexInVerse(index); - 
textSegment.SetNumSegmentsInVerse(TextSegments.Count); + textSegment.IndexInVerse = index; + textSegment.NumSegmentsInVerse = TextSegments.Count; } } } diff --git a/src/SIL.Machine/Corpora/QuotationDenormalizationUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs similarity index 66% rename from src/SIL.Machine/Corpora/QuotationDenormalizationUsfmUpdateBlockHandler.cs rename to src/SIL.Machine/Corpora/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs index 31b4b95d..52bd8abb 100644 --- a/src/SIL.Machine/Corpora/QuotationDenormalizationUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/Corpora/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs @@ -2,10 +2,10 @@ namespace SIL.Machine.Corpora { - public class QuotationDenormalizationUsfmUpdateBlockHandler : QuoteConventionChangingUsfmUpdateBlockHandler + public class QuotationMarkDenormalizationUsfmUpdateBlockHandler : QuoteConventionChangingUsfmUpdateBlockHandler { // This is a convenience class so that users don't have to know to normalize the source quote convention - public QuotationDenormalizationUsfmUpdateBlockHandler( + public QuotationMarkDenormalizationUsfmUpdateBlockHandler( QuoteConvention sourceQuoteConvention, QuoteConvention targetQuoteConvention, QuotationMarkUpdateSettings settings = null @@ -13,7 +13,7 @@ public QuotationDenormalizationUsfmUpdateBlockHandler( : base( sourceQuoteConvention.Normalize(), targetQuoteConvention, - settings == null ? new QuotationMarkUpdateSettings() : null + settings ?? 
new QuotationMarkUpdateSettings() ) { } } } diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs index 96482599..b7084a6b 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs @@ -11,7 +11,7 @@ public class QuotationMarkUpdateFirstPass : UsfmStructureExtractor private readonly QuoteConvention _targetQuoteConvention; private readonly QuotationMarkFinder _quotationMarkFinder; private readonly DepthBasedQuotationMarkResolver _quotationMarkResolver; - private readonly bool _willFallbackModeWork; + public bool WillFallbackModeWork; public QuotationMarkUpdateFirstPass( QuoteConvention sourceQuoteConvention, @@ -26,26 +26,26 @@ QuoteConvention targetQuoteConvention _quotationMarkResolver = new DepthBasedQuotationMarkResolver( new QuotationMarkUpdateResolutionSettings(sourceQuoteConvention, targetQuoteConvention) ); - _willFallbackModeWork = CheckWhetherFallbackModeWillWork(sourceQuoteConvention, targetQuoteConvention); + WillFallbackModeWork = CheckWhetherFallbackModeWillWork(sourceQuoteConvention, targetQuoteConvention); } - private bool CheckWhetherFallbackModeWillWork( + public bool CheckWhetherFallbackModeWillWork( QuoteConvention sourceQuoteConvention, QuoteConvention targetQuoteConvention ) { var targetMarksBySourceMarks = new Dictionary>(); - foreach (int level in Enumerable.Range(1, sourceQuoteConvention.NumLevels)) + foreach (int depth in Enumerable.Range(1, sourceQuoteConvention.NumLevels)) { - string openingQuotationMark = sourceQuoteConvention.GetOpeningQuotationMarkAtLevel(level); + string openingQuotationMark = sourceQuoteConvention.GetOpeningQuotationMarkAtDepth(depth); if (!targetMarksBySourceMarks.TryGetValue(openingQuotationMark, out HashSet marks)) { marks = new HashSet(); targetMarksBySourceMarks[openingQuotationMark] = marks; } - if (level <= targetQuoteConvention.NumLevels) + if (depth <= 
targetQuoteConvention.NumLevels) { - marks.Add(targetQuoteConvention.GetClosingQuotationMarkAtLevel(level)); + marks.Add(targetQuoteConvention.GetClosingQuotationMarkAtDepth(depth)); } } @@ -62,7 +62,7 @@ public List FindBestChapterStrategies() return bestActionsByChapter; } - private QuotationMarkUpdateStrategy FindBestStrategyForChapter(Chapter chapter) + public QuotationMarkUpdateStrategy FindBestStrategyForChapter(Chapter chapter) { List quotationMarkMatches = _quotationMarkFinder.FindAllPotentialQuotationMarksInChapter(chapter); @@ -75,7 +75,7 @@ private QuotationMarkUpdateStrategy FindBestStrategyForChapter(Chapter chapter) return ChooseBestStrategyBasedOnObservedIssues(_quotationMarkResolver.GetIssues()); } - private QuotationMarkUpdateStrategy ChooseBestStrategyBasedOnObservedIssues( + public QuotationMarkUpdateStrategy ChooseBestStrategyBasedOnObservedIssues( HashSet issues ) { @@ -87,7 +87,7 @@ HashSet issues || issues.Contains(QuotationMarkResolutionIssue.TooDeepNesting) ) { - if (_willFallbackModeWork) + if (WillFallbackModeWork) return QuotationMarkUpdateStrategy.ApplyFallback; return QuotationMarkUpdateStrategy.Skip; } diff --git a/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs index ef4599b1..83f89169 100644 --- a/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs @@ -8,13 +8,13 @@ public class QuoteConventionChangingUsfmUpdateBlockHandler : IUsfmUpdateBlockHan private readonly QuoteConvention _sourceQuoteConvention; private readonly QuoteConvention _targetQuoteConvention; private readonly QuotationMarkUpdateSettings _settings; - private readonly QuotationMarkFinder _quotationMarkFinder; - private TextSegment.Builder _nextScriptureTextSegmentBuilder; - private readonly IQuotationMarkResolver _verseTextQuotationMarkResolver; + protected 
QuotationMarkFinder _quotationMarkFinder; + protected TextSegment.Builder _nextScriptureTextSegmentBuilder; + protected IQuotationMarkResolver _verseTextQuotationMarkResolver; private readonly IQuotationMarkResolver _embedQuotationMarkResolver; private readonly IQuotationMarkResolver _simpleQuotationMarkResolver; - private QuotationMarkUpdateStrategy _currentStrategy; - private int _currentChapterNumber; + protected QuotationMarkUpdateStrategy _currentStrategy; + protected int _currentChapterNumber; private int _currentVerseNumber; public QuoteConventionChangingUsfmUpdateBlockHandler( @@ -87,7 +87,7 @@ private UsfmUpdateBlock ApplyStandardUpdating(UsfmUpdateBlock block) return block; } - private void ProcessScriptureElement( + protected void ProcessScriptureElement( UsfmUpdateBlockElement element, IQuotationMarkResolver quotationMarkResolver ) @@ -105,7 +105,7 @@ QuotationMarkMetadata resolvedQuotationMark in quotationMarkResolver.ResolveQuot } } - private List CreateTextSegments(UsfmUpdateBlockElement element) + protected List CreateTextSegments(UsfmUpdateBlockElement element) { var textSegments = new List(); foreach (UsfmToken token in element.GetTokens()) @@ -134,7 +134,7 @@ private List CreateTextSegments(UsfmUpdateBlockElement element) return SetPreviousAndNextForSegments(textSegments); } - private TextSegment CreateTextSegment(UsfmToken token) + protected TextSegment CreateTextSegment(UsfmToken token) { TextSegment textSegmentToReturn = null; _nextScriptureTextSegmentBuilder.SetUsfmToken(token); @@ -147,19 +147,19 @@ private TextSegment CreateTextSegment(UsfmToken token) return textSegmentToReturn; } - private List SetPreviousAndNextForSegments(List textSegments) + protected List SetPreviousAndNextForSegments(List textSegments) { for (int i = 0; i < textSegments.Count; i++) { if (i > 0) - textSegments[i].SetPreviousSegment(textSegments[i - 1]); + textSegments[i].PreviousSegment = textSegments[i - 1]; if (i < textSegments.Count - 1) - 
textSegments[i].SetNextSegment(textSegments[i + 1]); + textSegments[i].NextSegment = textSegments[i + 1]; } return textSegments; } - private void CheckForChapterChange(UsfmUpdateBlock block) + protected void CheckForChapterChange(UsfmUpdateBlock block) { foreach (ScriptureRef scriptureRef in block.Refs) { @@ -171,12 +171,12 @@ private void CheckForChapterChange(UsfmUpdateBlock block) } } - private void StartNewChapter(int newChapterNum) + protected void StartNewChapter(int newChapterNum) { _currentStrategy = _settings.GetActionForChapter(newChapterNum); _verseTextQuotationMarkResolver.Reset(); _nextScriptureTextSegmentBuilder = new TextSegment.Builder(); - _nextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Character); + _nextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Chapter); } private void CheckForVerseChange(UsfmUpdateBlock block) diff --git a/src/SIL.Machine/Corpora/UsfmParserState.cs b/src/SIL.Machine/Corpora/UsfmParserState.cs index 1b0952f2..b6784096 100644 --- a/src/SIL.Machine/Corpora/UsfmParserState.cs +++ b/src/SIL.Machine/Corpora/UsfmParserState.cs @@ -54,7 +54,7 @@ public UsfmParserState(UsfmStylesheet stylesheet, ScrVers versification, IReadOn /// /// Current verse reference /// - public VerseRef VerseRef { get; internal set; } + public VerseRef VerseRef { get; protected internal set; } /// /// Offset of start of token in verse diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/ChapterTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/ChapterTests.cs new file mode 100644 index 00000000..c7650538 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/ChapterTests.cs @@ -0,0 +1,33 @@ +using NUnit.Framework; + +namespace SIL.Machine.Corpora.PunctuationAnalysis; + +[TestFixture] +public class ChapterTests +{ + [Test] + public void InitializeVerse() + { + List textSegments1 = + [ + new TextSegment.Builder().SetText("Segment 1").Build(), + new 
TextSegment.Builder().SetText("Segment 2").Build(), + new TextSegment.Builder().SetText("Segment 3").Build(), + ]; + var verse1 = new Verse(textSegments1); + + List textSegments2 = + [ + new TextSegment.Builder().SetText("Segment 4").Build(), + new TextSegment.Builder().SetText("Segment 5").Build(), + new TextSegment.Builder().SetText("Segment 6").Build(), + ]; + var verse2 = new Verse(textSegments2); + + var chapter = new Chapter([verse1, verse2]); + + Assert.That(chapter.Verses, Has.Count.EqualTo(2)); + Assert.That(chapter.Verses[0].TextSegments, Is.EqualTo(textSegments1)); + Assert.That(chapter.Verses[1].TextSegments, Is.EqualTo(textSegments2)); + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs new file mode 100644 index 00000000..d683399b --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs @@ -0,0 +1,1174 @@ +using NUnit.Framework; + +namespace SIL.Machine.Corpora.PunctuationAnalysis; + +[TestFixture] +public class PreliminaryQuotationMarkAnalyzerTests +{ + // ApostropheProportionStatistics tests + [Test] + public void ApostropheProportionStatisticsReset() + { + var apostropheProportionStatistics = new ApostropheProportionStatistics(); + apostropheProportionStatistics.CountCharacters(new TextSegment.Builder().SetText("'").Build()); + apostropheProportionStatistics.AddApostrophe(); + Assert.IsTrue(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.5)); + + apostropheProportionStatistics.Reset(); + Assert.IsFalse(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.5)); + } + + [Test] + public void IsApostropheProportionGreaterThan() + { + var apostropheProportionStatistics = new ApostropheProportionStatistics(); + Assert.IsFalse(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.0)); + + // invalid 
case where no characters have been counted + apostropheProportionStatistics.AddApostrophe(); + Assert.IsFalse(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.0)); + + apostropheProportionStatistics.CountCharacters(new TextSegment.Builder().SetText("a").Build()); + Assert.IsTrue(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.99)); + + apostropheProportionStatistics.AddApostrophe(); + apostropheProportionStatistics.CountCharacters(new TextSegment.Builder().SetText("bcd").Build()); + Assert.IsTrue(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.4)); + Assert.IsFalse(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.5)); + + apostropheProportionStatistics.CountCharacters(new TextSegment.Builder().SetText("ef").Build()); + Assert.IsTrue(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.3)); + Assert.IsFalse(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.4)); + + // QuotationMarkWordPosition tests + } + + [Test] + public void IsMarkRarelyInitial() + { + var quotationMarkWordPositions = new QuotationMarkWordPositions(); + Assert.IsFalse(quotationMarkWordPositions.IsMarkRarelyInitial("\u201d")); + + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkRarelyInitial("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + Assert.IsFalse(quotationMarkWordPositions.IsMarkRarelyInitial("\u201d")); + + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + 
quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkRarelyInitial("\u201d")); + + quotationMarkWordPositions.CountWordFinalApostrophe("\u201c"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkRarelyInitial("\u201d")); + + quotationMarkWordPositions.CountWordFinalApostrophe("\u201c"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201c"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkRarelyInitial("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + Assert.IsFalse(quotationMarkWordPositions.IsMarkRarelyInitial("\u201d")); + } + + [Test] + public void IsMarkRarelyFinal() + { + var quotationMarkWordPositions = new QuotationMarkWordPositions(); + Assert.IsFalse(quotationMarkWordPositions.IsMarkRarelyFinal("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkRarelyFinal("\u201d")); + + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + Assert.IsFalse(quotationMarkWordPositions.IsMarkRarelyFinal("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + 
Assert.IsTrue(quotationMarkWordPositions.IsMarkRarelyFinal("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201c"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkRarelyFinal("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201c"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201c"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkRarelyFinal("\u201d")); + + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + Assert.IsFalse(quotationMarkWordPositions.IsMarkRarelyFinal("\u201d")); + } + + [Test] + public void AreInitialAndFinalRatesSimilar() + { + var quotationMarkWordPositions = new QuotationMarkWordPositions(); + Assert.IsFalse(quotationMarkWordPositions.AreInitialAndFinalRatesSimilar("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + Assert.IsTrue(quotationMarkWordPositions.AreInitialAndFinalRatesSimilar("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + Assert.IsFalse(quotationMarkWordPositions.AreInitialAndFinalRatesSimilar("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + Assert.IsTrue(quotationMarkWordPositions.AreInitialAndFinalRatesSimilar("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + Assert.IsFalse(quotationMarkWordPositions.AreInitialAndFinalRatesSimilar("\u201d")); + + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + 
quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + Assert.IsTrue(quotationMarkWordPositions.AreInitialAndFinalRatesSimilar("\u201d")); + } + + [Test] + public void IsMarkCommonlyMidWord() + { + var quotationMarkWordPositions = new QuotationMarkWordPositions(); + Assert.IsFalse(quotationMarkWordPositions.IsMarkCommonlyMidWord("'")); + + quotationMarkWordPositions.CountMidWordApostrophe("'"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkCommonlyMidWord("'")); + + quotationMarkWordPositions.CountWordInitialApostrophe("'"); + quotationMarkWordPositions.CountWordFinalApostrophe("'"); + quotationMarkWordPositions.CountWordInitialApostrophe("'"); + quotationMarkWordPositions.CountWordFinalApostrophe("'"); + Assert.IsFalse(quotationMarkWordPositions.IsMarkCommonlyMidWord("'")); + + quotationMarkWordPositions.CountMidWordApostrophe("'"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkCommonlyMidWord("'")); + } + + [Test] + public void QuotationMarkWordPositionsReset() + { + var quotationMarkWordPositions = new QuotationMarkWordPositions(); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + + Assert.IsTrue(quotationMarkWordPositions.IsMarkCommonlyMidWord("\u201d")); + + quotationMarkWordPositions.Reset(); + + Assert.IsFalse(quotationMarkWordPositions.IsMarkCommonlyMidWord("\u201d")); + + // QuotationMarkSequence tests + } + + [Test] + public void IsMarkMuchMoreCommonEarlier() + { + var quotationMarkSequences = new QuotationMarkSequences(); + Assert.IsFalse(quotationMarkSequences.IsMarkMuchMoreCommonEarlier("\"")); + + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + 
quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + Assert.IsTrue(quotationMarkSequences.IsMarkMuchMoreCommonEarlier("\"")); + + quotationMarkSequences.CountLaterQuotationMark("\""); + Assert.IsFalse(quotationMarkSequences.IsMarkMuchMoreCommonEarlier("\"")); + + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + Assert.IsTrue(quotationMarkSequences.IsMarkMuchMoreCommonEarlier("\"")); + + quotationMarkSequences.CountLaterQuotationMark("\""); + Assert.IsFalse(quotationMarkSequences.IsMarkMuchMoreCommonEarlier("\"")); + } + + [Test] + public void IsMarkMuchMoreCommonLater() + { + var quotationMarkSequences = new QuotationMarkSequences(); + Assert.IsFalse(quotationMarkSequences.IsMarkMuchMoreCommonLater("\"")); + + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + Assert.IsTrue(quotationMarkSequences.IsMarkMuchMoreCommonLater("\"")); + + quotationMarkSequences.CountEarlierQuotationMark("\""); + Assert.IsFalse(quotationMarkSequences.IsMarkMuchMoreCommonLater("\"")); + + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + Assert.IsTrue(quotationMarkSequences.IsMarkMuchMoreCommonLater("\"")); + + 
quotationMarkSequences.CountEarlierQuotationMark("\""); + Assert.IsFalse(quotationMarkSequences.IsMarkMuchMoreCommonLater("\"")); + } + + [Test] + public void IsMarkCommonEarlyAndLate() + { + var quotationMarkSequences = new QuotationMarkSequences(); + Assert.IsFalse(quotationMarkSequences.AreEarlyAndLateMarkRatesSimilar("\"")); + + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + Assert.IsTrue(quotationMarkSequences.AreEarlyAndLateMarkRatesSimilar("\"")); + + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + Assert.IsTrue(quotationMarkSequences.AreEarlyAndLateMarkRatesSimilar("\"")); + + quotationMarkSequences.CountLaterQuotationMark("\""); + Assert.IsTrue(quotationMarkSequences.AreEarlyAndLateMarkRatesSimilar("\"")); + + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + Assert.IsFalse(quotationMarkSequences.AreEarlyAndLateMarkRatesSimilar("\"")); + + // QuotationMarkGrouper tests + } + + [Test] + public void GetQuotationMarkPairs() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var typewriterEnglishQuoteConvention = new QuoteConvention( + "typewriter_english", + [ 
+ new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + ] + ); + + var quotationMarkGrouper = new QuotationMarkGrouper( + [], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([])); + + // no paired quotation mark + quotationMarkGrouper = new QuotationMarkGrouper( + [new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1)], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([])); + + // basic quotation mark pair + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 1, 2), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([("\u201c", "\u201d")])); + + // out-of-order quotation mark pair + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d\u201c").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d\u201c").Build(), 1, 2), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([])); + + // multiple unpaired quotation marks + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u2019").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u2019").Build(), 1, 2), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + 
Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([])); + + // paired and unpaired quotation marks + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u2018\u201d").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u2018\u201d").Build(), 1, 2), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u2018\u201d").Build(), 2, 3), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([("\u201c", "\u201d")])); + + // ambiguous unpaired quotation mark + quotationMarkGrouper = new QuotationMarkGrouper( + [new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1)], + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([])); + + // paired ambiguous quotation marks + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"\"").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"\"").Build(), 1, 2), + ], + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([("\"", "\"")])); + + // multiple paired quotation marks (should be skipped because we don't know how to pair them) + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d\u201c\u201d").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d\u201c\u201d").Build(), + 1, + 2 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d\u201c\u201d").Build(), + 2, + 3 + ), + new QuotationMarkStringMatch( + new 
TextSegment.Builder().SetText("\u201c\u201d\u201c\u201d").Build(), + 3, + 4 + ), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([])); + + // multiple different paired quotation marks + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d\u2018\u2019").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d\u2018\u2019").Build(), + 1, + 2 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d\u2018\u2019").Build(), + 2, + 3 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d\u2018\u2019").Build(), + 3, + 4 + ), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That( + quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([("\u201c", "\u201d"), ("\u2018", "\u2019")]) + ); + + // second-level paired quotation marks + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018\u2019").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018\u2019").Build(), 1, 2), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([("\u2018", "\u2019")])); + + // quotation marks that don't match the convention set + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 1, 2), + ], + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([])); + } + + [Test] + public void HasDistinctPairedQuotationMarks() + { + var 
standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var typewriterEnglishQuoteConvention = new QuoteConvention( + "typewriter_english", + [ + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + ] + ); + + var quotationMarkGrouper = new QuotationMarkGrouper( + [], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.IsFalse(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u201c")); + Assert.IsFalse(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u201d")); + Assert.IsFalse(quotationMarkGrouper.HasDistinctPairedQuotationMark("")); + + // basic paired quotation marks + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 1, 2), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.IsTrue(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u201c")); + Assert.IsTrue(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u201d")); + + // second-level paired quotation marks + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018\u2019").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018\u2019").Build(), 1, 2), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.IsTrue(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u2018")); + Assert.IsTrue(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u2019")); + + // only one 
half of the pair observed + quotationMarkGrouper = new QuotationMarkGrouper( + [new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1),], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.IsFalse(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u201c")); + Assert.IsTrue(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u201d")); + + // quotation marks that don't match the convention set + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 1, 2), + ], + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + Assert.IsFalse(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u201c")); + Assert.IsFalse(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u201d")); + + // ambiguous quotation marks + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"\"").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"\"").Build(), 1, 2), + ], + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + Assert.IsFalse(quotationMarkGrouper.HasDistinctPairedQuotationMark("\"")); + + // PreliminaryApostropheAnalyzer tests + } + + [Test] + public void ThatTheMarkMustBeAnApostrophe() + { + var preliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + preliminaryApostropheAnalyzer.ProcessQuotationMarks( + [ + new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .Build(), + ], + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new 
QuotationMarkStringMatch( + new TextSegment.Builder().SetText("alternative mid\u2019word apostrophe").Build(), + 15, + 16 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("mid\u2018word quotation mark").Build(), + 3, + 4 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("mid\u201cword quotation mark").Build(), + 3, + 4 + ), + ] + ); + Assert.IsTrue(preliminaryApostropheAnalyzer.IsApostropheOnly("'")); + Assert.IsTrue(preliminaryApostropheAnalyzer.IsApostropheOnly("\u2019")); + Assert.IsFalse(preliminaryApostropheAnalyzer.IsApostropheOnly("\u2018")); + Assert.IsFalse(preliminaryApostropheAnalyzer.IsApostropheOnly("\u201c")); + Assert.IsFalse(preliminaryApostropheAnalyzer.IsApostropheOnly("\u201d")); + } + + [Test] + public void ThatARarelyInitialOrFinalMarkIsAnApostrophe() + { + var negativePreliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + negativePreliminaryApostropheAnalyzer.ProcessQuotationMarks( + [ + new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .Build(), + ], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + ] + ); + Assert.IsFalse(negativePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); + + var positivePreliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + positivePreliminaryApostropheAnalyzer.ProcessQuotationMarks( + [ + new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a 
quotation mark" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "The proportion must be kept below 0.02, because quotation marks should occur relatively infrequently" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "Apostrophes, on the other hand, can be much more common, especially in non-English languages where they " + + "can indicate a glottal stop" + ) + .Build(), + new TextSegment.Builder() + .SetText("Technically Unicode has a separate character for the glottal stop, but it is rarely used") + .Build(), + ], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + ] + ); + Assert.IsTrue(positivePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); + } + + [Test] + public void ThatAMarkWithSimilarFinalAndInitialRatesIsAnApostrophe() + { + var negativePreliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + 
negativePreliminaryApostropheAnalyzer.ProcessQuotationMarks( + [ + new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "We need a ton of text here to keep the proportion low, since we have 8 apostrophes in this test" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "The proportion must be kept below 0.02, because quotation marks should occur relatively infrequently" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "Apostrophes, on the other hand, can be much more common, especially in non-English languages where they " + + "can indicate a glottal stop" + ) + .Build(), + ], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + ] + ); + Assert.IsFalse(negativePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); + + var negativePreliminaryApostropheAnalyzer2 = new PreliminaryApostropheAnalyzer(); + 
negativePreliminaryApostropheAnalyzer2.ProcessQuotationMarks( + [ + new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "We need a ton of text here to keep the proportion low, since we have 8 apostrophes in this test" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "The proportion must be kept below 0.02, because quotation marks should occur relatively infrequently" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "Apostrophes, on the other hand, can be much more common, especially in non-English languages where they " + + "can indicate a glottal stop" + ) + .Build(), + ], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + ] + ); + Assert.IsFalse(negativePreliminaryApostropheAnalyzer2.IsApostropheOnly("'")); + + var positivePreliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + positivePreliminaryApostropheAnalyzer.ProcessQuotationMarks( + [ + new 
TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "We need a ton of text here to keep the proportion low, since we have 8 apostrophes in this test" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "The proportion must be kept below 0.02, because quotation marks should occur relatively infrequently" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "Apostrophes, on the other hand, can be much more common, especially in non-English languages where they " + + "can indicate a glottal stop" + ) + .Build(), + ], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + ] + ); + Assert.IsTrue(positivePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); + } + + [Test] + public void ThatACommonlyMidWordMarkIsAnApostrophe() + { + var negativePreliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + negativePreliminaryApostropheAnalyzer.ProcessQuotationMarks( + [ + 
new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .Build(), + ], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + ] + ); + Assert.IsFalse(negativePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); + + var positivePreliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + positivePreliminaryApostropheAnalyzer.ProcessQuotationMarks( + [ + new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .Build(), + ], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + ] + ); + Assert.IsTrue(positivePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); + } + + [Test] + public void ThatAFrequentlyOccurringCharacterIsAnApostrophe() + { + var negativePreliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + negativePreliminaryApostropheAnalyzer.ProcessQuotationMarks( + [ + new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .Build(), + ], + [ + new 
QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + ] + ); + Assert.IsFalse(negativePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); + + var positivePreliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + positivePreliminaryApostropheAnalyzer.ProcessQuotationMarks( + [new TextSegment.Builder().SetText("Very short text").Build(),], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + ] + ); + Assert.IsTrue(positivePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); + + // PreliminaryQuotationMarkAnalyzer tests + } + + [Test] + public void ThatQuotationMarkSequenceIsUsedToDetermineOpeningAndClosingQuotes() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var typewriterEnglishQuoteConvention = new QuoteConvention( + "typewriter_english", + [ + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + ] + ); + var standardFrenchQuoteConvention = new QuoteConvention( + "standard_french", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + ] + ); + + var westernEuropeanQuoteConvention = new QuoteConvention( + "western_european", + [ + new 
SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var standardSwedishQuoteConvention = new QuoteConvention( + "standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ); + + var preliminaryQuotationAnalyzer = new PreliminaryQuotationMarkAnalyzer( + new QuoteConventionSet( + [ + standardEnglishQuoteConvention, + typewriterEnglishQuoteConvention, + standardFrenchQuoteConvention, + westernEuropeanQuoteConvention, + standardSwedishQuoteConvention, + ] + ) + ); + + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("initial text \u201c quoted English text \u201d final text") + .Build() + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([standardEnglishQuoteConvention])) + ); + + preliminaryQuotationAnalyzer.Reset(); + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("initial text \u201d quoted Swedish text \u201d final text") + .Build(), + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([standardSwedishQuoteConvention])) + ); + + preliminaryQuotationAnalyzer.Reset(); + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText( + "initial text \u00ab quoted French/Western European text \u00bb final text" + ) + .Build(), + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([standardFrenchQuoteConvention, westernEuropeanQuoteConvention])) + ); + + preliminaryQuotationAnalyzer.Reset(); + Assert.That( + 
preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("initial text \" quoted typewriter English text \" final text") + .Build(), + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([typewriterEnglishQuoteConvention])) + ); + + preliminaryQuotationAnalyzer.Reset(); + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("initial text \u201c quoted English text \u201d final text") + .Build(), + new TextSegment.Builder() + .SetText("second level \u2018 English quotes \u2019") + .Build(), + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([standardEnglishQuoteConvention])) + ); + + preliminaryQuotationAnalyzer.Reset(); + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("initial text \" quoted typewriter English text \" final text") + .Build(), + new TextSegment.Builder().SetText("second level 'typewriter quotes'").Build(), + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([typewriterEnglishQuoteConvention])) + ); + + preliminaryQuotationAnalyzer.Reset(); + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("initial text \u201c quoted English text \u201d final text") + .Build(), + new TextSegment.Builder() + .SetText("the quotes \u201d in this segment \u201c are backwards") + .Build(), + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([])) + ); + + preliminaryQuotationAnalyzer.Reset(); + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText( + "first-level quotes \u2018 must be observed \u2019 to retain a quote 
convention" + ) + .Build(), + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([])) + ); + } + + [Test] + public void ThatApostrophesNotConsideredAsQuotationMarks() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var typewriterEnglishQuoteConvention = new QuoteConvention( + "typewriter_english", + [ + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + ] + ); + + var preliminaryQuotationAnalyzer = new PreliminaryQuotationMarkAnalyzer( + new QuoteConventionSet([standardEnglishQuoteConvention, typewriterEnglishQuoteConvention,]) + ); + + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("ini'tial 'text \u201c quo'ted English text' \u201d fi'nal text") + .Build() + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([standardEnglishQuoteConvention])) + ); + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkFinderTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkFinderTests.cs new file mode 100644 index 00000000..884eeee7 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkFinderTests.cs @@ -0,0 +1,427 @@ +using NUnit.Framework; + +namespace SIL.Machine.Corpora.PunctuationAnalysis; + +[TestFixture] +public class QuotationMarkFinderTests +{ + [Test] + public void ThatAllPossibleQuotationMarksAreIdentified() + { + var quotationMarkFinder = new QuotationMarkFinder(StandardQuoteConventions.QuoteConventions); + Assert.That( + quotationMarkFinder + 
.FindAllPotentialQuotationMarksInTextSegment( + new TextSegment.Builder().SetText("\u201cSample Text\u201d").Build() + ) + .SequenceEqual( + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201cSample Text\u201d").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201cSample Text\u201d").Build(), + 12, + 13 + ), + ] + ) + ); + + Assert.That( + quotationMarkFinder + .FindAllPotentialQuotationMarksInTextSegment( + new TextSegment.Builder().SetText("\"Sample Text'").Build() + ) + .SequenceEqual( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"Sample Text'").Build(), 0, 1), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\"Sample Text'").Build(), + 12, + 13 + ), + ] + ) + ); + + Assert.That( + quotationMarkFinder + .FindAllPotentialQuotationMarksInTextSegment( + new TextSegment.Builder().SetText("All \u201cthe \u2019English quotation\u2018 marks\u201d").Build() + ) + .SequenceEqual( + [ + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("All \u201cthe \u2019English quotation\u2018 marks\u201d") + .Build(), + 4, + 5 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("All \u201cthe \u2019English quotation\u2018 marks\u201d") + .Build(), + 9, + 10 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("All \u201cthe \u2019English quotation\u2018 marks\u201d") + .Build(), + 27, + 28 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("All \u201cthe \u2019English quotation\u2018 marks\u201d") + .Build(), + 34, + 35 + ) + ] + ) + ); + + Assert.That( + quotationMarkFinder + .FindAllPotentialQuotationMarksInTextSegment( + new TextSegment.Builder().SetText("All \u00abthe \u2039French quotation\u203a marks\u00bb").Build() + ) + .SequenceEqual( + [ + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("All \u00abthe \u2039French quotation\u203a marks\u00bb") + 
.Build(), + 4, + 5 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("All \u00abthe \u2039French quotation\u203a marks\u00bb") + .Build(), + 9, + 10 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("All \u00abthe \u2039French quotation\u203a marks\u00bb") + .Build(), + 26, + 27 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("All \u00abthe \u2039French quotation\u203a marks\u00bb") + .Build(), + 33, + 34 + ), + ] + ) + ); + + Assert.That( + quotationMarkFinder + .FindAllPotentialQuotationMarksInTextSegment( + new TextSegment.Builder().SetText("All \"the 'typewriter quotation marks").Build() + ) + .SequenceEqual( + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("All \"the 'typewriter quotation marks").Build(), + 4, + 5 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("All \"the 'typewriter quotation marks").Build(), + 9, + 10 + ), + ] + ) + ); + + Assert.That( + quotationMarkFinder + .FindAllPotentialQuotationMarksInTextSegment( + new TextSegment.Builder() + .SetText("This has \u201equotes from \u00bbdifferent conventions < quotationMarkStringMatches = + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("Opening “quote").Build(), 8, 9), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("Another opening ‘quote").Build(), 16, 17), + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("“‘quote continuer") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ) + ]; + + quotationMarkResolver.ResolveQuotationMarks(quotationMarkStringMatches).ToList(); + Assert.That(quotationMarkResolver.QuotationMarkResolverState.Quotations, Has.Count.GreaterThan(0)); + Assert.IsTrue(quotationMarkResolver.QuotationMarkResolverState.CurrentDepth > 0); + + quotationMarkResolver.Reset(); + + Assert.That(quotationMarkResolver.QuotationMarkResolverState.Quotations, Has.Count.EqualTo(0)); + 
Assert.That(quotationMarkResolver.QuoteContinuerState.QuoteContinuerMarks, Has.Count.EqualTo(0)); + Assert.That(quotationMarkResolver.QuotationMarkResolverState.CurrentDepth, Is.EqualTo(0)); + Assert.That(quotationMarkResolver.QuoteContinuerState.CurrentDepth, Is.EqualTo(0)); + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkStringMatchTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkStringMatchTests.cs new file mode 100644 index 00000000..60f48199 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkStringMatchTests.cs @@ -0,0 +1,746 @@ +using System.Text.RegularExpressions; +using NUnit.Framework; + +namespace SIL.Machine.Corpora.PunctuationAnalysis; + +[TestFixture] +public class QuotationMarkStringMatchTests +{ + [Test] + public void GetQuotationMark() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("quick brown fox").Build(), + 6, + 7 + ); + Assert.That(quotationMarkStringMatch.QuotationMark, Is.EqualTo("b")); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("quick brown fox").Build(), + 6, + 10 + ); + Assert.That(quotationMarkStringMatch.QuotationMark, Is.EqualTo("brow")); + + quotationMarkStringMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("q").Build(), 0, 1); + Assert.That(quotationMarkStringMatch.QuotationMark, Is.EqualTo("q")); + } + + [Test] + public void IsValidOpeningQuotationMark() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standardEnglish", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + + var quotationMarkStringMatch = new 
QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c").Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.IsValidOpeningQuotationMark(standardEnglishQuoteConventionSet)); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201d").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.IsValidOpeningQuotationMark(standardEnglishQuoteConventionSet)); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201d\u201c").Build(), + 1, + 2 + ); + Assert.IsTrue(quotationMarkStringMatch.IsValidOpeningQuotationMark(standardEnglishQuoteConventionSet)); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201d\u201c").Build(), + 0, + 2 + ); + Assert.IsFalse(quotationMarkStringMatch.IsValidOpeningQuotationMark(standardEnglishQuoteConventionSet)); + } + + [Test] + public void IsValidClosingQuotationMark() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standardEnglish", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201d").Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.IsValidClosingQuotationMark(standardEnglishQuoteConventionSet)); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.IsValidClosingQuotationMark(standardEnglishQuoteConventionSet)); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201d\u201c").Build(), + 0, 
+ 1 + ); + Assert.IsTrue(quotationMarkStringMatch.IsValidClosingQuotationMark(standardEnglishQuoteConventionSet)); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201d\u201c").Build(), + 0, + 2 + ); + Assert.IsFalse(quotationMarkStringMatch.IsValidClosingQuotationMark(standardEnglishQuoteConventionSet)); + } + + [Test] + public void DoesQuotationMarkMatch() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.QuotationMarkMatches(new Regex(@"^s$", RegexOptions.Compiled))); + Assert.IsFalse(quotationMarkStringMatch.QuotationMarkMatches(new Regex(@"a", RegexOptions.Compiled))); + Assert.IsFalse(quotationMarkStringMatch.QuotationMarkMatches(new Regex(@"sa", RegexOptions.Compiled))); + } + + [Test] + public void DoesNextCharacterMatch() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.NextCharacterMatches(new Regex(@"^s$", RegexOptions.Compiled))); + Assert.IsTrue(quotationMarkStringMatch.NextCharacterMatches(new Regex(@"a", RegexOptions.Compiled))); + Assert.IsFalse(quotationMarkStringMatch.NextCharacterMatches(new Regex(@"sa", RegexOptions.Compiled))); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.NextCharacterMatches(new Regex(@".*", RegexOptions.Compiled))); + } + + [Test] + public void DoesPreviousCharacterMatch() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 1, + 2 + ); + Assert.IsTrue(quotationMarkStringMatch.PreviousCharacterMatches(new Regex(@"^s$", RegexOptions.Compiled))); + 
Assert.IsFalse(quotationMarkStringMatch.PreviousCharacterMatches(new Regex(@"a", RegexOptions.Compiled))); + Assert.IsFalse(quotationMarkStringMatch.PreviousCharacterMatches(new Regex(@"sa", RegexOptions.Compiled))); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.PreviousCharacterMatches(new Regex(@".*", RegexOptions.Compiled))); + } + + [Test] + public void GetPreviousCharacter() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 1, + 2 + ); + Assert.That(quotationMarkStringMatch.PreviousCharacter, Is.EqualTo("s")); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 10, + 11 + ); + Assert.That(quotationMarkStringMatch.PreviousCharacter, Is.EqualTo("x")); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + ; + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d").Build(), + 1, + 2 + ); + Assert.That(quotationMarkStringMatch.PreviousCharacter, Is.EqualTo("“")); + } + + [Test] + public void GetNextCharacter() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 1, + 2 + ); + Assert.That(quotationMarkStringMatch.NextCharacter, Is.EqualTo("m")); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.That(quotationMarkStringMatch.NextCharacter, Is.EqualTo("a")); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 10, + 11 + ); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new 
TextSegment.Builder().SetText("\u201c\u201d").Build(), + 0, + 1 + ); + Assert.That(quotationMarkStringMatch.NextCharacter, Is.EqualTo("”")); + } + + [Test] + public void DoesLeadingSubstringMatch() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 5, + 6 + ); + Assert.IsTrue(quotationMarkStringMatch.LeadingSubstringMatches(new Regex(@"^sampl$", RegexOptions.Compiled))); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.LeadingSubstringMatches(new Regex(@".+", RegexOptions.Compiled))); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d").Build(), + 1, + 2 + ); + Assert.IsTrue(quotationMarkStringMatch.LeadingSubstringMatches(new Regex(@"\u201c", RegexOptions.Compiled))); + } + + [Test] + public void DoesTrailingSubstringMatch() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 5, + 6 + ); + Assert.IsTrue(quotationMarkStringMatch.TrailingSubstringMatches(new Regex(@"^ text$", RegexOptions.Compiled))); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 11, + 11 + ); //TODO 12 does not exist? 
+ Assert.IsFalse(quotationMarkStringMatch.TrailingSubstringMatches(new Regex(@".+", RegexOptions.Compiled))); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d").Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.TrailingSubstringMatches(new Regex(@"\u201d", RegexOptions.Compiled))); + } + + [Test] + public void GetContext() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("this is a bunch' of sample text").Build(), + 15, + 16 + ); + Assert.That(quotationMarkStringMatch.Context, Is.EqualTo("is a bunch' of sample")); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("this is a bunch' of sample text").Build(), + 5, + 6 + ); + Assert.That(quotationMarkStringMatch.Context, Is.EqualTo("this is a bunch'")); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("this is a bunch' of sample text").Build(), + 25, + 26 + ); + Assert.That(quotationMarkStringMatch.Context, Is.EqualTo("' of sample text")); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("short").Build(), + 3, + 4 + ); + Assert.That(quotationMarkStringMatch.Context, Is.EqualTo("short")); + } + + [Test] + public void Resolve() + { + var textSegment = new TextSegment.Builder().SetText("'").Build(); + var quotationMarkStringMatch = new QuotationMarkStringMatch(textSegment, 0, 1); + Assert.That( + quotationMarkStringMatch.Resolve(2, QuotationMarkDirection.Opening), + Is.EqualTo(new QuotationMarkMetadata("'", 2, QuotationMarkDirection.Opening, textSegment, 0, 1)) + ); + Assert.That( + quotationMarkStringMatch.Resolve(1, QuotationMarkDirection.Opening), + Is.EqualTo(new QuotationMarkMetadata("'", 1, QuotationMarkDirection.Opening, textSegment, 0, 1)) + ); + Assert.That( + quotationMarkStringMatch.Resolve(1, QuotationMarkDirection.Closing), + Is.EqualTo(new 
QuotationMarkMetadata("'", 1, QuotationMarkDirection.Closing, textSegment, 0, 1)) + ); + } + + [Test] + public void IsAtStartOfSegment() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.IsAtStartOfSegment); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 1, + 2 + ); + Assert.IsFalse(quotationMarkStringMatch.IsAtStartOfSegment); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201csample text").Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.IsAtStartOfSegment); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 15, + 16 + ); + Assert.IsFalse(quotationMarkStringMatch.IsAtStartOfSegment); + } + + [Test] + public void IsAtEndOfSegment() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 10, + 11 + ); + Assert.IsTrue(quotationMarkStringMatch.IsAtEndOfSegment); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.IsAtEndOfSegment); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201csample text\u201d").Build(), + 12, + 13 + ); + Assert.IsTrue(quotationMarkStringMatch.IsAtEndOfSegment); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 15, + 16 + ); + Assert.IsFalse(quotationMarkStringMatch.IsAtEndOfSegment); + } + + [Test] + public void HasLeadingWhitespace() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 7, + 8 + ); + 
Assert.IsTrue(quotationMarkStringMatch.HasLeadingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample\ttext").Build(), + 7, + 8 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.HasLeadingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").AddPrecedingMarker(UsfmMarkerType.Paragraph).Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").AddPrecedingMarker(UsfmMarkerType.Embed).Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").AddPrecedingMarker(UsfmMarkerType.Verse).Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").AddPrecedingMarker(UsfmMarkerType.Chapter).Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.HasLeadingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").AddPrecedingMarker(UsfmMarkerType.Character).Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.HasLeadingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201csample text").AddPrecedingMarker(UsfmMarkerType.Verse).Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingWhitespace()); + } + + [Test] + public void HasTrailingWhitespace() + { + 
var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 5, + 6 + ); + Assert.IsTrue(quotationMarkStringMatch.HasTrailingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample\ttext").Build(), + 5, + 6 + ); + Assert.IsTrue(quotationMarkStringMatch.HasTrailingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.HasTrailingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").AddPrecedingMarker(UsfmMarkerType.Paragraph).Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.HasTrailingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").AddPrecedingMarker(UsfmMarkerType.Embed).Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.HasTrailingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").AddPrecedingMarker(UsfmMarkerType.Verse).Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.HasTrailingWhitespace()); + } + + [Test] + public void HasLeadingPunctuation() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample)\u201d text").Build(), + 7, + 8 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingPunctuation()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample) \u201d text").Build(), + 8, + 9 + ); + Assert.IsFalse(quotationMarkStringMatch.HasLeadingPunctuation()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample,\u201d text").Build(), + 7, + 8 + ); + 
Assert.IsTrue(quotationMarkStringMatch.HasLeadingPunctuation()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample.\u201d text").Build(), + 7, + 8 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingPunctuation()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201csample text").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.HasLeadingPunctuation()); + } + + [Test] + public void HasTrailingPunctuation() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample \u201c-text").Build(), + 7, + 8 + ); + Assert.IsTrue(quotationMarkStringMatch.HasTrailingPunctuation()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample \u201c text").Build(), + 7, + 8 + ); + Assert.IsFalse(quotationMarkStringMatch.HasTrailingPunctuation()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text\u201d").Build(), + 11, + 12 + ); + Assert.IsFalse(quotationMarkStringMatch.HasTrailingPunctuation()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample', text\u201d").Build(), + 6, + 7 + ); + Assert.IsTrue(quotationMarkStringMatch.HasTrailingPunctuation()); + } + + [Test] + public void HasLetterInLeadingSubstring() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 1, + 2 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLetterInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("ꮪample text").Build(), + 1, + 2 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLetterInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + 
); + Assert.IsFalse(quotationMarkStringMatch.HasLetterInLeadingSubstring()); + } + + [Test] + public void HasLetterInTrailingSubstring() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 9, + 10 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLetterInTrailingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample tex𑢼").Build(), + 9, + 10 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLetterInTrailingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.HasLetterInTrailingSubstring()); + } + + [Test] + public void HasLeadingLatinLetter() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 1, + 2 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingLatinLetter()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("5ample text").Build(), + 1, + 2 + ); + Assert.IsFalse(quotationMarkStringMatch.HasLeadingLatinLetter()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("Sample text").Build(), + 1, + 2 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingLatinLetter()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.HasLeadingLatinLetter()); + } + + [Test] + public void HasTrailingLatinLetter() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 9, + 10 + ); + Assert.IsTrue(quotationMarkStringMatch.HasTrailingLatinLetter()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new 
TextSegment.Builder().SetText("sample texT").Build(), + 9, + 10 + ); + Assert.IsTrue(quotationMarkStringMatch.HasTrailingLatinLetter()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.HasTrailingLatinLetter()); + } + + [Test] + public void HasQuoteIntroducerInLeadingSubstring() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample, \u201ctext").Build(), + 8, + 9 + ); + Assert.IsTrue(quotationMarkStringMatch.HasQuoteIntroducerInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample,\u201ctext").Build(), + 7, + 8 + ); + Assert.IsTrue(quotationMarkStringMatch.HasQuoteIntroducerInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample: \u201ctext").Build(), + 8, + 9 + ); + Assert.IsTrue(quotationMarkStringMatch.HasQuoteIntroducerInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample:\u201ctext").Build(), + 7, + 8 + ); + Assert.IsTrue(quotationMarkStringMatch.HasQuoteIntroducerInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample, \u201ctext").Build(), + 9, + 10 + ); + Assert.IsTrue(quotationMarkStringMatch.HasQuoteIntroducerInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample,, \u201ctext").Build(), + 9, + 10 + ); + Assert.IsTrue(quotationMarkStringMatch.HasQuoteIntroducerInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample, a \u201ctext").Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.HasQuoteIntroducerInLeadingSubstring()); + + 
quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample, text").Build(), + 8, + 9 + ); + Assert.IsTrue(quotationMarkStringMatch.HasQuoteIntroducerInLeadingSubstring()); + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkTabulatorTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkTabulatorTests.cs new file mode 100644 index 00000000..74358d23 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkTabulatorTests.cs @@ -0,0 +1,215 @@ +using NUnit.Framework; + +namespace SIL.Machine.Corpora.PunctuationAnalysis; + +[TestFixture] +public class QuotationMarkTabulatorTests +{ + [Test] + public void GetObservedCount() + { + var counts = new QuotationMarkCounts(); + ; + Assert.That(counts.TotalCount, Is.EqualTo(0)); + + counts.CountQuotationMark("\""); + Assert.That(counts.TotalCount, Is.EqualTo(1)); + + counts.CountQuotationMark("\""); + Assert.That(counts.TotalCount, Is.EqualTo(2)); + + counts.CountQuotationMark("'"); + Assert.That(counts.TotalCount, Is.EqualTo(3)); + } + + [Test] + public void GetBestProportion() + { + var counts = new QuotationMarkCounts(); + counts.CountQuotationMark("\""); + counts.CountQuotationMark("\""); + counts.CountQuotationMark("'"); + + (string bestStr, int bestCount, int totalCount) = counts.FindBestQuotationMarkProportion(); + Assert.That(bestStr, Is.EqualTo("\"")); + Assert.That(bestCount, Is.EqualTo(2)); + Assert.That(totalCount, Is.EqualTo(3)); + + counts.CountQuotationMark("'"); + counts.CountQuotationMark("'"); + + (bestStr, bestCount, totalCount) = counts.FindBestQuotationMarkProportion(); + Assert.That(bestStr, Is.EqualTo("'")); + Assert.That(bestCount, Is.EqualTo(3)); + Assert.That(totalCount, Is.EqualTo(5)); + } + + [Test] + public void CalculateNumDifferences() + { + var counts = new QuotationMarkCounts(); + counts.CountQuotationMark("\""); + counts.CountQuotationMark("\""); + 
counts.CountQuotationMark("'"); + + Assert.That(counts.CalculateNumDifferences("\""), Is.EqualTo(1)); + Assert.That(counts.CalculateNumDifferences("'"), Is.EqualTo(2)); + Assert.That(counts.CalculateNumDifferences("\u201c"), Is.EqualTo(3)); + + counts.CountQuotationMark("'"); + Assert.That(counts.CalculateNumDifferences("\""), Is.EqualTo(2)); + Assert.That(counts.CalculateNumDifferences("'"), Is.EqualTo(2)); + Assert.That(counts.CalculateNumDifferences("\u201c"), Is.EqualTo(4)); + + // QuotationMarkTabulator tests + } + + [Test] + public void CalculateSimilarity() + { + var singleLevelQuotationMarkTabulator = new QuotationMarkTabulator(); + singleLevelQuotationMarkTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u201d", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 0, + 1 + ), + ] + ); + + Assert.That( + singleLevelQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention("", [new SingleLevelQuoteConvention("\u201c", "\u201d")]) + ), + Is.EqualTo(1.0) + ); + ; + Assert.That( + singleLevelQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention("", [new SingleLevelQuoteConvention("\u201d", "\u201c")]) + ), + Is.EqualTo(0.0) + ); + ; + Assert.That( + singleLevelQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention("", [new SingleLevelQuoteConvention("\u201c", "\"")]) + ), + Is.EqualTo(0.5) + ); + ; + Assert.That( + singleLevelQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention( + "", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb") + ] + ) + ), + Is.EqualTo(1.0) + ); + + var emptyQuotationMarkTabulator = new QuotationMarkTabulator(); + Assert.That( + emptyQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention("", [new SingleLevelQuoteConvention("\u201c", "\u201d")]) + ), + 
Is.EqualTo(0.0) + ); + var twoLevelQuotationMarkTabulator = new QuotationMarkTabulator(); + twoLevelQuotationMarkTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u201d", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u2018", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 2 + ), + new QuotationMarkMetadata( + "\u2019", + 2, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 0, + 2 + ), + ] + ); + Assert.That( + twoLevelQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention("", [new SingleLevelQuoteConvention("\u201c", "\u201d")]) + ), + Is.EqualTo(0.66666666666667).Within(1e-9) + ); + Assert.That( + twoLevelQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention( + "", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019") + ] + ) + ), + Is.EqualTo(1.0) + ); + Assert.That( + twoLevelQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention( + "", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb") + ] + ) + ), + Is.EqualTo(0.66666666666667).Within(1e-9) + ); + Assert.That( + twoLevelQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention( + "", + [ + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u2018", "\u2019") + ] + ) + ), + Is.EqualTo(0.33333333333333).Within(1e-9) + ); + // + // + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionDetectorTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionDetectorTests.cs new file mode 100644 index 00000000..d54e9412 --- /dev/null +++ 
b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionDetectorTests.cs @@ -0,0 +1,372 @@ +using NUnit.Framework; + +namespace SIL.Machine.Corpora.PunctuationAnalysis; + +[TestFixture] +public class QuoteConventionDetectorTests +{ + // Text comes from the World English Bible, which is in the public domain. + [Test] + public void StandardEnglish() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_english")); + } + + [Test] + public void TypewriterEnglish() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + 'You shall not eat of any tree of the garden'?\"" + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("typewriter_english")); + } + + [Test] + public void BritishEnglish() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ‘Has God really said, + “You shall not eat of any tree of the garden”?’ + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("british_english")); + } + + [Test] + public void BritishTypewriterEnglish() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, 'Has God really said, + ""You shall not eat of any tree of the garden""?' 
+ "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("british_typewriter_english")); + } + + [Test] + public void HybridTypewriterEnglish() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + 'You shall not eat of any tree of the garden'?” + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("hybrid_typewriter_english")); + } + + [Test] + public void StandardFrench() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + ‹You shall not eat of any tree of the garden›?» + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_french")); + } + + [Test] + public void TypewriterFrench() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, <<Has God really said, + <You shall not eat of any tree of the garden>?>> + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("typewriter_french")); + } + + // frenchVariant requires a 3rd-level of quotes to differentiate from standardFrench + [Test] + public void WesternEuropean() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, «Has God really said, + “You shall not eat of any tree of the garden”?» + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("western_european")); + } + + [Test] + public void BritishInspiredWesternEuropean() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + ‘You shall not eat of any tree of the garden’?» + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("british_inspired_western_european")); + } + + [Test] + public void TypewriterWesternEuropean() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, <<Has God really said, + ""You shall not eat of any tree of the garden""?>> + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("typewriter_western_european")); + } + + [Test] + public void TypewriterWesternEuropeanVariant() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + <You shall not eat of any tree of the garden>?"" + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("typewriter_western_european_variant")); + } + + [Test] + public void HybridTypewriterWesternEuropean() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, «Has God really said, + ""You shall not eat of any tree of the garden""?» + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("hybrid_typewriter_western_european")); + } + + [Test] + public void HybridBritishTypewriterWesternEuropean() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + 'You shall not eat of any tree of the garden'?» + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("hybrid_british_typewriter_western_european")); + } + + [Test] + public void CentralEuropean() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, „Has God really said, + ‚You shall not eat of any tree of the garden‘?“ + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("central_european")); + } + + [Test] + public void CentralEuropeanGuillemets() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, »Has God really said, + ›You shall not eat of any tree of the garden‹?« + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("central_european_guillemets")); + } + + [Test] + public void StandardSwedish() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, ”Has God really said, + ’You shall not eat of any tree of the garden’?” + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_swedish")); + } + + [Test] + public void StandardFinnish() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, »Has God really said, + ’You shall not eat of any tree of the garden’?» + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_finnish")); + } + + [Test] + public void EasternEuropean() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, „Has God really said, + ‚You shall not eat of any tree of the garden’?” + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("eastern_european")); + } + + [Test] + public void StandardRussian() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + „You shall not eat of any tree of the garden“?» + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_russian")); + } + + [Test] + public void StandardArabic() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, ”Has God really said, + ’You shall not eat of any tree of the garden‘?“ + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_arabic")); + } + + [Test] + public void NonStandardArabic() + { + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + ’You shall not eat of any tree of the garden‘?» + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("non-standard_arabic")); + } + + [Test] + public void MismatchedQuotationMarks() + { /* The quotation opened in verse 2 is never closed; the detector is still expected to report standard_english. */ + var usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + \v 2 The woman said to the serpent, + “We may eat fruit from the trees of the garden, + \v 3 but not the fruit of the tree which is in the middle of the garden. + God has said, ‘You shall not eat of it. 
You shall not touch it, lest you die.’ + "; + var analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_english")); + } + + public QuoteConventionAnalysis DetectQuotationConvention(string usfm) + { + var quoteConventionDetector = new QuoteConventionDetector(); + UsfmParser.Parse(usfm, quoteConventionDetector); + return quoteConventionDetector.DetectQuotationConvention(); + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionSetTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionSetTests.cs new file mode 100644 index 00000000..3206cf3e --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionSetTests.cs @@ -0,0 +1,1923 @@ +using System.Diagnostics.CodeAnalysis; +using System.Text.RegularExpressions; +using NUnit.Framework; + +namespace SIL.Machine.Corpora.PunctuationAnalysis; + +[TestFixture] +public class QuoteConventionSetTests +{ + [Test] + public void QuoteRegexes() + { + var emptyQuoteConventionSet = new QuoteConventionSet([]); + Assert.That( + emptyQuoteConventionSet.OpeningQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"", RegexOptions.Compiled).ToString()) + ); + Assert.That( + emptyQuoteConventionSet.ClosingQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"", RegexOptions.Compiled).ToString()) + ); + Assert.That( + emptyQuoteConventionSet.AllQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"", RegexOptions.Compiled).ToString()) + ); + + var quoteConventionSetWithEmptyConventions = new QuoteConventionSet( + [new QuoteConvention("empty convention 1", []), new QuoteConvention("empty convention 2", [])] + ); + Assert.That( + quoteConventionSetWithEmptyConventions.OpeningQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"", RegexOptions.Compiled).ToString()) + ); + Assert.That( + quoteConventionSetWithEmptyConventions.ClosingQuotationMarkRegex.ToString(), + 
Is.EqualTo(new Regex(@"", RegexOptions.Compiled).ToString()) + ); + Assert.That( + quoteConventionSetWithEmptyConventions.AllQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"", RegexOptions.Compiled).ToString()) + ); + + var standardEnglishQuoteConventionSet = new QuoteConventionSet( + [ + new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ) + ] + ); + Assert.That( + standardEnglishQuoteConventionSet.OpeningQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[‘“]", RegexOptions.Compiled).ToString()) + ); + Assert.That( + standardEnglishQuoteConventionSet.ClosingQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[’”]", RegexOptions.Compiled).ToString()) + ); + Assert.That( + standardEnglishQuoteConventionSet.AllQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[‘’“”]", RegexOptions.Compiled).ToString()) + ); + + var westernEuropeanQuoteConventionSet = new QuoteConventionSet( + [ + new QuoteConvention( + "western_european", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ), + ] + ); + Assert.That( + westernEuropeanQuoteConventionSet.OpeningQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[‘“«]", RegexOptions.Compiled).ToString()) + ); + Assert.That( + westernEuropeanQuoteConventionSet.ClosingQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[’”»]", RegexOptions.Compiled).ToString()) + ); + Assert.That( + westernEuropeanQuoteConventionSet.AllQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[‘’“”«»]", RegexOptions.Compiled).ToString()) + ); + + var multipleQuoteConventionSet = new QuoteConventionSet( + [ + new QuoteConvention( + "standard_english", + [ + new 
SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ), + new QuoteConvention( + "typewriter_french", + [ + new SingleLevelQuoteConvention("<<", ">>"), + new SingleLevelQuoteConvention("<", ">"), + new SingleLevelQuoteConvention("<<", ">>"), + new SingleLevelQuoteConvention("<", ">"), + ] + ), + new QuoteConvention( + "standard_french", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + ] + ), + ] + ); + Assert.That( + multipleQuoteConventionSet.OpeningQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[‘‹“«<<<]", RegexOptions.Compiled).ToString()) + ); + Assert.That( + multipleQuoteConventionSet.ClosingQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[’›”»>>>]", RegexOptions.Compiled).ToString()) + ); + Assert.That( + multipleQuoteConventionSet.AllQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[‘’‹›“”«»<<<>>>]", RegexOptions.Compiled).ToString()) + ); + } + + [Test] + public void QuotationMarkPairMap() + { + var emptyQuoteConventionSet = new QuoteConventionSet([]); + Assert.That(emptyQuoteConventionSet.OpeningMarksByClosingMark, Has.Count.EqualTo(0)); + Assert.That(emptyQuoteConventionSet.ClosingMarksByOpeningMark, Has.Count.EqualTo(0)); + + var quoteConventionSetWithEmptyConventions = new QuoteConventionSet( + [new QuoteConvention("empty convention 1", []), new QuoteConvention("empty convention 2", [])] + ); + Assert.That(quoteConventionSetWithEmptyConventions.OpeningMarksByClosingMark, Has.Count.EqualTo(0)); + Assert.That(quoteConventionSetWithEmptyConventions.ClosingMarksByOpeningMark, Has.Count.EqualTo(0)); + + var standardEnglishQuoteConventionSet = new QuoteConventionSet( + [ + new QuoteConvention( + 
"standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ) + ] + ); + Assert.That( + standardEnglishQuoteConventionSet + .OpeningMarksByClosingMark.OrderBy(kvp => kvp.Key) + .SequenceEqual( + new Dictionary> { { "’", ["‘"] }, { "”", ["“"] } }.OrderBy(kvp => kvp.Key), + new QuotationMarkPairMapEqualityComparer() + ) + ); + Assert.That( + standardEnglishQuoteConventionSet + .ClosingMarksByOpeningMark.OrderBy(kvp => kvp.Key) + .SequenceEqual( + new Dictionary> { { "‘", ["’"] }, { "“", ["”"] } }.OrderBy(kvp => kvp.Key), + new QuotationMarkPairMapEqualityComparer() + ) + ); + + var westernEuropeanQuoteConventionSet = new QuoteConventionSet( + [ + new QuoteConvention( + "western_european", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ), + ] + ); + Assert.That( + westernEuropeanQuoteConventionSet + .OpeningMarksByClosingMark.OrderBy(kvp => kvp.Key) + .SequenceEqual( + new Dictionary> + { + { "’", ["‘"] }, + { "”", ["“"] }, + { "»", ["«"] } + }.OrderBy(kvp => kvp.Key), + new QuotationMarkPairMapEqualityComparer() + ) + ); + Assert.That( + westernEuropeanQuoteConventionSet + .ClosingMarksByOpeningMark.OrderBy(kvp => kvp.Key) + .SequenceEqual( + new Dictionary> + { + { "‘", ["’"] }, + { "“", ["”"] }, + { "«", ["»"] } + }.OrderBy(kvp => kvp.Key), + new QuotationMarkPairMapEqualityComparer() + ) + ); + + var multipleQuoteConventionSet = new QuoteConventionSet( + [ + new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ), + new QuoteConvention( + 
"central_european", + [ + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + ] + ), + new QuoteConvention( + "standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ), + ] + ); + Assert.That( + multipleQuoteConventionSet + .ClosingMarksByOpeningMark.OrderBy(kvp => kvp.Key) + .SequenceEqual( + new Dictionary> + { + { "‘", ["’"] }, + { "“", ["”"] }, + { "„", ["“"] }, + { "‚", ["‘"] }, + { "”", ["”"] }, + { "’", ["’"] }, + }.OrderBy(kvp => kvp.Key), + new QuotationMarkPairMapEqualityComparer() + ) + ); + Assert.That( + multipleQuoteConventionSet + .OpeningMarksByClosingMark.OrderBy(kvp => kvp.Key) + .SequenceEqual( + new Dictionary> + { + { "’", ["‘", "’"] }, + { "”", ["“", "”"] }, + { "“", ["„"] }, + { "‘", ["‚"] }, + }.OrderBy(kvp => kvp.Key), + new QuotationMarkPairMapEqualityComparer() + ) + ); + } + + [Test] + public void GetQuoteConventionByName() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var centralEuropeanQuoteConvention = new QuoteConvention( + "central_european", + [ + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + ] + ); + + var standardSwedishQuoteConvention = new QuoteConvention( + "standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new 
SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ); + var multipleQuoteConventionSet = new QuoteConventionSet( + [standardEnglishQuoteConvention, centralEuropeanQuoteConvention, standardSwedishQuoteConvention] + ); + + Assert.That( + multipleQuoteConventionSet.GetQuoteConventionByName("standard_english"), + Is.EqualTo(standardEnglishQuoteConvention) + ); + Assert.That( + multipleQuoteConventionSet.GetQuoteConventionByName("central_european"), + Is.EqualTo(centralEuropeanQuoteConvention) + ); + Assert.That( + multipleQuoteConventionSet.GetQuoteConventionByName("standard_swedish"), + Is.EqualTo(standardSwedishQuoteConvention) + ); + Assert.IsNull(multipleQuoteConventionSet.GetQuoteConventionByName("undefined convention")); + } + + [Test] + public void GetAllQuoteConventionNames() + { + Assert.That(new QuoteConventionSet([]).GetAllQuoteConventionNames(), Has.Count.EqualTo(0)); + Assert.That( + new QuoteConventionSet([new QuoteConvention("conv", [])]) + .GetAllQuoteConventionNames() + .SequenceEqual(["conv"]) + ); + Assert.That( + new QuoteConventionSet([new QuoteConvention("conv1", []), new QuoteConvention("conv2", [])]) + .GetAllQuoteConventionNames() + .SequenceEqual(["conv1", "conv2"]) + ); + Assert.That( + new QuoteConventionSet([new QuoteConvention("conv2", []), new QuoteConvention("conv1", [])]) + .GetAllQuoteConventionNames() + .SequenceEqual(["conv1", "conv2"]) + ); + } + + [Test] + public void GetPossibleOpeningQuotationMarks() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var centralEuropeanQuoteConvention = new QuoteConvention( + "central_european", + [ + new 
SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + ] + ); + + var standardSwedishQuoteConvention = new QuoteConvention( + "standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ); + + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + Assert.That(standardEnglishQuoteConventionSet.GetPossibleOpeningQuotationMarks().SequenceEqual(["‘", "“"])); + + var centralEuropeanQuoteConventionSet = new QuoteConventionSet([centralEuropeanQuoteConvention]); + Assert.That(centralEuropeanQuoteConventionSet.GetPossibleOpeningQuotationMarks().SequenceEqual(["‚", "„"])); + + var standardSwedishQuoteConventionSet = new QuoteConventionSet([standardSwedishQuoteConvention]); + Assert.That(standardSwedishQuoteConventionSet.GetPossibleOpeningQuotationMarks().SequenceEqual(["’", "”"])); + + var multipleQuoteConventionSet = new QuoteConventionSet( + [standardEnglishQuoteConvention, centralEuropeanQuoteConvention, standardSwedishQuoteConvention] + ); + Assert.That( + multipleQuoteConventionSet.GetPossibleOpeningQuotationMarks().SequenceEqual(["‘", "’", "‚", "“", "”", "„"]) + ); + } + + [Test] + public void GetPossibleClosingQuotationMarks() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var centralEuropeanQuoteConvention = new QuoteConvention( + "central_european", + [ + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new 
SingleLevelQuoteConvention("\u201a", "\u2018"), + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + ] + ); + + var standardSwedishQuoteConvention = new QuoteConvention( + "standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ); + + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + Assert.That(standardEnglishQuoteConventionSet.GetPossibleClosingQuotationMarks().SequenceEqual(["’", "”"])); + + var centralEuropeanQuoteConventionSet = new QuoteConventionSet([centralEuropeanQuoteConvention]); + Assert.That(centralEuropeanQuoteConventionSet.GetPossibleClosingQuotationMarks().SequenceEqual(["‘", "“"])); + + var standardSwedishQuoteConventionSet = new QuoteConventionSet([standardSwedishQuoteConvention]); + Assert.That(standardSwedishQuoteConventionSet.GetPossibleClosingQuotationMarks().SequenceEqual(["’", "”"])); + + var multipleQuoteConventionSet = new QuoteConventionSet( + [standardEnglishQuoteConvention, centralEuropeanQuoteConvention, standardSwedishQuoteConvention] + ); + Assert.That(multipleQuoteConventionSet.GetPossibleClosingQuotationMarks().SequenceEqual(["‘", "’", "“", "”"])); + } + + [Test] + public void IsOpeningQuotationMark() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var centralEuropeanQuoteConvention = new QuoteConvention( + "central_european", + [ + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + new 
SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + ] + ); + + var standardSwedishQuoteConvention = new QuoteConvention( + "standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ); + + var standardFrenchQuoteConvention = new QuoteConvention( + "standard_french", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + ] + ); + + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + Assert.IsTrue(standardEnglishQuoteConventionSet.IsValidOpeningQuotationMark("‘")); + Assert.IsTrue(standardEnglishQuoteConventionSet.IsValidOpeningQuotationMark("“")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsValidOpeningQuotationMark("”")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsValidOpeningQuotationMark("’")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsValidOpeningQuotationMark("")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsValidOpeningQuotationMark("‘“")); + + var centralEuropeanQuoteConventionSet = new QuoteConventionSet([centralEuropeanQuoteConvention]); + Assert.IsTrue(centralEuropeanQuoteConventionSet.IsValidOpeningQuotationMark("‚")); + Assert.IsTrue(centralEuropeanQuoteConventionSet.IsValidOpeningQuotationMark("„")); + Assert.IsFalse(centralEuropeanQuoteConventionSet.IsValidOpeningQuotationMark("‘")); + Assert.IsFalse(centralEuropeanQuoteConventionSet.IsValidOpeningQuotationMark("“")); + + var standardSwedishQuoteConventionSet = new QuoteConventionSet([standardSwedishQuoteConvention]); + Assert.IsTrue(standardSwedishQuoteConventionSet.IsValidOpeningQuotationMark("’")); + 
Assert.IsTrue(standardSwedishQuoteConventionSet.IsValidOpeningQuotationMark("”")); + + var standardFrenchQuoteConventionSet = new QuoteConventionSet([standardFrenchQuoteConvention]); + Assert.IsTrue(standardFrenchQuoteConventionSet.IsValidOpeningQuotationMark("«")); + Assert.IsTrue(standardFrenchQuoteConventionSet.IsValidOpeningQuotationMark("‹")); + Assert.IsFalse(standardFrenchQuoteConventionSet.IsValidOpeningQuotationMark("»")); + Assert.IsFalse(standardFrenchQuoteConventionSet.IsValidOpeningQuotationMark("›")); + + var multipleQuoteConventionSet = new QuoteConventionSet( + [ + standardEnglishQuoteConvention, + centralEuropeanQuoteConvention, + standardSwedishQuoteConvention, + standardFrenchQuoteConvention, + ] + ); + Assert.That( + multipleQuoteConventionSet + .GetPossibleOpeningQuotationMarks() + .SequenceEqual(["‘", "’", "‚", "‹", "“", "”", "„", "«"]) + ); + Assert.IsTrue(multipleQuoteConventionSet.IsValidOpeningQuotationMark("‘")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidOpeningQuotationMark("’")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidOpeningQuotationMark("‚")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidOpeningQuotationMark("“")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidOpeningQuotationMark("”")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidOpeningQuotationMark("„")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidOpeningQuotationMark("«")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidOpeningQuotationMark("‹")); + Assert.IsFalse(multipleQuoteConventionSet.IsValidOpeningQuotationMark("»")); + Assert.IsFalse(multipleQuoteConventionSet.IsValidOpeningQuotationMark("›")); + } + + [Test] + public void IsClosingQuotationMark() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new 
SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var centralEuropeanQuoteConvention = new QuoteConvention( + "central_european", + [ + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + ] + ); + + var standardSwedishQuoteConvention = new QuoteConvention( + "standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ); + + var standardFrenchQuoteConvention = new QuoteConvention( + "standard_french", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + ] + ); + + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + Assert.IsTrue(standardEnglishQuoteConventionSet.IsValidClosingQuotationMark("”")); + Assert.IsTrue(standardEnglishQuoteConventionSet.IsValidClosingQuotationMark("’")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsValidClosingQuotationMark("‘")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsValidClosingQuotationMark("“")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsValidClosingQuotationMark("")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsValidClosingQuotationMark("”’")); + + var centralEuropeanQuoteConventionSet = new QuoteConventionSet([centralEuropeanQuoteConvention]); + Assert.IsTrue(centralEuropeanQuoteConventionSet.IsValidClosingQuotationMark("‘")); + Assert.IsTrue(centralEuropeanQuoteConventionSet.IsValidClosingQuotationMark("“")); + Assert.IsFalse(centralEuropeanQuoteConventionSet.IsValidClosingQuotationMark("„")); + 
Assert.IsFalse(centralEuropeanQuoteConventionSet.IsValidClosingQuotationMark("‚")); + + var standardSwedishQuoteConventionSet = new QuoteConventionSet([standardSwedishQuoteConvention]); + Assert.IsTrue(standardSwedishQuoteConventionSet.IsValidClosingQuotationMark("’")); + Assert.IsTrue(standardSwedishQuoteConventionSet.IsValidClosingQuotationMark("”")); + + var standardFrenchQuoteConventionSet = new QuoteConventionSet([standardFrenchQuoteConvention]); + Assert.IsTrue(standardFrenchQuoteConventionSet.IsValidClosingQuotationMark("»")); + Assert.IsTrue(standardFrenchQuoteConventionSet.IsValidClosingQuotationMark("›")); + Assert.IsFalse(standardFrenchQuoteConventionSet.IsValidClosingQuotationMark("«")); + Assert.IsFalse(standardFrenchQuoteConventionSet.IsValidClosingQuotationMark("‹")); + + var multipleQuoteConventionSet = new QuoteConventionSet( + [ + standardEnglishQuoteConvention, + centralEuropeanQuoteConvention, + standardSwedishQuoteConvention, + standardFrenchQuoteConvention, + ] + ); + Assert.That( + multipleQuoteConventionSet.GetPossibleClosingQuotationMarks().SequenceEqual(["‘", "’", "›", "“", "”", "»"]) + ); + Assert.IsTrue(multipleQuoteConventionSet.IsValidClosingQuotationMark("‘")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidClosingQuotationMark("’")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidClosingQuotationMark("“")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidClosingQuotationMark("”")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidClosingQuotationMark("»")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidClosingQuotationMark("›")); + Assert.IsFalse(multipleQuoteConventionSet.IsValidClosingQuotationMark("«")); + Assert.IsFalse(multipleQuoteConventionSet.IsValidClosingQuotationMark("‹")); + } + + [Test] + public void AreMarksAValidPair() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", 
"\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var centralEuropeanQuoteConvention = new QuoteConvention( + "central_european", + [ + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + ] + ); + + var standardSwedishQuoteConvention = new QuoteConvention( + "standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ); + + var standardFrenchQuoteConvention = new QuoteConvention( + "standard_french", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + ] + ); + + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + Assert.IsTrue(standardEnglishQuoteConventionSet.MarksAreAValidPair("“", "”")); + Assert.IsFalse(standardEnglishQuoteConventionSet.MarksAreAValidPair("”", "“")); + Assert.IsTrue(standardEnglishQuoteConventionSet.MarksAreAValidPair("‘", "’")); + Assert.IsFalse(standardEnglishQuoteConventionSet.MarksAreAValidPair("’", "‘")); + Assert.IsFalse(standardEnglishQuoteConventionSet.MarksAreAValidPair("‘", "”")); + Assert.IsFalse(standardEnglishQuoteConventionSet.MarksAreAValidPair("‘", "”")); + Assert.IsFalse(standardEnglishQuoteConventionSet.MarksAreAValidPair("‘", "")); + Assert.IsFalse(standardEnglishQuoteConventionSet.MarksAreAValidPair("", "")); + + var centralEuropeanQuoteConventionSet = new QuoteConventionSet([centralEuropeanQuoteConvention]); + Assert.IsTrue(centralEuropeanQuoteConventionSet.MarksAreAValidPair("„", 
"“")); + Assert.IsTrue(centralEuropeanQuoteConventionSet.MarksAreAValidPair("‚", "‘")); + Assert.IsFalse(centralEuropeanQuoteConventionSet.MarksAreAValidPair("“", "„")); + Assert.IsFalse(centralEuropeanQuoteConventionSet.MarksAreAValidPair("’", "‚")); + Assert.IsFalse(centralEuropeanQuoteConventionSet.MarksAreAValidPair("‚", "“")); + Assert.IsFalse(centralEuropeanQuoteConventionSet.MarksAreAValidPair("‚", "’")); + + var standardSwedishQuoteConventionSet = new QuoteConventionSet([standardSwedishQuoteConvention]); + Assert.IsTrue(standardSwedishQuoteConventionSet.MarksAreAValidPair("”", "”")); + Assert.IsTrue(standardSwedishQuoteConventionSet.MarksAreAValidPair("’", "’")); + Assert.IsFalse(standardSwedishQuoteConventionSet.MarksAreAValidPair("”", "’")); + Assert.IsFalse(standardSwedishQuoteConventionSet.MarksAreAValidPair("’", "”")); + + var standardFrenchQuoteConventionSet = new QuoteConventionSet([standardFrenchQuoteConvention]); + Assert.IsTrue(standardFrenchQuoteConventionSet.MarksAreAValidPair("«", "»")); + Assert.IsTrue(standardFrenchQuoteConventionSet.MarksAreAValidPair("‹", "›")); + Assert.IsFalse(standardFrenchQuoteConventionSet.MarksAreAValidPair("«", "›")); + Assert.IsFalse(standardFrenchQuoteConventionSet.MarksAreAValidPair("‹", "»")); + + var multipleQuoteConventionSet = new QuoteConventionSet( + [ + standardEnglishQuoteConvention, + centralEuropeanQuoteConvention, + standardSwedishQuoteConvention, + standardFrenchQuoteConvention, + ] + ); + Assert.IsTrue(multipleQuoteConventionSet.MarksAreAValidPair("“", "”")); + Assert.IsTrue(multipleQuoteConventionSet.MarksAreAValidPair("‘", "’")); + Assert.IsTrue(multipleQuoteConventionSet.MarksAreAValidPair("„", "“")); + Assert.IsTrue(multipleQuoteConventionSet.MarksAreAValidPair("‚", "‘")); + Assert.IsTrue(multipleQuoteConventionSet.MarksAreAValidPair("”", "”")); + Assert.IsTrue(multipleQuoteConventionSet.MarksAreAValidPair("’", "’")); + Assert.IsTrue(multipleQuoteConventionSet.MarksAreAValidPair("«", "»")); + 
Assert.IsTrue(multipleQuoteConventionSet.MarksAreAValidPair("‹", "›")); + Assert.IsFalse(multipleQuoteConventionSet.MarksAreAValidPair("‹", "»")); + Assert.IsFalse(multipleQuoteConventionSet.MarksAreAValidPair("‹", "”")); + Assert.IsFalse(multipleQuoteConventionSet.MarksAreAValidPair("„", "”")); + Assert.IsFalse(multipleQuoteConventionSet.MarksAreAValidPair("’", "‘")); + } + + [Test] + public void IsQuotationMarkDirectionAmbiguous() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var typewriterEnglishQuoteConvention = new QuoteConvention( + "typewriter_english", + [ + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + ] + ); + + var centralEuropeanQuoteConvention = new QuoteConvention( + "central_european", + [ + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + ] + ); + + var standardSwedishQuoteConvention = new QuoteConvention( + "standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ); + + var easternEuropeanQuoteConvention = new QuoteConvention( + "eastern_european", + [ + new SingleLevelQuoteConvention("\u201e", "\u201d"), + new SingleLevelQuoteConvention("\u201a", "\u2019"), + new SingleLevelQuoteConvention("\u201e", "\u201d"), + new SingleLevelQuoteConvention("\u201a", "\u2019"), + ] + ); + + var 
standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("“")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("”")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("‘")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("’")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("\"")); + + var typewriterEnglishQuoteConventionSet = new QuoteConventionSet([typewriterEnglishQuoteConvention]); + Assert.IsTrue(typewriterEnglishQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("\"")); + Assert.IsTrue(typewriterEnglishQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("'")); + Assert.IsFalse(typewriterEnglishQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("‘")); + Assert.IsFalse(typewriterEnglishQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("’")); + Assert.IsFalse(typewriterEnglishQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("«")); + + var centralEuropeanQuoteConventionSet = new QuoteConventionSet([centralEuropeanQuoteConvention]); + Assert.IsFalse(centralEuropeanQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("“")); + Assert.IsFalse(centralEuropeanQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("„")); + Assert.IsFalse(centralEuropeanQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("‘")); + Assert.IsFalse(centralEuropeanQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("‚")); + + var standardSwedishQuoteConventionSet = new QuoteConventionSet([standardSwedishQuoteConvention]); + Assert.IsTrue(standardSwedishQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("”")); + Assert.IsTrue(standardSwedishQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("’")); + + var easternEuropeanQuoteConventionSet = new QuoteConventionSet([easternEuropeanQuoteConvention]); + 
Assert.IsFalse(easternEuropeanQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("”")); + Assert.IsFalse(easternEuropeanQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("„")); + Assert.IsFalse(easternEuropeanQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("’")); + Assert.IsFalse(easternEuropeanQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("‚")); + + var multipleQuoteConventionSet = new QuoteConventionSet( + [ + standardEnglishQuoteConvention, + typewriterEnglishQuoteConvention, + centralEuropeanQuoteConvention, + standardSwedishQuoteConvention, + easternEuropeanQuoteConvention, + ] + ); + Assert.IsTrue(multipleQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("\"")); + Assert.IsTrue(multipleQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("'")); + Assert.IsTrue(multipleQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("”")); + Assert.IsTrue(multipleQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("’")); + Assert.IsFalse(multipleQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("„")); + Assert.IsFalse(multipleQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("‚")); + + // these are unambiguous because they are never the opening and closing in the same convention + Assert.IsFalse(multipleQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("“")); + Assert.IsFalse(multipleQuoteConventionSet.IsQuotationMarkDirectionAmbiguous("‘")); + } + + [Test] + public void GetPossiblePairedQuotationMarks() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var centralEuropeanQuoteConvention = new QuoteConvention( + "central_european", + [ + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + new 
SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + ] + ); + + var standardSwedishQuoteConvention = new QuoteConvention( + "standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ); + + var easternEuropeanQuoteConvention = new QuoteConvention( + "eastern_european", + [ + new SingleLevelQuoteConvention("\u201e", "\u201d"), + new SingleLevelQuoteConvention("\u201a", "\u2019"), + new SingleLevelQuoteConvention("\u201e", "\u201d"), + new SingleLevelQuoteConvention("\u201a", "\u2019"), + ] + ); + + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + Assert.That(standardEnglishQuoteConventionSet.GetPossiblePairedQuotationMarks("“").SequenceEqual(["”"])); + Assert.That(standardEnglishQuoteConventionSet.GetPossiblePairedQuotationMarks("”").SequenceEqual(["“"])); + Assert.That(standardEnglishQuoteConventionSet.GetPossiblePairedQuotationMarks("‘").SequenceEqual(["’"])); + Assert.That(standardEnglishQuoteConventionSet.GetPossiblePairedQuotationMarks("’").SequenceEqual(["‘"])); + + var centralEuropeanQuoteConventionSet = new QuoteConventionSet([centralEuropeanQuoteConvention]); + Assert.That(centralEuropeanQuoteConventionSet.GetPossiblePairedQuotationMarks("„").SequenceEqual(["“"])); + Assert.That(centralEuropeanQuoteConventionSet.GetPossiblePairedQuotationMarks("“").SequenceEqual(["„"])); + Assert.That(centralEuropeanQuoteConventionSet.GetPossiblePairedQuotationMarks("‚").SequenceEqual(["‘"])); + Assert.That(centralEuropeanQuoteConventionSet.GetPossiblePairedQuotationMarks("‘").SequenceEqual(["‚"])); + + var standardSwedishQuoteConventionSet = new QuoteConventionSet([standardSwedishQuoteConvention]); + 
Assert.That(standardSwedishQuoteConventionSet.GetPossiblePairedQuotationMarks("”").SequenceEqual(["”"])); + Assert.That(standardSwedishQuoteConventionSet.GetPossiblePairedQuotationMarks("’").SequenceEqual(["’"])); + + var easternEuropeanQuoteConventionSet = new QuoteConventionSet([easternEuropeanQuoteConvention]); + Assert.That(easternEuropeanQuoteConventionSet.GetPossiblePairedQuotationMarks("„").SequenceEqual(["”"])); + Assert.That(easternEuropeanQuoteConventionSet.GetPossiblePairedQuotationMarks("”").SequenceEqual(["„"])); + Assert.That(easternEuropeanQuoteConventionSet.GetPossiblePairedQuotationMarks("‚").SequenceEqual(["’"])); + Assert.That(easternEuropeanQuoteConventionSet.GetPossiblePairedQuotationMarks("’").SequenceEqual(["‚"])); + + var multipleQuoteConventionSet = new QuoteConventionSet( + [ + standardEnglishQuoteConvention, + centralEuropeanQuoteConvention, + standardSwedishQuoteConvention, + easternEuropeanQuoteConvention, + ] + ); + Assert.That(multipleQuoteConventionSet.GetPossiblePairedQuotationMarks("“").SequenceEqual(["”", "„"])); + Assert.That(multipleQuoteConventionSet.GetPossiblePairedQuotationMarks("”").SequenceEqual(["”", "“", "„"])); + Assert.That(multipleQuoteConventionSet.GetPossiblePairedQuotationMarks("‘").SequenceEqual(["’", "‚"])); + Assert.That(multipleQuoteConventionSet.GetPossiblePairedQuotationMarks("’").SequenceEqual(["’", "‘", "‚"])); + Assert.That(multipleQuoteConventionSet.GetPossiblePairedQuotationMarks("„").SequenceEqual(["“", "”"])); + Assert.That(multipleQuoteConventionSet.GetPossiblePairedQuotationMarks("‚").SequenceEqual(["‘", "’"])); + } + + [Test] + public void GetPossibleDepths() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var 
britishEnglishQuoteConvention = new QuoteConvention( + "british_english", + [ + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + ] + ); + + var normalizedWesternEuropeanQuoteConvention = new QuoteConvention( + "westernEuropeanNormalized", + [ + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + ] + ); + + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + Assert.That( + standardEnglishQuoteConventionSet + .GetPossibleDepths("\u201c", QuotationMarkDirection.Opening) + .SequenceEqual([1, 3]) + ); + Assert.That( + standardEnglishQuoteConventionSet.GetPossibleDepths("\u201c", QuotationMarkDirection.Closing), + Has.Count.EqualTo(0) + ); + Assert.That( + standardEnglishQuoteConventionSet + .GetPossibleDepths("\u201d", QuotationMarkDirection.Closing) + .SequenceEqual([1, 3]) + ); + Assert.That( + standardEnglishQuoteConventionSet.GetPossibleDepths("\u201d", QuotationMarkDirection.Opening), + Has.Count.EqualTo(0) + ); + Assert.That( + standardEnglishQuoteConventionSet + .GetPossibleDepths("\u2018", QuotationMarkDirection.Opening) + .SequenceEqual([2, 4]) + ); + Assert.That( + standardEnglishQuoteConventionSet.GetPossibleDepths("\u2018", QuotationMarkDirection.Closing), + Has.Count.EqualTo(0) + ); + Assert.That( + standardEnglishQuoteConventionSet + .GetPossibleDepths("\u2019", QuotationMarkDirection.Closing) + .SequenceEqual([2, 4]) + ); + Assert.That( + standardEnglishQuoteConventionSet.GetPossibleDepths("\u2019", QuotationMarkDirection.Opening), + Has.Count.EqualTo(0) + ); + Assert.That( + standardEnglishQuoteConventionSet.GetPossibleDepths("\u201e", QuotationMarkDirection.Opening), + Has.Count.EqualTo(0) + ); + Assert.That( + 
standardEnglishQuoteConventionSet.GetPossibleDepths("\u201e", QuotationMarkDirection.Closing), + Has.Count.EqualTo(0) + ); + Assert.That( + standardEnglishQuoteConventionSet.GetPossibleDepths("\"", QuotationMarkDirection.Opening), + Has.Count.EqualTo(0) + ); + Assert.That( + standardEnglishQuoteConventionSet.GetPossibleDepths("\"", QuotationMarkDirection.Closing), + Has.Count.EqualTo(0) + ); + + var britishEnglishQuoteConventionSet = new QuoteConventionSet([britishEnglishQuoteConvention]); + Assert.That( + britishEnglishQuoteConventionSet + .GetPossibleDepths("\u2018", QuotationMarkDirection.Opening) + .SequenceEqual([1, 3]) + ); + Assert.That( + britishEnglishQuoteConventionSet.GetPossibleDepths("\u2018", QuotationMarkDirection.Closing), + Has.Count.EqualTo(0) + ); + Assert.That( + britishEnglishQuoteConventionSet + .GetPossibleDepths("\u2019", QuotationMarkDirection.Closing) + .SequenceEqual([1, 3]) + ); + Assert.That( + britishEnglishQuoteConventionSet.GetPossibleDepths("\u2019", QuotationMarkDirection.Opening), + Has.Count.EqualTo(0) + ); + Assert.That( + britishEnglishQuoteConventionSet + .GetPossibleDepths("\u201c", QuotationMarkDirection.Opening) + .SequenceEqual([2, 4]) + ); + Assert.That( + britishEnglishQuoteConventionSet.GetPossibleDepths("\u201c", QuotationMarkDirection.Closing), + Has.Count.EqualTo(0) + ); + Assert.That( + britishEnglishQuoteConventionSet + .GetPossibleDepths("\u201d", QuotationMarkDirection.Closing) + .SequenceEqual([2, 4]) + ); + Assert.That( + britishEnglishQuoteConventionSet.GetPossibleDepths("\u201d", QuotationMarkDirection.Opening), + Has.Count.EqualTo(0) + ); + Assert.That( + britishEnglishQuoteConventionSet.GetPossibleDepths("\u201e", QuotationMarkDirection.Opening), + Has.Count.EqualTo(0) + ); + Assert.That( + britishEnglishQuoteConventionSet.GetPossibleDepths("\u201e", QuotationMarkDirection.Closing), + Has.Count.EqualTo(0) + ); + Assert.That( + britishEnglishQuoteConventionSet.GetPossibleDepths("'", 
QuotationMarkDirection.Opening), + Has.Count.EqualTo(0) + ); + Assert.That( + britishEnglishQuoteConventionSet.GetPossibleDepths("'", QuotationMarkDirection.Closing), + Has.Count.EqualTo(0) + ); + + var normalizedWesternEuropeanQuoteConventionSet = new QuoteConventionSet( + [normalizedWesternEuropeanQuoteConvention] + ); + Assert.That( + normalizedWesternEuropeanQuoteConventionSet + .GetPossibleDepths("\"", QuotationMarkDirection.Opening) + .SequenceEqual([1, 2]) + ); + Assert.That( + normalizedWesternEuropeanQuoteConventionSet + .GetPossibleDepths("\"", QuotationMarkDirection.Closing) + .SequenceEqual([1, 2]) + ); + Assert.That( + normalizedWesternEuropeanQuoteConventionSet + .GetPossibleDepths("'", QuotationMarkDirection.Opening) + .SequenceEqual([3]) + ); + Assert.That( + normalizedWesternEuropeanQuoteConventionSet + .GetPossibleDepths("'", QuotationMarkDirection.Closing) + .SequenceEqual([3]) + ); + Assert.That( + normalizedWesternEuropeanQuoteConventionSet.GetPossibleDepths("\u201c", QuotationMarkDirection.Opening), + Has.Count.EqualTo(0) + ); + Assert.That( + normalizedWesternEuropeanQuoteConventionSet.GetPossibleDepths("\u201c", QuotationMarkDirection.Closing), + Has.Count.EqualTo(0) + ); + + var multipleQuoteConventionSet = new QuoteConventionSet( + [standardEnglishQuoteConvention, britishEnglishQuoteConvention, normalizedWesternEuropeanQuoteConvention,] + ); + Assert.That( + multipleQuoteConventionSet + .GetPossibleDepths("\u201c", QuotationMarkDirection.Opening) + .OrderBy(d => d) + .SequenceEqual([1, 2, 3, 4]) + ); + Assert.That( + multipleQuoteConventionSet.GetPossibleDepths("\u201c", QuotationMarkDirection.Closing), + Has.Count.EqualTo(0) + ); + Assert.That( + multipleQuoteConventionSet + .GetPossibleDepths("\u201d", QuotationMarkDirection.Closing) + .OrderBy(d => d) + .SequenceEqual([1, 2, 3, 4]) + ); + Assert.That( + multipleQuoteConventionSet.GetPossibleDepths("\u201d", QuotationMarkDirection.Opening), + Has.Count.EqualTo(0) + ); + Assert.That( + 
multipleQuoteConventionSet + .GetPossibleDepths("\u2018", QuotationMarkDirection.Opening) + .OrderBy(d => d) + .SequenceEqual([1, 2, 3, 4]) + ); + Assert.That( + multipleQuoteConventionSet.GetPossibleDepths("\u2018", QuotationMarkDirection.Closing), + Has.Count.EqualTo(0) + ); + Assert.That( + multipleQuoteConventionSet + .GetPossibleDepths("\u2019", QuotationMarkDirection.Closing) + .OrderBy(d => d) + .SequenceEqual([1, 2, 3, 4]) + ); + Assert.That( + multipleQuoteConventionSet.GetPossibleDepths("\u2019", QuotationMarkDirection.Opening), + Has.Count.EqualTo(0) + ); + Assert.That( + multipleQuoteConventionSet.GetPossibleDepths("\u201e", QuotationMarkDirection.Opening), + Has.Count.EqualTo(0) + ); + Assert.That( + multipleQuoteConventionSet.GetPossibleDepths("\u201e", QuotationMarkDirection.Closing), + Has.Count.EqualTo(0) + ); + Assert.That( + multipleQuoteConventionSet.GetPossibleDepths("\"", QuotationMarkDirection.Opening).SequenceEqual([1, 2]) + ); + Assert.That( + multipleQuoteConventionSet.GetPossibleDepths("\"", QuotationMarkDirection.Closing).SequenceEqual([1, 2]) + ); + Assert.That( + multipleQuoteConventionSet.GetPossibleDepths("'", QuotationMarkDirection.Opening).SequenceEqual([3]) + ); + Assert.That( + multipleQuoteConventionSet.GetPossibleDepths("'", QuotationMarkDirection.Closing).SequenceEqual([3]) + ); + } + + [Test] + public void DoesMetadataMatchQuotationMark() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + Assert.IsTrue( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201c", 1, QuotationMarkDirection.Opening) + ); + Assert.IsTrue( + 
standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201c", 3, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201c", 2, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201c", 4, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201c", 1, QuotationMarkDirection.Closing) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201c", 2, QuotationMarkDirection.Closing) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201c", 3, QuotationMarkDirection.Closing) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201c", 4, QuotationMarkDirection.Closing) + ); + Assert.IsTrue( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201d", 1, QuotationMarkDirection.Closing) + ); + Assert.IsTrue( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201d", 3, QuotationMarkDirection.Closing) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201d", 2, QuotationMarkDirection.Closing) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201d", 4, QuotationMarkDirection.Closing) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201d", 1, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201d", 2, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201d", 3, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201d", 4, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + 
standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u2018", 1, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u2018", 3, QuotationMarkDirection.Opening) + ); + Assert.IsTrue( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u2018", 2, QuotationMarkDirection.Opening) + ); + Assert.IsTrue( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u2018", 4, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u2018", 1, QuotationMarkDirection.Closing) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u2018", 2, QuotationMarkDirection.Closing) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u2018", 3, QuotationMarkDirection.Closing) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u2018", 4, QuotationMarkDirection.Closing) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u2019", 1, QuotationMarkDirection.Closing) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u2019", 3, QuotationMarkDirection.Closing) + ); + Assert.IsTrue( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u2019", 2, QuotationMarkDirection.Closing) + ); + Assert.IsTrue( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u2019", 4, QuotationMarkDirection.Closing) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u2019", 1, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u2019", 2, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u2019", 3, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + 
standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u2019", 4, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201e", 1, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201e", 1, QuotationMarkDirection.Closing) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201e", 2, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201e", 2, QuotationMarkDirection.Closing) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201e", 3, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201e", 3, QuotationMarkDirection.Closing) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201e", 4, QuotationMarkDirection.Opening) + ); + Assert.IsFalse( + standardEnglishQuoteConventionSet.MetadataMatchesQuotationMark("\u201e", 4, QuotationMarkDirection.Closing) + ); + } + + [Test] + public void FilterToCompatibleQuoteConventions() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var standardFrenchQuoteConvention = new QuoteConvention( + "standard_french", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + ] + ); + + var westernEuropeanQuoteConvention = new QuoteConvention( + "western_european", + [ + new SingleLevelQuoteConvention("\u00ab", 
"\u00bb"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var standardSwedishQuoteConvention = new QuoteConvention( + "standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ); + + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + Assert.That( + standardEnglishQuoteConventionSet + .FilterToCompatibleQuoteConventions(["\u201c"], ["\u201d"]) + .GetAllQuoteConventionNames() + .SequenceEqual(["standard_english"]) + ); + Assert.That( + standardEnglishQuoteConventionSet + .FilterToCompatibleQuoteConventions(["\u201c", "\u2018"], ["\u201d", "\u2019"]) + .GetAllQuoteConventionNames() + .SequenceEqual(["standard_english"]) + ); + Assert.That( + standardEnglishQuoteConventionSet + .FilterToCompatibleQuoteConventions(["\u201c", "\u2018"], ["\u201d"]) + .GetAllQuoteConventionNames() + .SequenceEqual(["standard_english"]) + ); + Assert.That( + standardEnglishQuoteConventionSet + .FilterToCompatibleQuoteConventions(["\u201c"], ["\u201d", "\u2019"]) + .GetAllQuoteConventionNames() + .SequenceEqual(["standard_english"]) + ); + Assert.That( + standardEnglishQuoteConventionSet + .FilterToCompatibleQuoteConventions(["\u2018"], ["\u201d"]) + .GetAllQuoteConventionNames(), + Has.Count.EqualTo(0) + ); + Assert.That( + standardEnglishQuoteConventionSet + .FilterToCompatibleQuoteConventions(["\u201c"], ["\u2019"]) + .GetAllQuoteConventionNames(), + Has.Count.EqualTo(0) + ); + Assert.That( + standardEnglishQuoteConventionSet + .FilterToCompatibleQuoteConventions(["\u201d"], ["\u201c"]) + .GetAllQuoteConventionNames(), + Has.Count.EqualTo(0) + ); + Assert.That( + standardEnglishQuoteConventionSet + .FilterToCompatibleQuoteConventions(["\u201c", "\u201d"], ["\u201d"]) + 
.GetAllQuoteConventionNames(), + Has.Count.EqualTo(0) + ); + Assert.That( + standardEnglishQuoteConventionSet + .FilterToCompatibleQuoteConventions(["\u201c", "\u201e"], ["\u201d"]) + .GetAllQuoteConventionNames(), + Has.Count.EqualTo(0) + ); + Assert.That( + standardEnglishQuoteConventionSet.FilterToCompatibleQuoteConventions([], []).GetAllQuoteConventionNames(), + Has.Count.EqualTo(0) + ); + + var multipleQuoteConventionSet = new QuoteConventionSet( + [ + standardEnglishQuoteConvention, + standardFrenchQuoteConvention, + westernEuropeanQuoteConvention, + standardSwedishQuoteConvention, + ] + ); + Assert.That( + multipleQuoteConventionSet + .FilterToCompatibleQuoteConventions(["\u201c"], ["\u201d"]) + .GetAllQuoteConventionNames() + .SequenceEqual(["standard_english"]) + ); + Assert.That( + multipleQuoteConventionSet + .FilterToCompatibleQuoteConventions(["\u201c", "\u2018"], ["\u201d", "\u2019"]) + .GetAllQuoteConventionNames() + .SequenceEqual(["standard_english"]) + ); + Assert.That( + multipleQuoteConventionSet + .FilterToCompatibleQuoteConventions(["\u201d"], ["\u201d"]) + .GetAllQuoteConventionNames() + .SequenceEqual(["standard_swedish"]) + ); + Assert.That( + multipleQuoteConventionSet + .FilterToCompatibleQuoteConventions(["\u201c"], ["\u201c"]) + .GetAllQuoteConventionNames(), + Has.Count.EqualTo(0) + ); + Assert.That( + multipleQuoteConventionSet + .FilterToCompatibleQuoteConventions(["\u00ab"], ["\u00bb"]) + .GetAllQuoteConventionNames() + .SequenceEqual(["standard_french", "western_european"]) + ); + Assert.That( + multipleQuoteConventionSet + .FilterToCompatibleQuoteConventions(["\u00ab", "\u2039"], ["\u00bb"]) + .GetAllQuoteConventionNames() + .SequenceEqual(["standard_french"]) + ); + Assert.That( + multipleQuoteConventionSet + .FilterToCompatibleQuoteConventions(["\u00ab"], ["\u00bb", "\u201d"]) + .GetAllQuoteConventionNames() + .SequenceEqual(["western_european"]) + ); + Assert.That( + 
multipleQuoteConventionSet.FilterToCompatibleQuoteConventions([], []).GetAllQuoteConventionNames(), + Has.Count.EqualTo(0) + ); + } + + [Test] + public void FindMostSimilarConvention() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var standardFrenchQuoteConvention = new QuoteConvention( + "standard_french", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + ] + ); + + var westernEuropeanQuoteConvention = new QuoteConvention( + "western_european", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var allThreeQuoteConventionSet = new QuoteConventionSet( + [standardEnglishQuoteConvention, standardFrenchQuoteConvention, westernEuropeanQuoteConvention,] + ); + var twoFrenchQuoteConventionSet = new QuoteConventionSet( + [westernEuropeanQuoteConvention, standardFrenchQuoteConvention] + ); + + var multipleEnglishQuotesTabulator = new QuotationMarkTabulator(); + multipleEnglishQuotesTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u2018", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 5, + 6 + ), + new QuotationMarkMetadata( + "\u2019", + 2, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 13, + 14 + ), + new QuotationMarkMetadata( + "\u201d", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 
14, + 15 + ), + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 28, + 29 + ), + new QuotationMarkMetadata( + "\u201d", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 42, + 43 + ), + ] + ); + Assert.That( + allThreeQuoteConventionSet.FindMostSimilarConvention(multipleEnglishQuotesTabulator), + Is.EqualTo((standardEnglishQuoteConvention, 1.0)) + ); + + var multipleWesternEuropeanQuotesTabulator = new QuotationMarkTabulator(); + multipleWesternEuropeanQuotesTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u00ab", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u201c", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 5, + 6 + ), + new QuotationMarkMetadata( + "\u201d", + 2, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 13, + 14 + ), + new QuotationMarkMetadata( + "\u00bb", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 14, + 15 + ), + new QuotationMarkMetadata( + "\u00ab", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 28, + 29 + ), + new QuotationMarkMetadata( + "\u00bb", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 42, + 43 + ), + ] + ); + Assert.That( + allThreeQuoteConventionSet.FindMostSimilarConvention(multipleWesternEuropeanQuotesTabulator), + Is.EqualTo((westernEuropeanQuoteConvention, 1.0)) + ); + + var multipleFrenchQuotesTabulator = new QuotationMarkTabulator(); + multipleFrenchQuotesTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u00ab", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u2039", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 5, + 6 + ), + new QuotationMarkMetadata( + "\u203a", + 2, + 
QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 13, + 14 + ), + new QuotationMarkMetadata( + "\u00bb", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 14, + 15 + ), + new QuotationMarkMetadata( + "\u00ab", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 28, + 29 + ), + new QuotationMarkMetadata( + "\u00bb", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 42, + 43 + ), + ] + ); + Assert.That( + allThreeQuoteConventionSet.FindMostSimilarConvention(multipleFrenchQuotesTabulator), + Is.EqualTo((standardFrenchQuoteConvention, 1.0)) + ); + Assert.That( + twoFrenchQuoteConventionSet.FindMostSimilarConvention(multipleFrenchQuotesTabulator), + Is.EqualTo((standardFrenchQuoteConvention, 1.0)) + ); + + var noisyMultipleEnglishQuotesTabulator = new QuotationMarkTabulator(); + noisyMultipleEnglishQuotesTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u201c", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 5, + 6 + ), + new QuotationMarkMetadata( + "\u2019", + 2, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 13, + 14 + ), + new QuotationMarkMetadata( + "\u201d", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 14, + 15 + ), + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 28, + 29 + ), + new QuotationMarkMetadata( + "\u201d", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 42, + 43 + ), + ] + ); + var (convention, similarity) = allThreeQuoteConventionSet.FindMostSimilarConvention( + noisyMultipleEnglishQuotesTabulator + ); + Assert.That(convention, Is.EqualTo(standardEnglishQuoteConvention)); + Assert.That(similarity, Is.EqualTo(0.9).Within(1e-9)); 
+ (convention, similarity) = twoFrenchQuoteConventionSet.FindMostSimilarConvention( + noisyMultipleEnglishQuotesTabulator + ); + Assert.That(convention, Is.EqualTo(westernEuropeanQuoteConvention)); + Assert.That(similarity, Is.EqualTo(0.1).Within(1e-9)); + + var noisyMultipleFrenchQuotesTabulator = new QuotationMarkTabulator(); + noisyMultipleFrenchQuotesTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u00ab", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u2039", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 5, + 6 + ), + new QuotationMarkMetadata( + "\u203a", + 2, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 13, + 14 + ), + new QuotationMarkMetadata( + "\u2039", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 5, + 6 + ), + new QuotationMarkMetadata( + "\u2019", + 2, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 13, + 14 + ), + new QuotationMarkMetadata( + "\u00bb", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 14, + 15 + ), + new QuotationMarkMetadata( + "\u00ab", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 28, + 29 + ), + new QuotationMarkMetadata( + "\u00bb", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 42, + 43 + ), + ] + ); + (convention, similarity) = allThreeQuoteConventionSet.FindMostSimilarConvention( + noisyMultipleFrenchQuotesTabulator + ); + Assert.That(convention, Is.EqualTo(standardFrenchQuoteConvention)); + Assert.That(similarity, Is.EqualTo(0.916666666666).Within(1e-9)); + + var tooDeepEnglishQuotesTabulator = new QuotationMarkTabulator(); + tooDeepEnglishQuotesTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u2018", 
+ 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 5, + 6 + ), + new QuotationMarkMetadata( + "\u201c", + 3, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 13, + 14 + ), + new QuotationMarkMetadata( + "\u2018", + 4, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 15, + 16 + ), + new QuotationMarkMetadata( + "\u201c", + 5, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 17, + 18 + ), + ] + ); + (convention, similarity) = allThreeQuoteConventionSet.FindMostSimilarConvention(tooDeepEnglishQuotesTabulator); + Assert.That(convention, Is.EqualTo(standardEnglishQuoteConvention)); + Assert.That(similarity, Is.EqualTo(0.967741935483871).Within(1e-9)); + + // in case of ties, the earlier convention in the list should be returned + var unknownQuoteTabulator = new QuotationMarkTabulator(); + unknownQuoteTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u201a", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ) + ] + ); + Assert.That( + allThreeQuoteConventionSet.FindMostSimilarConvention(unknownQuoteTabulator), + Is.EqualTo((standardEnglishQuoteConvention, 0.0)) + ); + + var singleFrenchOpeningQuoteTabulator = new QuotationMarkTabulator(); + singleFrenchOpeningQuoteTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u00ab", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ) + ] + ); + Assert.That( + allThreeQuoteConventionSet.FindMostSimilarConvention(singleFrenchOpeningQuoteTabulator), + Is.EqualTo((standardFrenchQuoteConvention, 1.0)) + ); + Assert.That( + twoFrenchQuoteConventionSet.FindMostSimilarConvention(singleFrenchOpeningQuoteTabulator), + Is.EqualTo((westernEuropeanQuoteConvention, 1.0)) + ); + + // Default values should be returned when the QuoteConventionSet is empty + var singleEnglishOpeningQuoteTabulator = new QuotationMarkTabulator(); + 
singleEnglishOpeningQuoteTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ) + ] + ); + var emptyQuoteConventionSet = new QuoteConventionSet([]); + Assert.That( + emptyQuoteConventionSet.FindMostSimilarConvention(singleEnglishOpeningQuoteTabulator), + Is.EqualTo(((QuoteConvention?)null, double.MinValue)) + ); + } + + private class QuotationMarkPairMapEqualityComparer : IEqualityComparer>> + { + public bool Equals(KeyValuePair> x, KeyValuePair> y) + { + return x.Key == y.Key && x.Value.Count == y.Value.Count && !x.Value.Except(y.Value).Any(); + } + + public int GetHashCode([DisallowNull] KeyValuePair> obj) + { + return obj.GetHashCode(); + } + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionTests.cs new file mode 100644 index 00000000..a6c11005 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionTests.cs @@ -0,0 +1,442 @@ +using NUnit.Framework; + +namespace SIL.Machine.Corpora.PunctuationAnalysis; + +[TestFixture] +public class QuoteConventionTests +{ + [Test] + public void SingleLevelQuoteConventionNormalize() + { + var englishLevel1QuoteConvention = new SingleLevelQuoteConvention("\u201c", "\u201d"); + var normalizedEnglishLevel1QuoteConvention = englishLevel1QuoteConvention.Normalize(); + Assert.That(normalizedEnglishLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); + Assert.That(normalizedEnglishLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); + + var englishLevel2QuoteConvention = new SingleLevelQuoteConvention("\u2018", "\u2019"); + var normalizedEnglishLevel2QuoteConvention = englishLevel2QuoteConvention.Normalize(); + Assert.That(normalizedEnglishLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("'")); + Assert.That(normalizedEnglishLevel2QuoteConvention.ClosingQuotationMark, 
Is.EqualTo("'")); + + var alreadyNormalizedEnglishLevel1QuoteConvention = new SingleLevelQuoteConvention("\"", "\""); + var doublyNormalizedEnglishLevel1QuoteConvention = alreadyNormalizedEnglishLevel1QuoteConvention.Normalize(); + Assert.That(doublyNormalizedEnglishLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); + Assert.That(doublyNormalizedEnglishLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); + + var alreadyNormalizedEnglishLevel2QuoteConvention = new SingleLevelQuoteConvention("'", "'"); + var doublyNormalizedEnglishLevel2QuoteConvention = alreadyNormalizedEnglishLevel2QuoteConvention.Normalize(); + Assert.That(doublyNormalizedEnglishLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("'")); + Assert.That(doublyNormalizedEnglishLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("'")); + + var frenchLevel1QuoteConvention = new SingleLevelQuoteConvention("\u00ab", "\u00bb"); + var normalizedFrenchLevel1QuoteConvention = frenchLevel1QuoteConvention.Normalize(); + Assert.That(normalizedFrenchLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); + Assert.That(normalizedFrenchLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); + + var frenchLevel2QuoteConvention = new SingleLevelQuoteConvention("\u2039", "\u203a"); + var normalizedFrenchLevel2QuoteConvention = frenchLevel2QuoteConvention.Normalize(); + Assert.That(normalizedFrenchLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("\u2039")); + Assert.That(normalizedFrenchLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("\u203a")); + + var typewriterFrenchLevel1QuoteConvention = new SingleLevelQuoteConvention("<<", ">>"); + var normalizedTypewriterFrenchLevel1QuoteConvention = typewriterFrenchLevel1QuoteConvention.Normalize(); + Assert.That(normalizedTypewriterFrenchLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("<<")); + Assert.That(normalizedTypewriterFrenchLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo(">>")); + + var 
typewriterFrenchLevel2QuoteConvention = new SingleLevelQuoteConvention("<", ">"); + var normalizedTypewriterFrenchLevel2QuoteConvention = typewriterFrenchLevel2QuoteConvention.Normalize(); + Assert.That(normalizedTypewriterFrenchLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("<")); + Assert.That(normalizedTypewriterFrenchLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo(">")); + + var centralEuropeanLevel1QuoteConvention = new SingleLevelQuoteConvention("\u201e", "\u201c"); + var normalizedCentralEuropeanLevel1QuoteConvention = centralEuropeanLevel1QuoteConvention.Normalize(); + Assert.That(normalizedCentralEuropeanLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); + Assert.That(normalizedCentralEuropeanLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); + + var centralEuropeanLevel2QuoteConvention = new SingleLevelQuoteConvention("\u201a", "\u2018"); + var normalizedCentralEuropeanLevel2QuoteConvention = centralEuropeanLevel2QuoteConvention.Normalize(); + Assert.That(normalizedCentralEuropeanLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("'")); + Assert.That(normalizedCentralEuropeanLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("'")); + + var centralEuropeanGuillemetsQuoteConvention = new SingleLevelQuoteConvention("\u00bb", "\u00ab"); + var normalizedCentralEuropeanGuillemetsQuoteConvention = centralEuropeanGuillemetsQuoteConvention.Normalize(); + Assert.That(normalizedCentralEuropeanGuillemetsQuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); + Assert.That(normalizedCentralEuropeanGuillemetsQuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); + + var swedishLevel1QuoteConvention = new SingleLevelQuoteConvention("\u201d", "\u201d"); + var normalizedSwedishLevel1QuoteConvention = swedishLevel1QuoteConvention.Normalize(); + Assert.That(normalizedSwedishLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); + Assert.That(normalizedSwedishLevel1QuoteConvention.ClosingQuotationMark, 
Is.EqualTo("\"")); + + var swedishLevel2QuoteConvention = new SingleLevelQuoteConvention("\u2019", "\u2019"); + var normalizedSwedishLevel2QuoteConvention = swedishLevel2QuoteConvention.Normalize(); + Assert.That(normalizedSwedishLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("'")); + Assert.That(normalizedSwedishLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("'")); + + var finnishLevel1QuoteConvention = new SingleLevelQuoteConvention("\u00bb", "\u00bb"); + var normalizedFinnishLevel1QuoteConvention = finnishLevel1QuoteConvention.Normalize(); + Assert.That(normalizedFinnishLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); + Assert.That(normalizedFinnishLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); + + var arabicLevel1QuoteConvention = new SingleLevelQuoteConvention("\u201d", "\u201c"); + var normalizedArabicLevel1QuoteConvention = arabicLevel1QuoteConvention.Normalize(); + Assert.That(normalizedArabicLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); + Assert.That(normalizedArabicLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); + + var arabicLevel2QuoteConvention = new SingleLevelQuoteConvention("\u2019", "\u2018"); + var normalizedArabicLevel2QuoteConvention = arabicLevel2QuoteConvention.Normalize(); + Assert.That(normalizedArabicLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("'")); + Assert.That(normalizedArabicLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("'")); + } + + [Test] + public void GetNumLevels() + { + var emptyQuoteConvention = new QuoteConvention("empty-quote-convention", []); + Assert.That(emptyQuoteConvention.NumLevels, Is.EqualTo(0)); + + var oneLevelQuoteConvention = new QuoteConvention( + "one-level-quote-convention", + [new SingleLevelQuoteConvention("\u201c", "\u201d")] + ); + Assert.That(oneLevelQuoteConvention.NumLevels, Is.EqualTo(1)); + + var twoLevelQuoteConvention = new QuoteConvention( + "two-level-quote-convention", + [new 
SingleLevelQuoteConvention("\u201c", "\u201d"), new SingleLevelQuoteConvention("\u2018", "\u2019"),] + ); + Assert.That(twoLevelQuoteConvention.NumLevels, Is.EqualTo(2)); + + var threeLevelQuoteConvention = new QuoteConvention( + "three-level-quote-convention", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201D", "\u201D"), + ] + ); + Assert.That(threeLevelQuoteConvention.NumLevels, Is.EqualTo(3)); + } + + [Test] + public void GetOpeningQuoteAtLevel() + { + var quoteConvention = new QuoteConvention( + "test-quote-convention", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ] + ); + Assert.That(quoteConvention.GetOpeningQuotationMarkAtDepth(1), Is.EqualTo("\u201c")); + Assert.That(quoteConvention.GetOpeningQuotationMarkAtDepth(2), Is.EqualTo("\u2018")); + Assert.That(quoteConvention.GetOpeningQuotationMarkAtDepth(3), Is.EqualTo("\u00ab")); + } + + [Test] + public void GetClosingQuoteAtLevel() + { + var quoteConvention = new QuoteConvention( + "test-quote-convention", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ] + ); + Assert.That(quoteConvention.GetClosingQuotationMarkAtDepth(1), Is.EqualTo("\u201d")); + Assert.That(quoteConvention.GetClosingQuotationMarkAtDepth(2), Is.EqualTo("\u2019")); + Assert.That(quoteConvention.GetClosingQuotationMarkAtDepth(3), Is.EqualTo("\u00bb")); + } + + [Test] + public void GetExpectedQuotationMark() + { + var quoteConvention = new QuoteConvention( + "test-quote-convention", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ] + ); + 
Assert.That(quoteConvention.GetExpectedQuotationMark(1, QuotationMarkDirection.Opening), Is.EqualTo("\u201c")); + Assert.That(quoteConvention.GetExpectedQuotationMark(1, QuotationMarkDirection.Closing), Is.EqualTo("\u201d")); + Assert.That(quoteConvention.GetExpectedQuotationMark(2, QuotationMarkDirection.Opening), Is.EqualTo("\u2018")); + Assert.That(quoteConvention.GetExpectedQuotationMark(2, QuotationMarkDirection.Closing), Is.EqualTo("\u2019")); + Assert.That(quoteConvention.GetExpectedQuotationMark(3, QuotationMarkDirection.Opening), Is.EqualTo("\u00ab")); + Assert.That(quoteConvention.GetExpectedQuotationMark(3, QuotationMarkDirection.Closing), Is.EqualTo("\u00bb")); + Assert.That(quoteConvention.GetExpectedQuotationMark(4, QuotationMarkDirection.Opening), Is.EqualTo("")); + Assert.That(quoteConvention.GetExpectedQuotationMark(4, QuotationMarkDirection.Closing), Is.EqualTo("")); + Assert.That(quoteConvention.GetExpectedQuotationMark(0, QuotationMarkDirection.Opening), Is.EqualTo("")); + Assert.That(quoteConvention.GetExpectedQuotationMark(0, QuotationMarkDirection.Closing), Is.EqualTo("")); + } + + [Test] + public void IncludesOpeningQuotationMark() + { + var emptyQuoteConvention = new QuoteConvention("empty quote convention", []); + Assert.IsFalse(emptyQuoteConvention.IncludesOpeningQuotationMark("\u201c")); + + var positiveQuoteConvention1 = new QuoteConvention( + "positive quote convention 1", + [new SingleLevelQuoteConvention("\u201c", "\u201d")] + ); + Assert.IsTrue(positiveQuoteConvention1.IncludesOpeningQuotationMark("\u201c")); + + var negativeQuoteConvention1 = new QuoteConvention( + "negative quote convention 1", + [new SingleLevelQuoteConvention("\u2018", "\u2019")] + ); + Assert.IsFalse(negativeQuoteConvention1.IncludesOpeningQuotationMark("\u201c")); + + var negativeQuoteConvention2 = new QuoteConvention( + "negative quote convention 2", + [new SingleLevelQuoteConvention("\u201d", "\u201c")] + ); + 
Assert.IsFalse(negativeQuoteConvention2.IncludesOpeningQuotationMark("\u201c")); + + var positiveQuoteConvention2 = new QuoteConvention( + "positive quote convention 2", + [new SingleLevelQuoteConvention("\u201c", "\u201d"), new SingleLevelQuoteConvention("\u2018", "\u2019")] + ); + Assert.IsTrue(positiveQuoteConvention2.IncludesOpeningQuotationMark("\u201c")); + + var positiveQuoteConvention3 = new QuoteConvention( + "positive quote convention 3", + [new SingleLevelQuoteConvention("\u2018", "\u2019"), new SingleLevelQuoteConvention("\u201c", "\u201d")] + ); + Assert.IsTrue(positiveQuoteConvention3.IncludesOpeningQuotationMark("\u201c")); + + var negativeQuoteConvention3 = new QuoteConvention( + "negative quote convention 3", + [ + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ] + ); + Assert.IsFalse(negativeQuoteConvention3.IncludesOpeningQuotationMark("\u201c")); + } + + [Test] + public void IncludesClosingQuotationMark() + { + var emptyQuoteConvention = new QuoteConvention("empty quote convention", []); + Assert.IsFalse(emptyQuoteConvention.IncludesClosingQuotationMark("\u201d")); + + var positiveQuoteConvention1 = new QuoteConvention( + "positive quote convention 1", + [new SingleLevelQuoteConvention("\u201c", "\u201d")] + ); + Assert.IsTrue(positiveQuoteConvention1.IncludesClosingQuotationMark("\u201d")); + + var negativeQuoteConvention1 = new QuoteConvention( + "negative quote convention 1", + [new SingleLevelQuoteConvention("\u2018", "\u2019")] + ); + Assert.IsFalse(negativeQuoteConvention1.IncludesClosingQuotationMark("\u201d")); + + var negativeQuoteConvention2 = new QuoteConvention( + "negative quote convention 2", + [new SingleLevelQuoteConvention("\u201d", "\u201c")] + ); + Assert.IsFalse(negativeQuoteConvention2.IncludesClosingQuotationMark("\u201d")); + + var positiveQuoteConvention2 = new QuoteConvention( + "positive quote convention 2", + [new 
SingleLevelQuoteConvention("\u201c", "\u201d"), new SingleLevelQuoteConvention("\u2018", "\u2019")] + ); + Assert.IsTrue(positiveQuoteConvention2.IncludesClosingQuotationMark("\u201d")); + + var positiveQuoteConvention3 = new QuoteConvention( + "positive quote convention 3", + [new SingleLevelQuoteConvention("\u2018", "\u2019"), new SingleLevelQuoteConvention("\u201c", "\u201d")] + ); + Assert.IsTrue(positiveQuoteConvention3.IncludesClosingQuotationMark("\u201d")); + + var negativeQuoteConvention3 = new QuoteConvention( + "negative quote convention 3", + [ + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ] + ); + Assert.IsFalse(negativeQuoteConvention3.IncludesClosingQuotationMark("\u201d")); + } + + [Test] + public void GetPossibleDepths() + { + var quoteConvention = new QuoteConvention( + "test-quote-convention", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + Assert.That(quoteConvention.GetPossibleDepths("\u201c", QuotationMarkDirection.Opening).SequenceEqual([1, 3])); + Assert.That(quoteConvention.GetPossibleDepths("\u201c", QuotationMarkDirection.Closing), Has.Count.EqualTo(0)); + Assert.That(quoteConvention.GetPossibleDepths("\u2018", QuotationMarkDirection.Opening).SequenceEqual([2, 4])); + Assert.That(quoteConvention.GetPossibleDepths("\u2018", QuotationMarkDirection.Closing), Has.Count.EqualTo(0)); + Assert.That(quoteConvention.GetPossibleDepths("\u201d", QuotationMarkDirection.Opening), Has.Count.EqualTo(0)); + Assert.That(quoteConvention.GetPossibleDepths("\u201d", QuotationMarkDirection.Closing).SequenceEqual([1, 3])); + Assert.That(quoteConvention.GetPossibleDepths("\u2019", QuotationMarkDirection.Opening), Has.Count.EqualTo(0)); + 
Assert.That(quoteConvention.GetPossibleDepths("\u2019", QuotationMarkDirection.Closing).SequenceEqual([2, 4])); + Assert.That(quoteConvention.GetPossibleDepths("\u00ab", QuotationMarkDirection.Opening), Has.Count.EqualTo(0)); + Assert.That(quoteConvention.GetPossibleDepths("\u00ab", QuotationMarkDirection.Closing), Has.Count.EqualTo(0)); + } + + [Test] + public void IsCompatibleWithObservedQuotationMarks() + { + var quoteConvention = new QuoteConvention( + "test-quote-convention", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ] + ); + Assert.IsTrue( + quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c", "\u2018"], ["\u201d", "\u2019"]) + ); + Assert.IsTrue( + quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c", "\u00ab"], ["\u201d", "\u00bb"]) + ); + Assert.IsTrue(quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c"], ["\u201d", "\u2019"])); + Assert.IsTrue(quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c"], ["\u201d"])); + Assert.IsTrue( + quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c", "\u00ab"], ["\u201d", "\u2019"]) + ); + + Assert.IsFalse(quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201d", "\u2019"], ["\u201c"])); + + Assert.IsFalse(quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c", "\u201e"], ["\u201d"])); + + Assert.IsFalse( + quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c", "\u2018"], ["\u201d", "\u201f"]) + ); + + // must have observed the first-level quotes + Assert.IsFalse(quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u2018"], ["\u201d"])); + Assert.IsFalse(quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c", "\u2018"], ["\u00ab"])); + } + + [Test] + public void Normalize() + { + var emptyQuoteConvention = new QuoteConvention("empty-quote-convention", []); + var normalizedEmptyQuoteConvention = 
emptyQuoteConvention.Normalize(); + Assert.That(normalizedEmptyQuoteConvention.Name, Is.EqualTo("empty-quote-convention_normalized")); + Assert.That(normalizedEmptyQuoteConvention.NumLevels, Is.EqualTo(0)); + + var standardEnglishQuoteConvention = new QuoteConvention( + "standard-english-quote-convention", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var normalizedStandardEnglishQuoteConvention = standardEnglishQuoteConvention.Normalize(); + Assert.That( + normalizedStandardEnglishQuoteConvention.Name, + Is.EqualTo("standard-english-quote-convention_normalized") + ); + Assert.That(normalizedStandardEnglishQuoteConvention.NumLevels, Is.EqualTo(4)); + Assert.That(normalizedStandardEnglishQuoteConvention.GetOpeningQuotationMarkAtDepth(1), Is.EqualTo("\"")); + Assert.That(normalizedStandardEnglishQuoteConvention.GetClosingQuotationMarkAtDepth(1), Is.EqualTo("\"")); + Assert.That(normalizedStandardEnglishQuoteConvention.GetOpeningQuotationMarkAtDepth(2), Is.EqualTo("'")); + Assert.That(normalizedStandardEnglishQuoteConvention.GetClosingQuotationMarkAtDepth(2), Is.EqualTo("'")); + Assert.That(normalizedStandardEnglishQuoteConvention.GetOpeningQuotationMarkAtDepth(3), Is.EqualTo("\"")); + Assert.That(normalizedStandardEnglishQuoteConvention.GetClosingQuotationMarkAtDepth(3), Is.EqualTo("\"")); + Assert.That(normalizedStandardEnglishQuoteConvention.GetOpeningQuotationMarkAtDepth(4), Is.EqualTo("'")); + Assert.That(normalizedStandardEnglishQuoteConvention.GetClosingQuotationMarkAtDepth(4), Is.EqualTo("'")); + + var westernEuropeanQuoteConvention = new QuoteConvention( + "test-quote-convention", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var 
normalizedWesternEuropeanQuoteConvention = westernEuropeanQuoteConvention.Normalize(); + Assert.That(normalizedWesternEuropeanQuoteConvention.Name, Is.EqualTo("test-quote-convention_normalized")); + Assert.That(normalizedWesternEuropeanQuoteConvention.NumLevels, Is.EqualTo(3)); + Assert.That(normalizedWesternEuropeanQuoteConvention.GetOpeningQuotationMarkAtDepth(1), Is.EqualTo("\"")); + Assert.That(normalizedWesternEuropeanQuoteConvention.GetClosingQuotationMarkAtDepth(1), Is.EqualTo("\"")); + Assert.That(normalizedWesternEuropeanQuoteConvention.GetOpeningQuotationMarkAtDepth(2), Is.EqualTo("\"")); + Assert.That(normalizedWesternEuropeanQuoteConvention.GetClosingQuotationMarkAtDepth(2), Is.EqualTo("\"")); + Assert.That(normalizedWesternEuropeanQuoteConvention.GetOpeningQuotationMarkAtDepth(3), Is.EqualTo("'")); + Assert.That(normalizedWesternEuropeanQuoteConvention.GetClosingQuotationMarkAtDepth(3), Is.EqualTo("'")); + + var hybridBritishTypewriterEnglishQuoteConvention = new QuoteConvention( + "hybrid-british-typewriter-english-quote-convention", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + ] + ); + + var normalizedHybridBritishTypewriterEnglishQuoteConvention = ( + hybridBritishTypewriterEnglishQuoteConvention.Normalize() + ); + Assert.IsTrue( + normalizedHybridBritishTypewriterEnglishQuoteConvention.Name + == "hybrid-british-typewriter-english-quote-convention_normalized" + ); + Assert.That(normalizedHybridBritishTypewriterEnglishQuoteConvention.NumLevels, Is.EqualTo(3)); + Assert.That( + normalizedHybridBritishTypewriterEnglishQuoteConvention.GetOpeningQuotationMarkAtDepth(1), + Is.EqualTo("\"") + ); + Assert.That( + normalizedHybridBritishTypewriterEnglishQuoteConvention.GetClosingQuotationMarkAtDepth(1), + Is.EqualTo("\"") + ); + Assert.That( + normalizedHybridBritishTypewriterEnglishQuoteConvention.GetOpeningQuotationMarkAtDepth(2), + Is.EqualTo("'") + ); 
+ Assert.That( + normalizedHybridBritishTypewriterEnglishQuoteConvention.GetClosingQuotationMarkAtDepth(2), + Is.EqualTo("'") + ); + Assert.That( + normalizedHybridBritishTypewriterEnglishQuoteConvention.GetOpeningQuotationMarkAtDepth(3), + Is.EqualTo("\"") + ); + Assert.That( + normalizedHybridBritishTypewriterEnglishQuoteConvention.GetClosingQuotationMarkAtDepth(3), + Is.EqualTo("\"") + ); + } + + // [Test] + // public void PrintSummary() + // { + // var quoteConvention = new QuoteConvention( + // "test-quote-convention", //TODO why kebab case? + // [ + // new SingleLevelQuoteConvention("\u201c", "\u201D"), + // new SingleLevelQuoteConvention("\u2018", "\u2019"), + // new SingleLevelQuoteConvention("\u201D", "\u201D"), + // ] + // ); + // var expectedSummaryMessage = ( + // "test-quote-convention\n" + // + "\u201CFirst-level quote\u201D\n" + // + "\u2018Second-level quote\u2019\n" + // + "\u201DThird-level quote\u201D\n" + // ); + // Assert.That(quoteConvention.ToString(), Is.EqualTo(expectedSummaryMessage)); + // } +} diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/TextSegmentTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/TextSegmentTests.cs new file mode 100644 index 00000000..ec76932f --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/TextSegmentTests.cs @@ -0,0 +1,305 @@ +using NUnit.Framework; + +namespace SIL.Machine.Corpora.PunctuationAnalysis; + +[TestFixture] +public class TextSegmentTests +{ + [Test] + public void BuilderInitialization() + { + var builder = new TextSegment.Builder(); + var textSegment = builder.Build(); + + Assert.That(textSegment.Text, Is.EqualTo("")); + Assert.IsNull(textSegment.PreviousSegment); + Assert.IsNull(textSegment.NextSegment); + Assert.IsTrue(textSegment.ImmediatePrecedingMarker is UsfmMarkerType.NoMarker); + Assert.That(textSegment.MarkersInPrecedingContext, Has.Count.EqualTo(0)); + Assert.That(textSegment.IndexInVerse, Is.EqualTo(0)); + 
Assert.That(textSegment.NumSegmentsInVerse, Is.EqualTo(0)); + Assert.IsNull(textSegment.UsfmToken); + } + + [Test] + public void BuilderSetText() + { + var builder = new TextSegment.Builder(); + var text = "Example text"; + builder.SetText(text); + + Assert.That(builder.Build().Text, Is.EqualTo(text)); + } + + [Test] + public void BuilderSetPreviousSegment() + { + var builder = new TextSegment.Builder(); + var previousSegment = new TextSegment.Builder().SetText("previous segment text").Build(); + builder.SetPreviousSegment(previousSegment); + var textSegment = builder.Build(); + + Assert.That(textSegment.PreviousSegment, Is.EqualTo(previousSegment)); + Assert.IsNull(textSegment.NextSegment); + Assert.IsTrue(textSegment.ImmediatePrecedingMarker is UsfmMarkerType.NoMarker); + Assert.That(textSegment.MarkersInPrecedingContext, Has.Count.EqualTo(0)); + Assert.That(textSegment.IndexInVerse, Is.EqualTo(0)); + Assert.That(textSegment.NumSegmentsInVerse, Is.EqualTo(0)); + } + + [Test] + public void BuilderAddPrecedingMarker() + { + var builder = new TextSegment.Builder(); + builder.AddPrecedingMarker(UsfmMarkerType.Chapter); + var textSegment = builder.Build(); + + Assert.IsTrue(textSegment.ImmediatePrecedingMarker is UsfmMarkerType.Chapter); + Assert.That(textSegment.MarkersInPrecedingContext.SequenceEqual([UsfmMarkerType.Chapter])); + Assert.IsNull(textSegment.PreviousSegment); + Assert.IsNull(textSegment.NextSegment); + + builder.AddPrecedingMarker(UsfmMarkerType.Verse); + textSegment = builder.Build(); + + Assert.That(textSegment.ImmediatePrecedingMarker, Is.EqualTo(UsfmMarkerType.Verse)); + Assert.That( + textSegment.MarkersInPrecedingContext.SequenceEqual([UsfmMarkerType.Chapter, UsfmMarkerType.Verse,]) + ); + Assert.IsNull(textSegment.PreviousSegment); + Assert.IsNull(textSegment.NextSegment); + } + + [Test] + public void BuilderSetUsfmToken() + { + var builder = new TextSegment.Builder(); + builder.SetUsfmToken(new UsfmToken("USFM token text")); + var textSegment = 
builder.Build(); + + Assert.IsNotNull(textSegment.UsfmToken); + Assert.That(textSegment.UsfmToken.Type, Is.EqualTo(UsfmTokenType.Text)); + Assert.That(textSegment.UsfmToken.Text, Is.EqualTo("USFM token text")); + Assert.That(textSegment.Text, Is.EqualTo("")); + Assert.IsNull(textSegment.PreviousSegment); + Assert.IsNull(textSegment.NextSegment); + } + + [Test] + public void Equals() + { + var basicSegment = new TextSegment.Builder().SetText("text1").Build(); + var sameTextSegment = new TextSegment.Builder().SetText("text1").Build(); + var differentTextSegment = new TextSegment.Builder().SetText("different text").Build(); + + // Assert.That(basicSegment, Is.EqualTo(basicSegment)); //TODO fix + // Assert.That(basicSegment , Is.Not.EqualTo(new UsfmToken("text1"))); //TODO also here + Assert.That(basicSegment, Is.EqualTo(sameTextSegment)); + Assert.That(basicSegment, Is.Not.EqualTo(differentTextSegment)); + + var segmentWithIndex = new TextSegment.Builder().SetText("text1").Build(); + segmentWithIndex.IndexInVerse = 1; + var segmentWithSameIndex = new TextSegment.Builder().SetText("text1").Build(); + segmentWithSameIndex.IndexInVerse = 1; + var segmentWithDifferentIndex = new TextSegment.Builder().SetText("text1").Build(); + segmentWithDifferentIndex.IndexInVerse = 2; + + Assert.That(segmentWithIndex, Is.EqualTo(segmentWithSameIndex)); + Assert.IsTrue(segmentWithIndex != segmentWithDifferentIndex); + Assert.IsTrue(segmentWithIndex != basicSegment); + + var segmentWithPrecedingMarker = ( + new TextSegment.Builder().SetText("text1").AddPrecedingMarker(UsfmMarkerType.Verse).Build() + ); + var segmentWithSamePrecedingMarker = ( + new TextSegment.Builder().SetText("text1").AddPrecedingMarker(UsfmMarkerType.Verse).Build() + ); + var segmentWithDifferentPrecedingMarker = ( + new TextSegment.Builder().SetText("text1").AddPrecedingMarker(UsfmMarkerType.Chapter).Build() + ); + var segmentWithMultiplePrecedingMarkers = ( + new TextSegment.Builder() + .SetText("text1") + 
.AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build() + ); + + var usfmToken = new UsfmToken("USFM token text"); + var segmentWithUsfmToken = new TextSegment.Builder().SetText("text1").SetUsfmToken(usfmToken).Build(); + var segmentWithSameUsfmToken = new TextSegment.Builder().SetText("text1").SetUsfmToken(usfmToken).Build(); + var segmentWithDifferentUsfmToken = ( + new TextSegment.Builder().SetText("text1").SetUsfmToken(new UsfmToken("Different USFM token text")).Build() + ); + + Assert.That(segmentWithUsfmToken, Is.EqualTo(segmentWithSameUsfmToken)); + Assert.IsTrue(segmentWithUsfmToken != segmentWithDifferentUsfmToken); + Assert.IsTrue(basicSegment != segmentWithUsfmToken); + + // attributes that are not used in equality checks + var segmentWithNumVerses = new TextSegment.Builder().SetText("text1").Build(); + segmentWithNumVerses.NumSegmentsInVerse = 3; + var segmentWithSameNumVerses = new TextSegment.Builder().SetText("text1").Build(); + segmentWithSameNumVerses.NumSegmentsInVerse = 3; + var segmentWithDifferentNumVerses = new TextSegment.Builder().SetText("text1").Build(); + segmentWithDifferentNumVerses.NumSegmentsInVerse = 4; + + Assert.That(segmentWithNumVerses, Is.EqualTo(segmentWithSameNumVerses)); + Assert.IsTrue(segmentWithNumVerses != segmentWithDifferentNumVerses); + Assert.IsTrue(segmentWithNumVerses != basicSegment); + + Assert.That(segmentWithPrecedingMarker, Is.EqualTo(segmentWithSamePrecedingMarker)); + Assert.IsTrue(segmentWithPrecedingMarker != segmentWithDifferentPrecedingMarker); + Assert.That(segmentWithPrecedingMarker, Is.EqualTo(segmentWithMultiplePrecedingMarkers)); + Assert.IsTrue(segmentWithPrecedingMarker != basicSegment); + + var segmentWithPreviousSegment = new TextSegment.Builder().SetText("text1").Build(); + segmentWithPreviousSegment.PreviousSegment = segmentWithNumVerses; + + var segmentWithNextSegment = new TextSegment.Builder().SetText("text1").Build(); + 
segmentWithNextSegment.NextSegment = segmentWithNumVerses; + + Assert.That(basicSegment, Is.EqualTo(segmentWithPreviousSegment)); + Assert.That(basicSegment, Is.EqualTo(segmentWithNextSegment)); + } + + [Test] + public void GetText() + { + var textSegment = new TextSegment.Builder().SetText("example text").Build(); + Assert.That(textSegment.Text, Is.EqualTo("example text")); + + textSegment = new TextSegment.Builder().SetText("new example text").Build(); + Assert.That(textSegment.Text, Is.EqualTo("new example text")); + } + + [Test] + public void Length() + { + var textSegment = new TextSegment.Builder().SetText("example text").Build(); + Assert.That(textSegment.Length, Is.EqualTo("example text".Length)); + + textSegment = new TextSegment.Builder().SetText("new example text").Build(); + Assert.That(textSegment.Length, Is.EqualTo("new example text".Length)); + } + + [Test] + public void SubstringBefore() + { + var textSegment = new TextSegment.Builder().SetText("example text").Build(); + Assert.That(textSegment.SubstringBefore(7), Is.EqualTo("example")); + Assert.That(textSegment.SubstringBefore(8), Is.EqualTo("example ")); + Assert.That(textSegment.SubstringBefore(0), Is.EqualTo("")); + Assert.That(textSegment.SubstringBefore(12), Is.EqualTo("example text")); + } + + [Test] + public void SubstringAfter() + { + var textSegment = new TextSegment.Builder().SetText("example text").Build(); + Assert.That(textSegment.SubstringAfter(7), Is.EqualTo(" text")); + Assert.That(textSegment.SubstringAfter(8), Is.EqualTo("text")); + Assert.That(textSegment.SubstringAfter(0), Is.EqualTo("example text")); + Assert.That(textSegment.SubstringAfter(12), Is.EqualTo("")); + Assert.That(textSegment.SubstringAfter(11), Is.EqualTo("t")); + } + + [Test] + public void IsMarkerInPrecedingContext() + { + var noPrecedingMarkerSegment = new TextSegment.Builder().SetText("example text").Build(); + Assert.IsFalse(noPrecedingMarkerSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Chapter)); + 
Assert.IsFalse(noPrecedingMarkerSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse)); + Assert.IsFalse(noPrecedingMarkerSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Character)); + + var onePrecedingMarkerTextSegment = ( + new TextSegment.Builder().SetText("example text").AddPrecedingMarker(UsfmMarkerType.Character).Build() + ); + + Assert.IsTrue(onePrecedingMarkerTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Character)); + Assert.IsFalse(onePrecedingMarkerTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse)); + Assert.IsFalse(onePrecedingMarkerTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Chapter)); + + var twoPrecedingMarkersTextSegment = ( + new TextSegment.Builder() + .SetText("example text") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build() + ); + Assert.IsTrue(twoPrecedingMarkersTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Chapter)); + Assert.IsTrue(twoPrecedingMarkersTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse)); + Assert.IsFalse(twoPrecedingMarkersTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Character)); + + var threePrecedingMarkersTextSegment = ( + new TextSegment.Builder() + .SetText("example text") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Character) + .Build() + ); + Assert.IsTrue(threePrecedingMarkersTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Chapter)); + Assert.IsTrue(threePrecedingMarkersTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse)); + Assert.IsTrue(threePrecedingMarkersTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Character)); + } + + [Test] + public void IsFirstSegmentInVerse() + { + var textSegment = new TextSegment.Builder().SetText("example text").Build(); + textSegment.IndexInVerse = 0; + Assert.IsTrue(textSegment.IsFirstSegmentInVerse()); + + textSegment.IndexInVerse = 1; + 
Assert.IsFalse(textSegment.IsFirstSegmentInVerse()); + } + + [Test] + public void IsLastSegmentInVerse() + { + var textSegment = new TextSegment.Builder().SetText("example text").Build(); + textSegment.IndexInVerse = 0; + textSegment.NumSegmentsInVerse = 1; + Assert.IsTrue(textSegment.IsLastSegmentInVerse()); + + textSegment.IndexInVerse = 0; + textSegment.NumSegmentsInVerse = 2; + Assert.IsFalse(textSegment.IsLastSegmentInVerse()); + + textSegment.IndexInVerse = 1; + Assert.IsTrue(textSegment.IsLastSegmentInVerse()); + } + + [Test] + public void ReplaceSubstring() + { + var textSegment = new TextSegment.Builder().SetText("example text").Build(); + textSegment.ReplaceSubstring(0, 7, "sample"); + Assert.That(textSegment.Text, Is.EqualTo("sample text")); + + textSegment.ReplaceSubstring(7, 11, "text"); + Assert.That(textSegment.Text, Is.EqualTo("sample text")); + + textSegment.ReplaceSubstring(0, 7, ""); + Assert.That(textSegment.Text, Is.EqualTo("text")); + + textSegment.ReplaceSubstring(0, 4, "new'"); + Assert.That(textSegment.Text, Is.EqualTo("new'")); + + textSegment.ReplaceSubstring(3, 4, "\u2019"); + Assert.That(textSegment.Text, Is.EqualTo("new\u2019")); + + textSegment.ReplaceSubstring(0, 0, "prefix "); + Assert.That(textSegment.Text, Is.EqualTo("prefix new\u2019")); + + textSegment.ReplaceSubstring(0, 0, ""); + Assert.That(textSegment.Text, Is.EqualTo("prefix new\u2019")); + + textSegment.ReplaceSubstring(11, 11, " suffix"); + Assert.That(textSegment.Text, Is.EqualTo("prefix new\u2019 suffix")); + + textSegment.ReplaceSubstring(6, 6, "-"); + Assert.That(textSegment.Text, Is.EqualTo("prefix- new\u2019 suffix")); + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/UsfmStructureExtractorTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/UsfmStructureExtractorTests.cs new file mode 100644 index 00000000..042c5799 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/UsfmStructureExtractorTests.cs @@ -0,0 +1,496 
@@ +using NUnit.Framework; +using SIL.Scripture; + +namespace SIL.Machine.Corpora.PunctuationAnalysis; + +[TestFixture] +public class UsfmStructureExtractorTests +{ + private MockUsfmParserState _verseTextParserState; + + [SetUp] + public void SetUp() + { + _verseTextParserState = new MockUsfmParserState(new UsfmStylesheet("usfm.sty"), ScrVers.English, []); + _verseTextParserState.SetVerseNum(1); + } + + [Test] + public void ChapterAndVerseMarkers() + { + var usfmStructureExtractor = new UsfmStructureExtractor(); + usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.Text(_verseTextParserState, "test"); + + List expectedChapters = + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build() + ] + ) + ] + ) + ]; + + var actualChapters = usfmStructureExtractor.GetChapters(); + AssertChapterEqual(expectedChapters, actualChapters); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); + } + + [Test] + public void StartParagraphMarker() + { + var usfmStructureExtractor = new UsfmStructureExtractor(); + usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.StartPara(_verseTextParserState, "p", false, null); + usfmStructureExtractor.Text(_verseTextParserState, "test"); + + List expectedChapters = + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ] + ) + ] + ) + ]; + + var actualChapters = 
usfmStructureExtractor.GetChapters(); + AssertChapterEqual(expectedChapters, actualChapters); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); + } + + [Test] + public void StartCharacterMarker() + { + var usfmStructureExtractor = new UsfmStructureExtractor(); + usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.StartChar(_verseTextParserState, "k", false, null); + usfmStructureExtractor.Text(_verseTextParserState, "test"); + + List expectedChapters = + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Character) + .Build() + ] + ) + ] + ) + ]; + + var actualChapters = usfmStructureExtractor.GetChapters(); + AssertChapterEqual(expectedChapters, actualChapters); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); + } + + [Test] + public void EndCharacterMarker() + { + var usfmStructureExtractor = new UsfmStructureExtractor(); + usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.EndChar(_verseTextParserState, "k", null, false); + usfmStructureExtractor.Text(_verseTextParserState, "test"); + + List expectedChapters = + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Character) + .Build() + ] + ) + ] + ) + ]; + + var actualChapters = usfmStructureExtractor.GetChapters(); + 
AssertChapterEqual(expectedChapters, actualChapters); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); + } + + [Test] + public void EndNoteMarker() + { + var usfmStructureExtractor = new UsfmStructureExtractor(); + usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.EndNote(_verseTextParserState, "f", false); + usfmStructureExtractor.Text(_verseTextParserState, "test"); + + List expectedChapters = + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Embed) + .Build() + ] + ) + ] + ) + ]; + + var actualChapters = usfmStructureExtractor.GetChapters(); + AssertChapterEqual(expectedChapters, actualChapters); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); + } + + [Test] + public void EndTableMarker() + { + var usfmStructureExtractor = new UsfmStructureExtractor(); + usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.EndNote(_verseTextParserState, "tr", false); + usfmStructureExtractor.Text(_verseTextParserState, "test"); + + List expectedChapters = + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Embed) + .Build() + ] + ) + ] + ) + ]; + + var actualChapters = usfmStructureExtractor.GetChapters(); + AssertChapterEqual(expectedChapters, actualChapters); + 
Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); + } + + [Test] + public void RefMarker() + { + var usfmStructureExtractor = new UsfmStructureExtractor(); + usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.EndNote(_verseTextParserState, "x", false); + usfmStructureExtractor.Text(_verseTextParserState, "test"); + + List expectedChapters = + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Embed) + .Build() + ] + ) + ] + ) + ]; + + var actualChapters = usfmStructureExtractor.GetChapters(); + AssertChapterEqual(expectedChapters, actualChapters); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); + } + + [Test] + public void SidebarMarker() + { + var usfmStructureExtractor = new UsfmStructureExtractor(); + usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.EndNote(_verseTextParserState, "esb", false); + usfmStructureExtractor.Text(_verseTextParserState, "test"); + + List expectedChapters = + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Embed) + .Build() + ] + ) + ] + ) + ]; + + var actualChapters = usfmStructureExtractor.GetChapters(); + AssertChapterEqual(expectedChapters, actualChapters); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); + 
Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); + } + + [Test] + public void MultipleVerses() + { + var usfmStructureExtractor = new UsfmStructureExtractor(); + usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.Text(_verseTextParserState, "test"); + usfmStructureExtractor.Verse(_verseTextParserState, "2", "v", null, null); + usfmStructureExtractor.Text(_verseTextParserState, "test2"); + + List expectedChapters = + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build() + ] + ), + new Verse( + [ + new TextSegment.Builder() + .SetText("test2") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build() + ] + ), + ] + ) + ]; + + var actualChapters = usfmStructureExtractor.GetChapters(); + AssertChapterEqual(expectedChapters, actualChapters); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); + Assert.IsNull(actualChapters[0].Verses[1].TextSegments[0].PreviousSegment); + Assert.IsNull(actualChapters[0].Verses[1].TextSegments[0].NextSegment); + } + + [Test] + public void MultipleChapters() + { + var usfmStructureExtractor = new UsfmStructureExtractor(); + usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.Text(_verseTextParserState, "test"); + usfmStructureExtractor.Chapter(_verseTextParserState, "2", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.Text(_verseTextParserState, "test2"); + + List expectedChapters = + [ + new Chapter( + [ + new Verse( + [ + 
new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build() + ] + ), + ] + ), + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("test2") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build() + ] + ), + ] + ), + ]; + + var actualChapters = usfmStructureExtractor.GetChapters(); + AssertChapterEqual(expectedChapters, actualChapters); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); + Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); + Assert.IsNull(actualChapters[1].Verses[0].TextSegments[0].PreviousSegment); + Assert.IsNull(actualChapters[1].Verses[0].TextSegments[0].NextSegment); + } + + [Test] + public void CharacterMarkerInText() + { + var usfmStructureExtractor = new UsfmStructureExtractor(); + usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.Text(_verseTextParserState, "test"); + usfmStructureExtractor.StartChar(_verseTextParserState, "k", false, null); + usfmStructureExtractor.Text(_verseTextParserState, "test2"); + + List expectedChapters = + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build(), + new TextSegment.Builder() + .SetText("test2") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Character) + .Build(), + ] + ), + ] + ) + ]; + + var actualChapters = usfmStructureExtractor.GetChapters(); + AssertChapterEqual(expectedChapters, actualChapters); + Assert.That( + actualChapters[0].Verses[0].TextSegments[1].PreviousSegment, + Is.EqualTo(expectedChapters[0].Verses[0].TextSegments[0]) + ); + Assert.That( + 
actualChapters[0].Verses[0].TextSegments[0].NextSegment, + Is.EqualTo(expectedChapters[0].Verses[0].TextSegments[1]) + ); + } + + [Test] + public void EmptyText() + { + var usfmStructureExtractor = new UsfmStructureExtractor(); + usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.Text(_verseTextParserState, "test"); + usfmStructureExtractor.StartChar(_verseTextParserState, "k", false, null); + usfmStructureExtractor.Text(_verseTextParserState, ""); + usfmStructureExtractor.EndChar(_verseTextParserState, "k", null, false); + usfmStructureExtractor.Text(_verseTextParserState, "test2"); + + List expectedChapters = + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build(), + new TextSegment.Builder() + .SetText("test2") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Character) + .Build(), + ] + ), + ] + ) + ]; + + var actualChapters = usfmStructureExtractor.GetChapters(); + AssertChapterEqual(expectedChapters, actualChapters); + Assert.That( + actualChapters[0].Verses[0].TextSegments[1].PreviousSegment, + Is.EqualTo(expectedChapters[0].Verses[0].TextSegments[0]) + ); + Assert.That( + actualChapters[0].Verses[0].TextSegments[0].NextSegment, + Is.EqualTo(expectedChapters[0].Verses[0].TextSegments[1]) + ); + } + + private static void AssertChapterEqual(List expectedChapters, List actualChapters) + { + Assert.That(expectedChapters.Count, Is.EqualTo(actualChapters.Count)); + foreach ((Chapter expectedChapter, Chapter actualChapter) in expectedChapters.Zip(actualChapters)) + { + Assert.That(expectedChapter.Verses.Count, Is.EqualTo(actualChapter.Verses.Count)); + foreach ((Verse expectedVerse, Verse actualVerse) in 
expectedChapter.Verses.Zip(actualChapter.Verses)) + { + Assert.That(expectedVerse.TextSegments.Count, Is.EqualTo(actualVerse.TextSegments.Count)); + foreach ( + (TextSegment expectedSegment, TextSegment actualSegment) in expectedVerse.TextSegments.Zip( + actualVerse.TextSegments + ) + ) + { + Assert.That(expectedSegment, Is.EqualTo(actualSegment)); + } + } + } + } + + private class MockUsfmParserState(UsfmStylesheet stylesheet, ScrVers versification, IReadOnlyList<UsfmToken> tokens) + : UsfmParserState(stylesheet, versification, tokens) + { + public void SetVerseNum(int verseNum) + { + var vref = VerseRef; + vref.VerseNum = verseNum; + VerseRef = vref; + } + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/VerseTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/VerseTests.cs new file mode 100644 index 00000000..d072fae1 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/VerseTests.cs @@ -0,0 +1,57 @@ +using NUnit.Framework; + +namespace SIL.Machine.Corpora.PunctuationAnalysis; + +[TestFixture] +public class VerseTests +{ + [Test] + public void InitializeVerse() + { + List<TextSegment> textSegments = + [ + new TextSegment.Builder().SetText("Segment 1").Build(), + new TextSegment.Builder().SetText("Segment 2").Build(), + new TextSegment.Builder().SetText("Segment 3").Build(), + ]; + + var verse = new Verse(textSegments); + + Assert.That(verse.TextSegments, Has.Count.EqualTo(3)); + Assert.That(verse.TextSegments, Is.EqualTo(textSegments)); + } + + [Test] + public void SegmentIndices() + { + List<TextSegment> textSegments = + [ + new TextSegment.Builder().SetText("Segment 1").Build(), + new TextSegment.Builder().SetText("Segment 1").Build(), + new TextSegment.Builder().SetText("Segment 1").Build(), + ]; + + var verse = new Verse(textSegments); + + Assert.That(verse.TextSegments[0].IndexInVerse, Is.EqualTo(0)); + Assert.That(verse.TextSegments[1].IndexInVerse, Is.EqualTo(1)); + Assert.That(verse.TextSegments[2].IndexInVerse, Is.EqualTo(2)); + } + +
[Test] + public void NumSegmentsInVerse() + { + List<TextSegment> textSegments = + [ + new TextSegment.Builder().SetText("Segment 1").Build(), + new TextSegment.Builder().SetText("Segment 2").Build(), + new TextSegment.Builder().SetText("Segment 3").Build(), + ]; + + var verse = new Verse(textSegments); + + Assert.That(verse.TextSegments[0].NumSegmentsInVerse, Is.EqualTo(3)); + Assert.That(verse.TextSegments[1].NumSegmentsInVerse, Is.EqualTo(3)); + Assert.That(verse.TextSegments[2].NumSegmentsInVerse, Is.EqualTo(3)); + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/temp.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/temp.cs deleted file mode 100644 index b84ef0f2..00000000 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/temp.cs +++ /dev/null @@ -1,6 +0,0 @@ -using NUnit.Framework; - -namespace SIL.Machine.Corpora.PunctuationAnalysis; - -[TestFixture] -public class DepthBasedQuotationMarkResolverTestsTemp { } diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs new file mode 100644 index 00000000..3c21eae4 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs @@ -0,0 +1,60 @@ +using NUnit.Framework; +using SIL.Machine.Corpora.PunctuationAnalysis; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class QuotationDenormalizationTests +{ + [Test] + public void FullQuotationDenormalizationPipeline() + { + var normalizedUsfm = + @" + \id GEN + \c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + 'You shall not eat of any tree of the garden'?"" + \v 2 The woman said to the serpent, + ""We may eat fruit from the trees of the garden, + \v 3 but not the fruit of the tree which is in the middle of the garden. + God has said, 'You shall not eat of it.
You shall not touch it, lest you die.'"" + "; + + var expectedDenormalizedUsfm = + @"\id GEN +\c 1 +\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?” +\v 2 The woman said to the serpent, “We may eat fruit from the trees of the garden, +\v 3 but not the fruit of the tree which is in the middle of the garden. God has said, ‘You shall not eat of it. You shall not touch it, lest you die.’” +"; + + var standardEnglishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + "standard_english" + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + + var quotationMarkDenormalizationFirstPass = new QuotationMarkDenormalizationFirstPass( + standardEnglishQuoteConvention, + standardEnglishQuoteConvention + ); + + UsfmParser.Parse(normalizedUsfm, quotationMarkDenormalizationFirstPass); + var bestChapterStrategies = quotationMarkDenormalizationFirstPass.FindBestChapterStrategies(); + + var quotationMarkDenormalizer = new QuotationMarkDenormalizationUsfmUpdateBlockHandler( + standardEnglishQuoteConvention, + standardEnglishQuoteConvention, + new QuotationMarkUpdateSettings(chapterActions: bestChapterStrategies) + ); + + var updater = new UpdateUsfmParserHandler(updateBlockHandlers: [quotationMarkDenormalizer]); + UsfmParser.Parse(normalizedUsfm, updater); + + var actualDenormalizedUsfm = updater.GetUsfm(); + + Assert.That(actualDenormalizedUsfm, Is.EqualTo(expectedDenormalizedUsfm).IgnoreLineEndings()); + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs new file mode 100644 index 00000000..5bb2d5a2 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs @@ -0,0 +1,495 @@ +using
NUnit.Framework; +using SIL.Machine.Corpora.PunctuationAnalysis; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class QuotationMarkDenormalizationUsfmUpdateBlockHandlerTests +{ + private const string SimpleNormalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + 'You shall not eat of any tree of the garden'?"" + "; + + [Test] + public void SimpleEnglishQuoteDenormalization() + { + var normalizedUsfm = SimpleNormalizedUsfm; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" + ); + + var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "standard_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleBritishEnglishQuoteDenormalization() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, 'Has God really said, + ""You shall not eat of any tree of the garden""?' + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?’" + ); + + var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "british_english", "british_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + + // no denormalization should be needed for this example + } + + [Test] + public void SimpleTypewriterEnglishQuoteDenormalization() + { + var normalizedUsfm = SimpleNormalizedUsfm; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, \"Has God really said, 'You shall not eat of any tree of the garden'?\"" + ); + + var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "typewriter_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + + // some of the quotes shouldn't need to be denormalized + } + + [Test] + public void SimpleHybridTypewriterEnglishQuoteDenormalization() + { + var normalizedUsfm = SimpleNormalizedUsfm; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, 'You shall not eat of any tree of the garden'?”" + ); + + var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "hybrid_typewriter_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + + // the single guillemets shouldn't need to be denormalized + // because Moses doesn't normalize them + } + + [Test] + public void SimpleFrenchQuoteDenormalization() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + ‹You shall not eat of any tree of the garden›?"" + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, «Has God really said, ‹You shall not eat of any tree of the garden›?»" + ); + + var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_french", "standard_french"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + + // the unusual quotation marks shouldn't need to be denormalized + } + + [Test] + public void SimpleTypewriterFrenchQuoteDenormalization() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, <<Has God really said, + <You shall not eat of any tree of the garden>?>> + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, <<Has God really said, <You shall not eat of any tree of the garden>?>>" + ); + + var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "typewriter_french", "typewriter_french"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + + // the 1st- and 2nd-level quotes are denormalized to identical marks + } + + [Test] + public void SimpleWesternEuropeanQuoteDenormalization() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + ""You shall not eat of any tree of the garden""?"" + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, «Has God really said, “You shall not eat of any tree of the garden”?»" + ); + + var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "western_european", "western_european"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleTypewriterWesternEuropeanQuoteDenormalization() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, <<Has God really said, + ""You shall not eat of any tree of the garden""?>> + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, <<Has God really said, \"You shall not eat of any tree of the garden\"?>>" + ); + + var observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "typewriter_western_european", + "typewriter_western_european" + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleTypewriterWesternEuropeanVariantQuoteDenormalization() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made.
+ He said to the woman, ""Has God really said, + <You shall not eat of any tree of the garden>?"" + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, \"Has God really said, <You shall not eat of any tree of the garden>?\"" + ); + + var observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "typewriter_western_european_variant", + "typewriter_western_european_variant" + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleHybridTypewriterWesternEuropeanQuoteDenormalization() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + ""You shall not eat of any tree of the garden""?"" + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, «Has God really said, \"You shall not eat of any tree of the garden\"?»" + ); + + var observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "hybrid_typewriter_western_european", + "hybrid_typewriter_western_european" + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleCentralEuropeanQuoteDenormalization() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + ""You shall not eat of any tree of the garden""?"" + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made.
He said to " + + "the woman, „Has God really said, ‚You shall not eat of any tree of the garden‘?“" + ); + + var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "central_european", "central_european"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleCentralEuropeanGuillemetsQuoteDenormalization() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + ›You shall not eat of any tree of the garden‹?"" + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, »Has God really said, ›You shall not eat of any tree of the garden‹?«" + ); + + var observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "central_european_guillemets", + "central_european_guillemets" + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleSwedishQuoteDenormalization() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + 'You shall not eat of any tree of the garden'?"" + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, ”Has God really said, ’You shall not eat of any tree of the garden’?”" + ); + + var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_swedish", "standard_swedish"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleFinnishQuoteDenormalization() + { + var normalizedUsfm = SimpleNormalizedUsfm; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, »Has God really said, ’You shall not eat of any tree of the garden’?»" + ); + + var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "standard_finnish"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleEasternEuropeanQuoteDenormalization() + { + var normalizedUsfm = SimpleNormalizedUsfm; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, „Has God really said, ‚You shall not eat of any tree of the garden’?”" + ); + + var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "eastern_european"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleRussianQuoteDenormalization() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + ""You shall not eat of any tree of the garden""?"" + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, «Has God really said, „You shall not eat of any tree of the garden“?»" + ); + + var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_russian", "standard_russian"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleArabicQuoteDenormalization() + { + var normalizedUsfm = SimpleNormalizedUsfm; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, ”Has God really said, ’You shall not eat of any tree of the garden‘?“" + ); + + var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "standard_arabic"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void FallbackQuotationDenormalizationSameAsFull() + { + var normalizedUsfm = SimpleNormalizedUsfm; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" + ); + + var observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "standard_english", + "standard_english", + new QuotationMarkUpdateSettings(QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void FallbackQuotationDenormalizationIncorrectlyNested() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + ""You shall not eat of any tree of the garden""?"" + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, “You shall not eat of any tree of the garden”?”" + ); + + var observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "standard_english", + "standard_english", + new QuotationMarkUpdateSettings(QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void FallbackQuotationDenormalizationIncorrectlyNestedSecondCase() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, 'Has God really said, + ""You shall not eat of any tree of the garden""?' 
+ "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?’" + ); + + var observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "standard_english", + "standard_english", + new QuotationMarkUpdateSettings(QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void FallbackQuotationDenormalizationUnclosedQuote() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + You shall not eat of any tree of the garden'?"" + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ); + + var observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "standard_english", + "standard_english", + new QuotationMarkUpdateSettings(QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + public string DenormalizeQuotationMarks( + string normalizedUsfm, + string sourceQuoteConventionName, + string targetQuoteConventionName, + QuotationMarkUpdateSettings? 
quotationDenormalizationSettings = null + ) + { + quotationDenormalizationSettings ??= new QuotationMarkUpdateSettings(); + QuotationMarkDenormalizationUsfmUpdateBlockHandler quotationDenormalizer = ( + CreateQuotationDenormalizationUsfmUpdateBlockHandler( + sourceQuoteConventionName, + targetQuoteConventionName, + quotationDenormalizationSettings + ) + ); + + var updater = new UpdateUsfmParserHandler(updateBlockHandlers: [quotationDenormalizer]); + UsfmParser.Parse(normalizedUsfm, updater); + + return updater.GetUsfm(); + } + + public QuotationMarkDenormalizationUsfmUpdateBlockHandler CreateQuotationDenormalizationUsfmUpdateBlockHandler( + string sourceQuoteConventionName, + string targetQuoteConventionName, + QuotationMarkUpdateSettings? quotationDenormalizationSettings = null + ) + { + quotationDenormalizationSettings ??= new QuotationMarkUpdateSettings(); + var sourceQuoteConvention = GetQuoteConventionByName(sourceQuoteConventionName); + var targetQuoteConvention = GetQuoteConventionByName(targetQuoteConventionName); + + return new QuotationMarkDenormalizationUsfmUpdateBlockHandler( + sourceQuoteConvention, + targetQuoteConvention, + quotationDenormalizationSettings + ); + } + + public void AssertUsfmEqual(string observedUsfm, string expectedUsfm) + { + foreach ((string observedLine, string expectedLine) in observedUsfm.Split("\n").Zip(expectedUsfm.Split("\n"))) + { + Assert.That(observedLine.Trim(), Is.EqualTo(expectedLine.Trim())); + } + } + + public QuoteConvention GetQuoteConventionByName(string name) + { + QuoteConvention quoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName(name); + Assert.IsNotNull(quoteConvention); + return quoteConvention; + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs new file mode 100644 index 00000000..c38d349a --- /dev/null +++ 
b/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs @@ -0,0 +1,730 @@ +using NUnit.Framework; +using SIL.Machine.Corpora.PunctuationAnalysis; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class QuotationMarkUpdateFirstPassTests +{ + [Test] + public void CheckWhetherFallbackModeWillWork() + { + var firstPassAnalyzer = new QuotationMarkUpdateFirstPass( + new QuoteConvention("", []), + new QuoteConvention("", []) + ); + + // Cases where we expect fallback mode to work + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_english"), + GetQuoteConventionByName("standard_english") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_french"), + GetQuoteConventionByName("british_english") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("typewriter_western_european"), + GetQuoteConventionByName("standard_russian") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("typewriter_western_european_variant"), + GetQuoteConventionByName("standard_arabic") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("central_european"), + GetQuoteConventionByName("british_typewriter_english") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_swedish"), + GetQuoteConventionByName("typewriter_french") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_finnish"), + GetQuoteConventionByName("british_inspired_western_european") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("eastern_european"), + GetQuoteConventionByName("central_european") + ) + ); + + // Cases where we expect fallback mode 
to fail + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_english"), + GetQuoteConventionByName("western_european") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("typewriter_french"), + GetQuoteConventionByName("western_european") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_french"), + GetQuoteConventionByName("french_variant") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("central_european"), + GetQuoteConventionByName("typewriter_western_european") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("eastern_european"), + GetQuoteConventionByName("standard_russian") + ) + ); + } + + [Test] + public void CheckWhetherFallbackModeWillWorkWithNormalizedConventions() + { + var firstPassAnalyzer = new QuotationMarkUpdateFirstPass( + new QuoteConvention("", []), + new QuoteConvention("", []) + ); + + // Cases where we expect fallback mode to work + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_english").Normalize(), + GetQuoteConventionByName("standard_english") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_french").Normalize(), + GetQuoteConventionByName("british_english") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("typewriter_western_european").Normalize(), + GetQuoteConventionByName("standard_russian") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("typewriter_western_european_variant").Normalize(), + GetQuoteConventionByName("standard_arabic") + ) + ); + Assert.IsTrue( + 
firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("central_european").Normalize(), + GetQuoteConventionByName("british_typewriter_english") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_swedish").Normalize(), + GetQuoteConventionByName("typewriter_french") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_finnish").Normalize(), + GetQuoteConventionByName("british_inspired_western_european") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("eastern_european").Normalize(), + GetQuoteConventionByName("central_european") + ) + ); + + // Cases where we expect fallback mode to fail + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("western_european").Normalize(), + GetQuoteConventionByName("standard_english") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("french_variant").Normalize(), + GetQuoteConventionByName("hybrid_typewriter_english") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("british_inspired_western_european").Normalize(), + GetQuoteConventionByName("standard_russian") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("typewriter_english").Normalize(), + GetQuoteConventionByName("western_european") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("central_european_guillemets").Normalize(), + GetQuoteConventionByName("french_variant") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_arabic").Normalize(), + GetQuoteConventionByName("hybrid_typewriter_english") + ) + ); + Assert.IsFalse( + 
firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_russian").Normalize(), + GetQuoteConventionByName("standard_french") + ) + ); + } + + [Test] + public void ChooseBestActionForChapter() + { + // Verse text with no issues + var actualAction = RunFirstPassOnChapter( + [ + "Now the serpent was more subtle than any animal " + + "of the field which Yahweh God had made. " + + "He said to the woman, “Has God really said, " + + "‘You shall not eat of any tree of the garden’?”" + ], + "standard_english", + "standard_english" + ); + var expectedAction = QuotationMarkUpdateStrategy.ApplyFull; + Assert.That(actualAction, Is.EqualTo(expectedAction)); + + // Verse text with unpaired opening quotation mark + actualAction = RunFirstPassOnChapter( + [ + "Now the serpent was more subtle than any animal " + + "of the field which Yahweh God had made. " + + "He said to the woman, “Has God really said, " + + "‘You shall not eat of any tree of the garden’?" + ], + "standard_english", + "standard_english" + ); + expectedAction = QuotationMarkUpdateStrategy.ApplyFallback; + Assert.That(actualAction, Is.EqualTo(expectedAction)); + + // Verse text with unpaired closing quotation mark + actualAction = RunFirstPassOnChapter( + [ + "Now the serpent was more subtle than any animal " + + "of the field which Yahweh God had made. " + + "He said to the woman, Has God really said, " + + "You shall not eat of any tree of the garden?”" + ], + "standard_english", + "standard_english" + ); + expectedAction = QuotationMarkUpdateStrategy.ApplyFallback; + Assert.That(actualAction, Is.EqualTo(expectedAction)); + + // Verse text with too deeply nested quotation marks + actualAction = RunFirstPassOnChapter( + [ + "“Now the serpent was more “subtle than any animal " + + "of the “field which “Yahweh God had made. " + + "He said to the woman, “Has God really said, " + + "“You shall not eat of any tree of the garden?" 
+ ], + "standard_english", + "standard_english" + ); + expectedAction = QuotationMarkUpdateStrategy.ApplyFallback; + Assert.That(actualAction, Is.EqualTo(expectedAction)); + + // Verse text with an ambiguous quotation mark + actualAction = RunFirstPassOnChapter( + [ + "Now the serpent was more subtle than any animal " + + "of the field which Yahweh God had made. " + + "He said to the woman\"Has God really said, " + + "You shall not eat of any tree of the garden?" + ], + "typewriter_english", + "standard_english" + ); + expectedAction = QuotationMarkUpdateStrategy.Skip; + Assert.That(actualAction, Is.EqualTo(expectedAction)); + + // Verse text with an ambiguous quotation mark + actualAction = RunFirstPassOnChapter( + [ + "Now the serpent was more subtle than any animal " + + "of the field which Yahweh God had made. " + + "He said to the woman\"Has God really said, " + + "You shall not eat of any tree of the garden?" + ], + "typewriter_english", + "standard_english" + ); + expectedAction = QuotationMarkUpdateStrategy.Skip; + Assert.That(actualAction, Is.EqualTo(expectedAction)); + + // Verse text with too deeply nested ambiguous quotation marks + actualAction = RunFirstPassOnChapter( + [ + "\"Now the serpent was more \"subtle than any animal " + + "of the \"field which \"Yahweh God had made. " + + "He said to the woman, \"Has God really said, " + + "\"You shall not eat of any tree of the garden?" 
+ ], + "typewriter_english", + "standard_english" + ); + expectedAction = QuotationMarkUpdateStrategy.Skip; + Assert.That(actualAction, Is.EqualTo(expectedAction)); + } + + [Test] + public void ChooseBestActionBasedOnObservedIssues() + { + var firstPassAnalyzer = new QuotationMarkUpdateFirstPass( + new QuoteConvention("", []), + new QuoteConvention("", []) + ); + firstPassAnalyzer.WillFallbackModeWork = false; + + // Test with no issue + var bestAction = firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues([]); + Assert.That(bestAction, Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFull)); + + // Test with one issue + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [QuotationMarkResolutionIssue.UnpairedQuotationMark] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [QuotationMarkResolutionIssue.AmbiguousQuotationMark] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues([QuotationMarkResolutionIssue.TooDeepNesting]), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + + // Test with multiple issues + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [QuotationMarkResolutionIssue.TooDeepNesting, QuotationMarkResolutionIssue.AmbiguousQuotationMark,] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [ + QuotationMarkResolutionIssue.UnpairedQuotationMark, + QuotationMarkResolutionIssue.AmbiguousQuotationMark, + ] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [QuotationMarkResolutionIssue.TooDeepNesting, QuotationMarkResolutionIssue.UnpairedQuotationMark,] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + } + + [Test] + public void 
ChooseBestActionBasedOnObservedIssuesWithBasicFallback() + { + var firstPassAnalyzer = new QuotationMarkUpdateFirstPass( + new QuoteConvention("", []), + new QuoteConvention("", []) + ); + firstPassAnalyzer.WillFallbackModeWork = true; + + // Test with no issues + var bestAction = firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues([]); + Assert.That(bestAction, Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFull)); + + // Test with one issue + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [QuotationMarkResolutionIssue.UnpairedQuotationMark] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFallback) + ); + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [QuotationMarkResolutionIssue.AmbiguousQuotationMark] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues([QuotationMarkResolutionIssue.TooDeepNesting]), + Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFallback) + ); + + // Test with multiple issues + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [ + QuotationMarkResolutionIssue.AmbiguousQuotationMark, + QuotationMarkResolutionIssue.UnpairedQuotationMark, + ] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [QuotationMarkResolutionIssue.AmbiguousQuotationMark, QuotationMarkResolutionIssue.TooDeepNesting,] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [QuotationMarkResolutionIssue.TooDeepNesting, QuotationMarkResolutionIssue.UnpairedQuotationMark,] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFallback) + ); + + // tests of getBestActionsByChapter() + } + + [Test] + public void NoIssuesInUsfm() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had 
made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + "; + List expectedActions = [QuotationMarkUpdateStrategy.ApplyFull]; + var observedActions = RunFirstPass(normalizedUsfm, "standard_english", "standard_english"); + + Assert.That(expectedActions.SequenceEqual(observedActions)); + } + + [Test] + public void UnpairedOpeningMark() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’? + "; + List expectedActions = [QuotationMarkUpdateStrategy.ApplyFallback]; + var observedActions = RunFirstPass(normalizedUsfm, "standard_english", "standard_english"); + + Assert.That(expectedActions.SequenceEqual(observedActions)); + } + + [Test] + public void UnpairedClosingMark() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, Has God really said, + You shall not eat of any tree of the garden?” + "; + List expectedActions = [QuotationMarkUpdateStrategy.ApplyFallback]; + var observedActions = RunFirstPass(normalizedUsfm, "standard_english", "standard_english"); + + Assert.That(expectedActions.SequenceEqual(observedActions)); + } + + [Test] + public void TooDeepNesting() + { + var normalizedUsfm = + @"\c 1 + \v 1 “Now the serpent was more “subtle than any animal + of the “field which “Yahweh God had made. + He said to the woman, “Has God really said, + “You shall not eat of any tree of the garden? 
+ "; + List expectedActions = [QuotationMarkUpdateStrategy.ApplyFallback]; + var observedActions = RunFirstPass(normalizedUsfm, "standard_english", "standard_english"); + + Assert.That(expectedActions.SequenceEqual(observedActions)); + } + + [Test] + public void AmbiguousQuotationMark() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman""Has God really said, + You shall not eat of any tree of the garden? + "; + List expectedActions = [QuotationMarkUpdateStrategy.Skip]; + var observedActions = RunFirstPass(normalizedUsfm, "typewriter_english", "standard_english"); + + Assert.That(expectedActions.SequenceEqual(observedActions)); + } + + [Test] + public void NoIssuesInMultipleChapters() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + \c 2 \v 1 He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + "; + List expectedActions = + [ + QuotationMarkUpdateStrategy.ApplyFull, + QuotationMarkUpdateStrategy.ApplyFull + ]; + var observedActions = RunFirstPass(normalizedUsfm, "standard_english", "standard_english"); + + Assert.That(expectedActions.SequenceEqual(observedActions)); + } + + [Test] + public void UnpairedQuotationMarkInSecondChapter() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ \c 2 \v 1 He said to the woman, Has God really said, + You shall not eat of any tree of the garden?” + "; + List expectedActions = + [ + QuotationMarkUpdateStrategy.ApplyFull, + QuotationMarkUpdateStrategy.ApplyFallback + ]; + var observedActions = RunFirstPass(normalizedUsfm, "standard_english", "standard_english"); + + Assert.That(expectedActions.SequenceEqual(observedActions)); + } + + [Test] + public void UnpairedQuotationMarkInFirstChapter() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had” made. + \c 2 \v 1 He said to the woman, Has God really said, + “You shall not eat of any tree of the garden?” + "; + List expectedActions = + [ + QuotationMarkUpdateStrategy.ApplyFallback, + QuotationMarkUpdateStrategy.ApplyFull + ]; + var observedActions = RunFirstPass(normalizedUsfm, "standard_english", "standard_english"); + + Assert.That(expectedActions.SequenceEqual(observedActions)); + } + + [Test] + public void AmbiguousQuotationMarkInSecondChapter() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + \c 2 \v 1 He said to the woman, Has God really said, + You shall not""eat of any tree of the garden?"" + "; + List expectedActions = + [ + QuotationMarkUpdateStrategy.ApplyFull, + QuotationMarkUpdateStrategy.Skip + ]; + var observedActions = RunFirstPass(normalizedUsfm, "typewriter_english", "standard_english"); + + Assert.That(expectedActions.SequenceEqual(observedActions)); + } + + [Test] + public void AmbiguousQuotationMarkInFirstChapter() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field""which Yahweh God had made. 
+ \c 2 \v 1 He said to the woman, Has God really said, + ""You shall not eat of any tree of the garden?"" + "; + List expectedActions = + [ + QuotationMarkUpdateStrategy.Skip, + QuotationMarkUpdateStrategy.ApplyFull + ]; + var observedActions = RunFirstPass(normalizedUsfm, "typewriter_english", "standard_english"); + + Assert.That(expectedActions.SequenceEqual(observedActions)); + } + + [Test] + public void UnpairedQuotationMarkInBothChapters() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had” made. + \c 2 \v 1 He said to the woman, Has God really said, + You shall not eat of any tree of the garden?” + "; + List expectedActions = + [ + QuotationMarkUpdateStrategy.ApplyFallback, + QuotationMarkUpdateStrategy.ApplyFallback + ]; + var observedActions = RunFirstPass(normalizedUsfm, "standard_english", "standard_english"); + + Assert.That(expectedActions.SequenceEqual(observedActions)); + } + + [Test] + public void AmbiguousQuotationMarkInBothChapters() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had""made. + \c 2 \v 1 He said to the woman, Has God really said, + You shall not eat of any""tree of the garden? + "; + List expectedActions = + [ + QuotationMarkUpdateStrategy.Skip, + QuotationMarkUpdateStrategy.Skip + ]; + var observedActions = RunFirstPass(normalizedUsfm, "typewriter_english", "standard_english"); + + Assert.That(expectedActions.SequenceEqual(observedActions)); + } + + [Test] + public void UnpairedInFirstAmbiguousInSecond() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made."" + \c 2 \v 1 He said to the woman, Has God really said, + You shall not eat of any""tree of the garden? 
+ "; + List expectedActions = + [ + QuotationMarkUpdateStrategy.ApplyFallback, + QuotationMarkUpdateStrategy.Skip + ]; + var observedActions = RunFirstPass(normalizedUsfm, "typewriter_english", "standard_english"); + + Assert.That(expectedActions.SequenceEqual(observedActions)); + } + + [Test] + public void AmbiguousInFirstUnpairedInSecond() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God""had made. + \c 2 \v 1 He said to the woman, Has God really said, + You shall not eat of any tree of the garden ? "" + "; + List expectedActions = + [ + QuotationMarkUpdateStrategy.Skip, + QuotationMarkUpdateStrategy.ApplyFallback + ]; + var observedActions = RunFirstPass(normalizedUsfm, "typewriter_english", "standard_english"); + + Assert.That(expectedActions.SequenceEqual(observedActions)); + } + + public List RunFirstPass( + string normalizedUsfm, + string sourceQuoteConventionName, + string targetQuoteConventionName + ) + { + var sourceQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + sourceQuoteConventionName + ); + Assert.IsNotNull(sourceQuoteConvention); + + var targetQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + targetQuoteConventionName + ); + Assert.IsNotNull(targetQuoteConvention); + + var firstPassAnalyzer = new QuotationMarkUpdateFirstPass(sourceQuoteConvention, targetQuoteConvention); + UsfmParser.Parse(normalizedUsfm, firstPassAnalyzer); + + return firstPassAnalyzer.FindBestChapterStrategies(); + } + + public QuotationMarkUpdateStrategy RunFirstPassOnChapter( + List verseTexts, + string sourceQuoteConventionName, + string targetQuoteConventionName + ) + { + var sourceQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + sourceQuoteConventionName + ); + Assert.IsNotNull(sourceQuoteConvention); + + var targetQuoteConvention = 
StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + targetQuoteConventionName + ); + Assert.IsNotNull(targetQuoteConvention); + + var firstPassAnalyzer = new QuotationMarkUpdateFirstPass(sourceQuoteConvention, targetQuoteConvention); + + var chapter = new Chapter( + verseTexts.Select(verseText => new Verse([new TextSegment.Builder().SetText(verseText).Build()])).ToList() + ); + + return firstPassAnalyzer.FindBestStrategyForChapter(chapter); + } + + public QuoteConvention GetQuoteConventionByName(string name) + { + var quoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName(name); + Assert.IsNotNull(quoteConvention); + return quoteConvention; + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs new file mode 100644 index 00000000..36f3bd4b --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs @@ -0,0 +1,839 @@ +using NUnit.Framework; +using SIL.Machine.Corpora.PunctuationAnalysis; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class QuoteConventionChangingUsfmUpdateBlockHandlerTests +{ + [Test] + public void QuotesSpanningVerses() + { + var inputUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + \v 2 “You shall not eat of any tree of the garden”?» + "; + + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, \n" + + "\\v 2 ‘You shall not eat of any tree of the garden’?”" + ); + + var observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SingleEmbed() + { + var inputUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + \f + \ft «This is a “footnote”» \f* + of the field which Yahweh God had made. + "; + + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal " + + "\\f + \\ft “This is a ‘footnote’” \\f* of the field which Yahweh God had made." + ); + + var observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void MultipleEmbeds() + { + var inputUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + \f + \ft «This is a “footnote”» \f* + of the field \f + \ft Second «footnote» here \f* which Yahweh God had made. + "; + + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal " + + "\\f + \\ft “This is a ‘footnote’” \\f* of the field \\f + \\ft Second " + + "“footnote” here \\f* which Yahweh God had made." + ); + + var observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void QuotesInTextAndEmbed() + { + var inputUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really \f + \ft a + «footnote» in the «midst of “text”» \f* said, + “You shall not eat of any tree of the garden”?» + "; + + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really \\f + \\ft a “footnote” in the “midst of ‘text’” \\f* " + + "said, ‘You shall not eat of any tree of the garden’?”" + ); + + var observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void QuotesInMultipleVersesAndEmbed() + { + var inputUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God + \v 2 really \f + \ft a + «footnote» in the «midst of “text”» \f* said, + “You shall not eat of any tree of the garden”?» + "; + + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God\n" + + "\\v 2 really \\f + \\ft a “footnote” in the “midst of ‘text’” \\f* " + + "said, ‘You shall not eat of any tree of the garden’?”" + ); + + var observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + + // Fallback mode does not consider the nesting of quotation marks, + // but only determines opening/closing marks and maps based on that. + } + + [Test] + public void FallbackStrategySameAsFull() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ‘Has God really said, + “You shall not eat of any tree of the garden”?’ + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" + ); + + var observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "british_english", + "standard_english", + new QuotationMarkUpdateSettings(defaultChapterAction: QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void FallbackStrategyIncorrectlyNested() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ‘Has God really said, + ‘You shall not eat of any tree of the garden’?’ + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, “You shall not eat of any tree of the garden”?”" + ); + + var observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "british_english", + "standard_english", + new QuotationMarkUpdateSettings(defaultChapterAction: QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void FallbackStrategyIncorrectlyNestedSecondCase() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?’ + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?”" + ); + + var observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "british_english", + "standard_english", + new QuotationMarkUpdateSettings(defaultChapterAction: QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void FallbackStrategyUnclosedQuote() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ‘Has God really said, + You shall not eat of any tree of the garden”?’ + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ); + + var observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "british_english", + "standard_english", + new QuotationMarkUpdateSettings(defaultChapterAction: QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void DefaultQuotationMarkUpdateStrategy() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + You shall not eat of any tree of the garden'?"" + "; + var expectedFullUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ); + + var expectedBasicUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ); + + var expectedSkippedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, \"Has God really said, You shall not eat of any tree of the garden\'?\"" + ); + + var observedUsfm = ChangeQuotationMarks(normalizedUsfm, "typewriter_english", "standard_english"); + AssertUsfmEqual(observedUsfm, expectedFullUsfm); + + observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings(defaultChapterAction: QuotationMarkUpdateStrategy.ApplyFull) + ); + AssertUsfmEqual(observedUsfm, expectedFullUsfm); + + observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings(defaultChapterAction: QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedBasicUsfm); + + observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings(defaultChapterAction: QuotationMarkUpdateStrategy.Skip) + ); + AssertUsfmEqual(observedUsfm, expectedSkippedUsfm); + } + + [Test] + public void SingleChapterQuotationMarkUpdateStrategy() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + You shall not eat of any tree of the garden'?"" + "; + var expectedFullUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ); + + var expectedBasicUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ); + + var expectedSkippedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, \"Has God really said, You shall not eat of any tree of the garden\'?\"" + ); + + var observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings(chapterActions: [QuotationMarkUpdateStrategy.ApplyFull]) + ); + AssertUsfmEqual(observedUsfm, expectedFullUsfm); + + observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings(chapterActions: [QuotationMarkUpdateStrategy.ApplyFallback]) + ); + AssertUsfmEqual(observedUsfm, expectedBasicUsfm); + + observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings(chapterActions: [QuotationMarkUpdateStrategy.Skip]) + ); + AssertUsfmEqual(observedUsfm, expectedSkippedUsfm); + } + + [Test] + public void MultipleChapterSameStrategy() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle"" than any animal + of the field which Yahweh God had made. 
+ \c 2 + \v 1 He said to the woman, ""Has God really said, + You shall not eat of any tree of the garden'?"" + "; + var expectedFullUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle\" than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ); + + var expectedFallbackUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ); + + var observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings( + chapterActions: [QuotationMarkUpdateStrategy.ApplyFull, QuotationMarkUpdateStrategy.ApplyFull] + ) + ); + AssertUsfmEqual(observedUsfm, expectedFullUsfm); + + observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings( + chapterActions: [QuotationMarkUpdateStrategy.ApplyFallback, QuotationMarkUpdateStrategy.ApplyFallback] + ) + ); + AssertUsfmEqual(observedUsfm, expectedFallbackUsfm); + } + + [Test] + public void MultipleChapterMultipleStrategies() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle"" than any animal + of the field which Yahweh God had made. 
+ \c 2 + \v 1 He said to the woman, ""Has God really said, + You shall not eat of any tree of the garden'?"" + "; + var expectedFullThenFallbackUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle\" than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ); + + var expectedFallbackThenFullUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ); + + var expectedFallbackThenSkipUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + "\\v 1 He said to the woman, \"Has God really said, You shall not eat of any tree of the garden\'?\"" + ); + + var observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings( + chapterActions: [QuotationMarkUpdateStrategy.ApplyFull, QuotationMarkUpdateStrategy.ApplyFallback] + ) + ); + AssertUsfmEqual(observedUsfm, expectedFullThenFallbackUsfm); + + observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings( + chapterActions: [QuotationMarkUpdateStrategy.ApplyFallback, QuotationMarkUpdateStrategy.ApplyFull] + ) + ); + AssertUsfmEqual(observedUsfm, expectedFallbackThenFullUsfm); + + observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings( + chapterActions: [QuotationMarkUpdateStrategy.ApplyFallback, QuotationMarkUpdateStrategy.Skip] + ) + ); + AssertUsfmEqual(observedUsfm, expectedFallbackThenSkipUsfm); + } + + [Test] + public void ProcessScriptureElement() + { + var quoteConventionChanger = ( + 
CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "british_english") + ); + var quotationMarkFinder = new MockQuotationMarkFinder(); + quoteConventionChanger.QuotationMarkFinder = quotationMarkFinder; + + var updateElement = new UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.Text, + tokens: [new UsfmToken("test segment")] + ); + var mockQuotationMarkResolver = new MockQuotationMarkResolver(); + quoteConventionChanger.InternalProcessScriptureElement(updateElement, mockQuotationMarkResolver); + + Assert.That(quotationMarkFinder.NumTimesCalled, Is.EqualTo(1)); + Assert.That(mockQuotationMarkResolver.NumTimesCalled, Is.EqualTo(1)); + Assert.That(quotationMarkFinder.MatchesToReturn[0].TextSegment.Text, Is.EqualTo("this is a ‘test")); + Assert.That(quotationMarkFinder.MatchesToReturn[1].TextSegment.Text, Is.EqualTo("the test ends” here")); + } + + [Test] + public void CreateTextSegmentsBasic() + { + var quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") + ); + + var updateElement = new UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.Text, + tokens: [new UsfmToken("test segment")] + ); + var textSegments = quoteConventionChanger.InternalCreateTextSegments(updateElement); + + Assert.That(textSegments, Has.Count.EqualTo(1)); + Assert.That(textSegments[0].Text, Is.EqualTo("test segment")); + Assert.That(textSegments[0].ImmediatePrecedingMarker, Is.EqualTo(UsfmMarkerType.NoMarker)); + Assert.That(textSegments[0].MarkersInPrecedingContext, Has.Count.EqualTo(0)); + Assert.IsNull(textSegments[0].PreviousSegment); + Assert.IsNull(textSegments[0].NextSegment); + } + + [Test] + public void CreateTextSegmentsWithPrecedingMarkers() + { + var quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") + ); + + var updateElement = new UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.Text, + tokens: + [ + new 
UsfmToken(UsfmTokenType.Verse, null, null, null), + new UsfmToken(UsfmTokenType.Paragraph, null, null, null), + new UsfmToken("test segment"), + ] + ); + var textSegments = quoteConventionChanger.InternalCreateTextSegments(updateElement); + + Assert.That(textSegments, Has.Count.EqualTo(1)); + Assert.That(textSegments[0].Text, Is.EqualTo("test segment")); + Assert.That(textSegments[0].ImmediatePrecedingMarker, Is.EqualTo(UsfmMarkerType.Paragraph)); + Assert.That( + textSegments[0].MarkersInPrecedingContext.SequenceEqual([UsfmMarkerType.Verse, UsfmMarkerType.Paragraph,]) + ); + Assert.IsNull(textSegments[0].PreviousSegment); + Assert.IsNull(textSegments[0].NextSegment); + } + + [Test] + public void CreateTextSegmentsWithMultipleTextTokens() + { + var quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") + ); + + var updateElement = new UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.Text, + tokens: + [ + new UsfmToken(UsfmTokenType.Verse, null, null, null), + new UsfmToken(UsfmTokenType.Paragraph, null, null, null), + new UsfmToken("test segment1"), + new UsfmToken(UsfmTokenType.Verse, null, null, null), + new UsfmToken(UsfmTokenType.Character, null, null, null), + new UsfmToken("test segment2"), + new UsfmToken(UsfmTokenType.Paragraph, null, null, null), + ] + ); + var textSegments = quoteConventionChanger.InternalCreateTextSegments(updateElement); + + Assert.That(textSegments, Has.Count.EqualTo(2)); + Assert.That(textSegments[0].Text, Is.EqualTo("test segment1")); + Assert.That(textSegments[0].ImmediatePrecedingMarker, Is.EqualTo(UsfmMarkerType.Paragraph)); + Assert.That( + textSegments[0].MarkersInPrecedingContext.SequenceEqual([UsfmMarkerType.Verse, UsfmMarkerType.Paragraph,]) + ); + Assert.IsNull(textSegments[0].PreviousSegment); + Assert.That(textSegments[0].NextSegment, Is.EqualTo(textSegments[1])); + Assert.That(textSegments[1].Text, Is.EqualTo("test segment2")); + 
Assert.That(textSegments[1].ImmediatePrecedingMarker, Is.EqualTo(UsfmMarkerType.Character)); + Assert.That( + textSegments[1].MarkersInPrecedingContext.SequenceEqual([UsfmMarkerType.Verse, UsfmMarkerType.Character,]) + ); + Assert.That(textSegments[1].PreviousSegment, Is.EqualTo(textSegments[0])); + Assert.IsNull(textSegments[1].NextSegment); + } + + [Test] + public void CreateTextSegment() + { + var quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") + ); + + var usfmToken = new UsfmToken("test segment"); + var segment = quoteConventionChanger.InternalCreateTextSegment(usfmToken); + + Assert.IsNotNull(segment); + Assert.That(segment.Text, Is.EqualTo("test segment")); + Assert.That(segment.ImmediatePrecedingMarker, Is.EqualTo(UsfmMarkerType.NoMarker)); + Assert.That(segment.MarkersInPrecedingContext, Has.Count.EqualTo(0)); + Assert.That(segment.UsfmToken, Is.EqualTo(usfmToken)); + } + + [Test] + public void SetPreviousAndNextForSegments() + { + var quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") + ); + + List segments = + [ + new TextSegment.Builder().SetText("segment 1 text").Build(), + new TextSegment.Builder().SetText("segment 2 text").Build(), + new TextSegment.Builder().SetText("segment 3 text").Build() + ]; + + quoteConventionChanger.InternalSetPreviousAndNextForSegments(segments); + + Assert.IsNull(segments[0].PreviousSegment); + Assert.That(segments[0].NextSegment, Is.EqualTo(segments[1])); + Assert.That(segments[1].PreviousSegment, Is.EqualTo(segments[0])); + Assert.That(segments[1].NextSegment, Is.EqualTo(segments[2])); + Assert.That(segments[2].PreviousSegment, Is.EqualTo(segments[1])); + Assert.IsNull(segments[2].NextSegment); + } + + [Test] + public void CheckForChapterChange() + { + var quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") + ); + 
+ Assert.That(quoteConventionChanger.CurrentChapterNumber, Is.EqualTo(0)); + + quoteConventionChanger.InternalCheckForChapterChange(new UsfmUpdateBlock([ScriptureRef.Parse("MAT 1:1")], [])); + + Assert.That(quoteConventionChanger.CurrentChapterNumber, Is.EqualTo(1)); + + quoteConventionChanger.InternalCheckForChapterChange( + new UsfmUpdateBlock([ScriptureRef.Parse("ISA 15:22")], []) + ); + + Assert.That(quoteConventionChanger.CurrentChapterNumber, Is.EqualTo(15)); + } + + [Test] + public void StartNewChapter() + { + var quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler( + "standard_english", + "standard_english", + new QuotationMarkUpdateSettings( + chapterActions: + [ + QuotationMarkUpdateStrategy.Skip, + QuotationMarkUpdateStrategy.ApplyFull, + QuotationMarkUpdateStrategy.ApplyFallback, + ] + ) + ) + ); + + quoteConventionChanger.VerseTextQuotationMarkResolver = new MockQuotationMarkResolver(); + + quoteConventionChanger + .NextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed) + .SetText("this text should be erased"); + quoteConventionChanger.VerseTextQuotationMarkResolver.InternalIssues.Add( + QuotationMarkResolutionIssue.IncompatibleQuotationMark + ); + + quoteConventionChanger.InternalStartNewChapter(1); + var segment = quoteConventionChanger.NextScriptureTextSegmentBuilder.Build(); + Assert.That(quoteConventionChanger.CurrentStrategy, Is.EqualTo(QuotationMarkUpdateStrategy.Skip)); + Assert.That(segment.ImmediatePrecedingMarker, Is.EqualTo(UsfmMarkerType.Chapter)); + Assert.That(segment.Text, Is.EqualTo("")); + Assert.That(!segment.MarkersInPrecedingContext.Contains(UsfmMarkerType.Embed)); + Assert.That(quoteConventionChanger.VerseTextQuotationMarkResolver.InternalIssues, Has.Count.EqualTo(0)); + + quoteConventionChanger.InternalStartNewChapter(2); + Assert.That(quoteConventionChanger.CurrentStrategy, Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFull)); + + quoteConventionChanger.InternalStartNewChapter(3); + 
Assert.That(quoteConventionChanger.CurrentStrategy, Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFallback)); + } + + private static string ChangeQuotationMarks( + string normalizedUsfm, + string sourceQuoteConventionName, + string targetQuoteConventionName, + QuotationMarkUpdateSettings? quotationMarkUpdateSettings = null + ) + { + quotationMarkUpdateSettings ??= new QuotationMarkUpdateSettings(); + var quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler( + sourceQuoteConventionName, + targetQuoteConventionName, + quotationMarkUpdateSettings + ) + ); + + var updater = new UpdateUsfmParserHandler(updateBlockHandlers: [quoteConventionChanger]); + UsfmParser.Parse(normalizedUsfm, updater); + + return updater.GetUsfm(); + } + + private static MockQuoteConventionChangingUsfmUpdateBlockHandler CreateQuoteConventionChangingUsfmUpdateBlockHandler( + string sourceQuoteConventionName, + string targetQuoteConventionName, + QuotationMarkUpdateSettings? quotationMarkUpdateSettings = null + ) + { + quotationMarkUpdateSettings ??= new QuotationMarkUpdateSettings(); + var sourceQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + sourceQuoteConventionName + ); + Assert.IsNotNull(sourceQuoteConvention); + + var targetQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + targetQuoteConventionName + ); + Assert.IsNotNull(targetQuoteConvention); + + return new MockQuoteConventionChangingUsfmUpdateBlockHandler( + sourceQuoteConvention, + targetQuoteConvention, + quotationMarkUpdateSettings + ); + } + + private static void AssertUsfmEqual(string observedUsfm, string expectedUsfm) + { + foreach ((string observedLine, string expectedLine) in observedUsfm.Split("\n").Zip(expectedUsfm.Split("\n"))) + Assert.That(observedLine.Trim(), Is.EqualTo(expectedLine.Trim())); + } + + private class MockQuoteConventionChangingUsfmUpdateBlockHandler( + QuoteConvention sourceQuoteConvention, + QuoteConvention 
targetQuoteConvention, + QuotationMarkUpdateSettings settings + ) : QuoteConventionChangingUsfmUpdateBlockHandler(sourceQuoteConvention, targetQuoteConvention, settings) + { + public QuotationMarkFinder QuotationMarkFinder + { + set => _quotationMarkFinder = value; + } + + public TextSegment.Builder NextScriptureTextSegmentBuilder + { + get => _nextScriptureTextSegmentBuilder; + } + public MockQuotationMarkResolver VerseTextQuotationMarkResolver + { + get => + _verseTextQuotationMarkResolver is MockQuotationMarkResolver mqmr + ? mqmr + : throw new InvalidOperationException( + "Unable to use implementations of IQuotationMarkResolver other than MockQuotationMarkResolver" + ); + set => _verseTextQuotationMarkResolver = value; + } + public int CurrentChapterNumber + { + get => _currentChapterNumber; + set => _currentChapterNumber = value; + } + public QuotationMarkUpdateStrategy CurrentStrategy + { + get => _currentStrategy; + set => _currentStrategy = value; + } + + public void InternalProcessScriptureElement( + UsfmUpdateBlockElement element, + IQuotationMarkResolver quotationMarkResolver + ) + { + ProcessScriptureElement(element, quotationMarkResolver); + } + + public List InternalCreateTextSegments(UsfmUpdateBlockElement element) + { + return CreateTextSegments(element); + } + + public TextSegment InternalCreateTextSegment(UsfmToken usfmToken) + { + return CreateTextSegment(usfmToken); + } + + public List InternalSetPreviousAndNextForSegments(List textSegments) + { + return SetPreviousAndNextForSegments(textSegments); + } + + public void InternalStartNewChapter(int newChapterNum) + { + StartNewChapter(newChapterNum); + } + + public void InternalCheckForChapterChange(UsfmUpdateBlock block) + { + CheckForChapterChange(block); + } + } + + private class MockQuotationMarkFinder : QuotationMarkFinder + { + public int NumTimesCalled; + public readonly List MatchesToReturn; + + public MockQuotationMarkFinder() + : base(new QuoteConventionSet([])) + { + NumTimesCalled = 0; 
+ MatchesToReturn = + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("this is a \"test").Build(), 10, 11), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("the test ends\" here").Build(), 13, 14), + ]; + } + + public override List FindAllPotentialQuotationMarksInTextSegments( + List textSegments + ) + { + NumTimesCalled++; + return MatchesToReturn; + } + } + + private class MockQuotationMarkResolver(IQuotationMarkResolutionSettings? settings = null) + : DepthBasedQuotationMarkResolver( + settings ?? new QuoteConventionDetectionResolutionSettings(new QuoteConventionSet([])) + ) + { + public int NumTimesCalled = 0; + + public HashSet InternalIssues => Issues; + + public override void Reset() + { + base.Reset(); + NumTimesCalled = 0; + } + + public override IEnumerable ResolveQuotationMarks( + List quoteMatches + ) + { + NumTimesCalled++; + int currentDepth = 1; + var currentDirection = QuotationMarkDirection.Opening; + foreach (QuotationMarkStringMatch quoteMatch in quoteMatches) + { + yield return quoteMatch.Resolve(currentDepth, currentDirection); + currentDepth++; + currentDirection = + currentDirection == QuotationMarkDirection.Opening + ? 
QuotationMarkDirection.Closing + : QuotationMarkDirection.Opening; + } + } + + public override HashSet GetIssues() + { + return new HashSet(); + } + } +} From e73cb16acb0466a4b3c43754ad02f9f8742c0ecf Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 28 Jul 2025 12:54:54 -0400 Subject: [PATCH 08/28] Undo small change - passing tests --- .../Corpora/QuotationMarkUpdateFirstPass.cs | 2 +- .../QuotationMarkFinderTests.cs | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs index b7084a6b..60df3755 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs @@ -45,7 +45,7 @@ QuoteConvention targetQuoteConvention } if (depth <= targetQuoteConvention.NumLevels) { - marks.Add(targetQuoteConvention.GetClosingQuotationMarkAtDepth(depth)); + marks.Add(targetQuoteConvention.GetClosingQuotationMarkAtDepth(depth)); //TODO could cut this loop short } } diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkFinderTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkFinderTests.cs index 884eeee7..4ebd19e8 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkFinderTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkFinderTests.cs @@ -173,15 +173,15 @@ public void ThatAllPossibleQuotationMarksAreIdentified() new TextSegment.Builder() .SetText("This has \u201equotes from \u00bbdifferent conventions < Date: Mon, 28 Jul 2025 16:10:37 -0400 Subject: [PATCH 09/28] Port more changes --- .../DepthBasedQuotationMarkResolver.cs | 18 +- .../PreliminaryQuotationMarkAnalyzer.cs | 6 +- .../QuotationMarkMetadata.cs | 19 +- .../QuotationMarkStringMatch.cs | 8 +- .../QuotationMarkTabulator.cs | 40 +++- .../PunctuationAnalysis/QuoteConvention.cs | 18 +- .../QuoteConventionDetector.cs | 24 
+- .../UsfmStructureExtractor.cs | 2 +- .../Corpora/QuotationMarkUpdateFirstPass.cs | 4 +- .../QuotationMarkUpdateResolutionSettings.cs | 7 +- .../Corpora/QuotationMarkUpdateSettings.cs | 8 +- ...onventionChangingUsfmUpdateBlockHandler.cs | 64 ++++-- .../DepthBasedQuotationMarkResolverTests.cs | 4 +- .../QuotationMarkMetadataTests.cs | 4 +- .../PunctuationAnalysis/TextSegmentTests.cs | 12 +- .../Corpora/QuotationDenormalizationTests.cs | 2 +- ...tionChangingUsfmBlockUpdateHandlerTests.cs | 208 ++++++++++++++++-- 17 files changed, 361 insertions(+), 87 deletions(-) diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs index 8401c927..ccc59278 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs @@ -110,7 +110,7 @@ QuoteContinuerStyle quoteContinuerStyle ); QuoteContinuerMarks.Push(quote); ContinuerStyle = quoteContinuerStyle; - if (QuoteContinuerMarks.Count == quotationMarkResolverState.Quotations.Count) + if (CurrentDepth == quotationMarkResolverState.CurrentDepth) { QuoteContinuerMarks.Clear(); } @@ -158,7 +158,7 @@ QuotationMarkStringMatch nextMatch if (quotationMarkMatch.StartIndex > 0) return false; - // check the next quotation mark match, since quote continuers must appear consecutively + // Check the next quotation mark match, since quote continuers must appear consecutively if (_quotationMarkResolverState.AreMoreThanNQuotesOpen(1)) { if (nextMatch == null || nextMatch.StartIndex != quotationMarkMatch.EndIndex) @@ -194,11 +194,11 @@ QuotationMarkStringMatch nextMatch if (quotationMarkMatch.StartIndex > 0) return false; - // this has only been observed with guillemets so far + // This has only been observed with guillemets so far if (quotationMarkMatch.QuotationMark != "»") return false; - // check the next quotation 
mark match, since quote continuers must appear consecutively + // Check the next quotation mark match, since quote continuers must appear consecutively if (_quotationMarkResolverState.AreMoreThanNQuotesOpen(1)) { if (nextMatch == null || nextMatch.StartIndex != quotationMarkMatch.EndIndex) @@ -227,7 +227,7 @@ public bool IsOpeningQuotationMark(QuotationMarkStringMatch quotationMarkMatch) if (!_settings.IsValidOpeningQuotationMark(quotationMarkMatch)) return false; - // if the quote is ambiguous, use whitespace as clue + // If the quote is ambiguous, use whitespace as clue if (_settings.IsValidClosingQuotationMark(quotationMarkMatch)) { return ( @@ -244,7 +244,7 @@ public bool IsClosingQuotationMark(QuotationMarkStringMatch quotationMarkMatch) if (!_settings.IsValidClosingQuotationMark(quotationMarkMatch)) return false; - // if the quote is ambiguous, use whitespace as clue + // If the quote is ambiguous, use whitespace as clue if (_settings.IsValidOpeningQuotationMark(quotationMarkMatch)) { return ( @@ -332,13 +332,13 @@ public bool IsApostrophe(QuotationMarkStringMatch quotationMarkMatch, QuotationM return true; } - // potential final s possessive (e.g. Moses') + // Potential final s possessive (e.g. 
Moses') if ( quotationMarkMatch.PreviousCharacterMatches(new Regex(@"s", RegexOptions.Compiled)) && (quotationMarkMatch.HasTrailingWhitespace() || quotationMarkMatch.HasTrailingPunctuation()) ) { - // check whether it could be a closing quotation mark + // Check whether it could be a closing quotation mark if (!_quotationMarkResolverState.HasOpenQuotationMark) return true; if ( @@ -362,7 +362,7 @@ public bool IsApostrophe(QuotationMarkStringMatch quotationMarkMatch, QuotationM } } - // for languages that use apostrophes at teh start and end of words //TODO misspelled comment + // For languages that use apostrophes at teh start and end of words //TODO misspelled comment if ( !_quotationMarkResolverState.HasOpenQuotationMark && quotationMarkMatch.QuotationMark == "'" || _quotationMarkResolverState.HasOpenQuotationMark diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs index bf3ea5fe..f7c78a5e 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs @@ -230,7 +230,7 @@ private void GroupQuotationMarks(List quotationMarks) ) ) { - // handle cases of identical opening/closing marks + // Handle cases of identical opening/closing marks if ( matches1.Count == 2 && _quoteConventions.IsQuotationMarkDirectionAmbiguous(mark1) @@ -241,11 +241,11 @@ private void GroupQuotationMarks(List quotationMarks) continue; } - // skip verses where quotation mark pairs are ambiguous + // Skip verses where quotation mark pairs are ambiguous if (matches1.Count > 1) continue; - // find matching closing marks + // Find matching closing marks foreach ( (string mark2, List matches2) in _groupedQuotationMarks.Select(kvp => (kvp.Key, kvp.Value) diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkMetadata.cs 
b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkMetadata.cs index b58ec1e1..991f546d 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkMetadata.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkMetadata.cs @@ -6,8 +6,8 @@ public class QuotationMarkMetadata public int Depth { get; } public QuotationMarkDirection Direction { get; } public TextSegment TextSegment { get; } - public int StartIndex { get; } - public int EndIndex { get; } + public int StartIndex { get; private set; } + public int EndIndex { get; private set; } public QuotationMarkMetadata( string quotationMark, @@ -26,6 +26,14 @@ int endIndex EndIndex = endIndex; } + public int Length => EndIndex - StartIndex; + + public void ShiftIndices(int shiftAmount) + { + StartIndex += shiftAmount; + EndIndex += shiftAmount; + } + public override bool Equals(object obj) { if (!(obj is QuotationMarkMetadata other)) @@ -56,11 +64,14 @@ public void UpdateQuotationMark(QuoteConvention quoteConvention) { string updatedQuotationMark = quoteConvention.GetExpectedQuotationMark(Depth, Direction); if (updatedQuotationMark.Equals(QuotationMark)) - { return; - } TextSegment.ReplaceSubstring(StartIndex, EndIndex, updatedQuotationMark); + + if (updatedQuotationMark.Length != QuotationMark.Length) + { + EndIndex += updatedQuotationMark.Length - QuotationMark.Length; + } } } } diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkStringMatch.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkStringMatch.cs index 117c1f01..97cc99af 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkStringMatch.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkStringMatch.cs @@ -7,7 +7,7 @@ namespace SIL.Machine.Corpora.PunctuationAnalysis { public class QuotationMarkStringMatch { - // No LatinLetterPattern or LetternPattern because C# does not support it in the same way as Python. 
Using UnicodeInfo to mirror machine.py + // No LatinLetterPattern or LetterPattern because C# does not support it in the same way as Python. Using UnicodeInfo to mirror machine.py private static readonly Regex WhitespacePattern = new Regex(@"[\s~]", RegexOptions.Compiled); private static readonly Regex PunctuationPattern = new Regex(@"[\.,;\?!\)\]\-—۔،؛]", RegexOptions.Compiled); private static readonly Regex QuoteIntroducerPattern = new Regex(@"[:,]\s*$", RegexOptions.Compiled); @@ -62,7 +62,7 @@ public string PreviousCharacter { get { - if (StartIndex == 0) + if (IsAtStartOfSegment) { TextSegment previousSegment = TextSegment.PreviousSegment; if (previousSegment != null && !TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph)) @@ -101,14 +101,14 @@ public bool LeadingSubstringMatches(Regex regexPattern) => public bool TrailingSubstringMatches(Regex regexPattern) => regexPattern.IsMatch(TextSegment.SubstringAfter(EndIndex)); - // this assumes that the two matches occur in the same verse + // This assumes that the two matches occur in the same verse public bool Precedes(QuotationMarkStringMatch other) { return TextSegment.IndexInVerse < other.TextSegment.IndexInVerse || (TextSegment.IndexInVerse == other.TextSegment.IndexInVerse && StartIndex < other.StartIndex); } - // not used, but a useful method for debugging + // Not used, but a useful method for debugging public string Context() { int contextStartIndex = Math.Max(StartIndex - 10, 0); diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs index dc059bb3..995cc35c 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Generic; +using System.Text; using SIL.Extensions; namespace SIL.Machine.Corpora.PunctuationAnalysis @@ -82,7 +83,7 @@ public 
double CalculateSimilarity(QuoteConvention quoteConvention) { string expectedQuotationMark = quoteConvention.GetExpectedQuotationMark(depth, direction); - // give higher weight to shallower depths, since deeper marks are more likely to be mistakes + // Give higher weight to shallower depths, since deeper marks are more likely to be mistakes weightedDifference += ( _quotationCountsByDepthAndDirection[(depth, direction)] .CalculateNumDifferences(expectedQuotationMark) * Math.Pow(2, -depth) @@ -95,5 +96,42 @@ public double CalculateSimilarity(QuoteConvention quoteConvention) } return 1 - (weightedDifference / totalWeight); } + + private bool DepthAndDirectionObserved(int depth, QuotationMarkDirection direction) + { + return _quotationCountsByDepthAndDirection.ContainsKey((depth, direction)); + } + + private ( + string openingQuotationMark, + int observedOpeningCount, + int totalOpeningCount + ) FindMostCommonQuotationMarkWithDepthAndDirection(int depth, QuotationMarkDirection direction) + { + return _quotationCountsByDepthAndDirection[(depth, direction)].FindBestQuotationMarkProportion(); + } + + public string GetSummaryMessage() + { + var message = new StringBuilder(); + for (int depth = 1; depth < 5; depth++) + { + (string openingQuotationMark, int observedOpeningCount, int totalOpeningCount) = + FindMostCommonQuotationMarkWithDepthAndDirection(depth, QuotationMarkDirection.Opening); + (string closingQuotationMark, int observedClosingCount, int totalClosingCount) = + FindMostCommonQuotationMarkWithDepthAndDirection(depth, QuotationMarkDirection.Closing); + + if ( + DepthAndDirectionObserved(depth, QuotationMarkDirection.Opening) + && DepthAndDirectionObserved(depth, QuotationMarkDirection.Closing) + ) + { + message.AppendLine( + $"The most common level {depth} quotation marks are {openingQuotationMark} ({observedOpeningCount} of {totalOpeningCount} opening marks) and {closingQuotationMark} ({observedClosingCount} of {totalClosingCount} closing marks)" + ); + } + } + 
return message.ToString(); + } } } diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConvention.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConvention.cs index e31520e3..9e7c02be 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConvention.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConvention.cs @@ -100,12 +100,24 @@ public bool IncludesClosingQuotationMark(string closingQuotationMark) public HashSet GetPossibleDepths(string quotationMark, QuotationMarkDirection direction) { var depths = new HashSet(); - foreach ((int depth, SingleLevelQuoteConvention level) in LevelConventions.Select((l, i) => (i + 1, l))) + foreach ( + (int depth, SingleLevelQuoteConvention levelConvention) in LevelConventions.Select((l, i) => (i + 1, l)) + ) { - if (direction == QuotationMarkDirection.Opening && level.OpeningQuotationMark == quotationMark) + if ( + direction == QuotationMarkDirection.Opening + && levelConvention.OpeningQuotationMark == quotationMark + ) + { depths.Add(depth); - else if (direction == QuotationMarkDirection.Closing && level.ClosingQuotationMark == quotationMark) + } + else if ( + direction == QuotationMarkDirection.Closing + && levelConvention.ClosingQuotationMark == quotationMark + ) + { depths.Add(depth); + } } return depths; } diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetector.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetector.cs index dcef862b..ef5830cd 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetector.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetector.cs @@ -5,13 +5,19 @@ namespace SIL.Machine.Corpora.PunctuationAnalysis { public class QuoteConventionAnalysis { - public QuoteConvention BestQuoteConvention { get; set; } - public double BestQuoteConventionScore { get; set; } + public QuoteConvention BestQuoteConvention { get; private set; } + public double BestQuoteConventionScore { get; private set; } + 
public string AnalysisSummary { get; private set; } - public QuoteConventionAnalysis(QuoteConvention bestQuoteConvention, double bestQuoteConventionScore) + public QuoteConventionAnalysis( + QuoteConvention bestQuoteConvention, + double bestQuoteConventionScore, + string analysisSummary + ) { BestQuoteConvention = bestQuoteConvention; BestQuoteConventionScore = bestQuoteConventionScore; + AnalysisSummary = analysisSummary; } } @@ -27,12 +33,12 @@ public QuoteConventionDetector() private void CountQuotationMarksInChapters(List chapters) { - QuoteConventionSet possibleQuoteConvetions = new PreliminaryQuotationMarkAnalyzer( + QuoteConventionSet possibleQuoteConventions = new PreliminaryQuotationMarkAnalyzer( StandardQuoteConventions.QuoteConventions ).NarrowDownPossibleQuoteConventions(chapters); foreach (Chapter chapter in chapters) - CountQuotationMarksInChapter(chapter, possibleQuoteConvetions); + CountQuotationMarksInChapter(chapter, possibleQuoteConventions); } private void CountQuotationMarksInChapter(Chapter chapter, QuoteConventionSet possibleQuoteConventions) @@ -58,7 +64,13 @@ public QuoteConventionAnalysis DetectQuotationConvention() StandardQuoteConventions.QuoteConventions.FindMostSimilarConvention(_quotationMarkTabulator); if (score > 0 && bestQuoteConvention != null) - return new QuoteConventionAnalysis(bestQuoteConvention, score); + { + return new QuoteConventionAnalysis( + bestQuoteConvention, + score, + _quotationMarkTabulator.GetSummaryMessage() + ); + } return null; } } diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmStructureExtractor.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmStructureExtractor.cs index 5e743dc7..98cc4bcf 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmStructureExtractor.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmStructureExtractor.cs @@ -106,7 +106,7 @@ public void Text(UsfmParserState state, string text) { _nextTextSegmentBuilder.SetText(text); TextSegment textSegment = 
_nextTextSegmentBuilder.Build(); - // don't look past verse boundaries, to enable identical functionality in the + // Don't look past verse boundaries, to enable identical functionality in the // online one-verse-at-a-time (QuotationMarkDenormalizationScriptureUpdateBlockHandler) // and offline whole-book-at-once settings (QuoteConventionDetector) if (_textSegments.Count > 0 && !textSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse)) diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs index 60df3755..1a764376 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs @@ -24,7 +24,7 @@ QuoteConvention targetQuoteConvention new QuoteConventionSet(new List { sourceQuoteConvention, targetQuoteConvention }) ); _quotationMarkResolver = new DepthBasedQuotationMarkResolver( - new QuotationMarkUpdateResolutionSettings(sourceQuoteConvention, targetQuoteConvention) + new QuotationMarkUpdateResolutionSettings(sourceQuoteConvention) ); WillFallbackModeWork = CheckWhetherFallbackModeWillWork(sourceQuoteConvention, targetQuoteConvention); } @@ -69,7 +69,7 @@ public QuotationMarkUpdateStrategy FindBestStrategyForChapter(Chapter chapter) _quotationMarkResolver.Reset(); - // use ToList() to force evaluation of the generator + // Use ToList() to force evaluation of the generator _quotationMarkResolver.ResolveQuotationMarks(quotationMarkMatches).ToList(); return ChooseBestStrategyBasedOnObservedIssues(_quotationMarkResolver.GetIssues()); diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs index 6713c56e..33c065f7 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs @@ -7,16 +7,11 @@ public class QuotationMarkUpdateResolutionSettings : 
IQuotationMarkResolutionSet { private readonly QuoteConvention _sourceQuoteConvention; private readonly QuoteConventionSet _quoteConventionSingletonSet; - private readonly QuoteConvention _targetQuoteConvention; - public QuotationMarkUpdateResolutionSettings( - QuoteConvention sourceQuoteConvention, - QuoteConvention targetQuoteConvention - ) + public QuotationMarkUpdateResolutionSettings(QuoteConvention sourceQuoteConvention) { _sourceQuoteConvention = sourceQuoteConvention; _quoteConventionSingletonSet = new QuoteConventionSet(new List { sourceQuoteConvention }); - _targetQuoteConvention = targetQuoteConvention; } public bool AreMarksAValidPair(string openingMark, string closingMark) diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateSettings.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateSettings.cs index 3394850f..fc8b50fb 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateSettings.cs +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateSettings.cs @@ -8,12 +8,12 @@ public class QuotationMarkUpdateSettings private readonly List _chapterActions; public QuotationMarkUpdateSettings( - QuotationMarkUpdateStrategy defaultChapterAction = QuotationMarkUpdateStrategy.ApplyFull, - List chapterActions = null + QuotationMarkUpdateStrategy defaultChapterStrategy = QuotationMarkUpdateStrategy.ApplyFull, + List chapterStrategies = null ) { - _defaultChapterAction = defaultChapterAction; - _chapterActions = chapterActions ?? new List(); + _defaultChapterAction = defaultChapterStrategy; + _chapterActions = chapterStrategies ?? 
new List(); } public QuotationMarkUpdateStrategy GetActionForChapter(int chapterNumber) diff --git a/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs index 83f89169..d7b9f289 100644 --- a/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs @@ -1,4 +1,5 @@ using System.Collections.Generic; +using System.Linq; using SIL.Machine.Corpora.PunctuationAnalysis; namespace SIL.Machine.Corpora @@ -34,8 +35,7 @@ QuotationMarkUpdateSettings settings _nextScriptureTextSegmentBuilder = new TextSegment.Builder(); IQuotationMarkResolutionSettings resolutionSettings = new QuotationMarkUpdateResolutionSettings( - sourceQuoteConvention, - targetQuoteConvention + sourceQuoteConvention ); // Each embed represents a separate context for quotation marks @@ -95,14 +95,10 @@ IQuotationMarkResolver quotationMarkResolver List textSegments = CreateTextSegments(element); List quotationMarkMatches = _quotationMarkFinder.FindAllPotentialQuotationMarksInTextSegments(textSegments); - foreach ( - QuotationMarkMetadata resolvedQuotationMark in quotationMarkResolver.ResolveQuotationMarks( - quotationMarkMatches - ) - ) - { - resolvedQuotationMark.UpdateQuotationMark(_targetQuoteConvention); - } + List resolvedQuotationMarkMatches = quotationMarkResolver + .ResolveQuotationMarks(quotationMarkMatches) + .ToList(); + UpdateQuotationMarks(resolvedQuotationMarkMatches); } protected List CreateTextSegments(UsfmUpdateBlockElement element) @@ -134,6 +130,40 @@ protected List CreateTextSegments(UsfmUpdateBlockElement element) return SetPreviousAndNextForSegments(textSegments); } + public void UpdateQuotationMarks(List resolvedQuotationMarkMatches) + { + foreach ( + ( + int quotationMarkIndex, + QuotationMarkMetadata resolvedQuotationMarkMatch + ) in resolvedQuotationMarkMatches.Select((r, i) => (i, r)) + ) + { + 
int previousLength = resolvedQuotationMarkMatch.Length; + resolvedQuotationMarkMatch.UpdateQuotationMark(_targetQuoteConvention); + int updatedLength = resolvedQuotationMarkMatch.Length; + + if (previousLength != updatedLength) + { + ShiftQuotationMarkMetadataIndices( + resolvedQuotationMarkMatches.Skip(quotationMarkIndex + 1).ToList(), + updatedLength - previousLength + ); + } + } + } + + private void ShiftQuotationMarkMetadataIndices( + List quotationMarkMetadataList, + int shiftAmount + ) + { + foreach (QuotationMarkMetadata quotationMarkMetadata in quotationMarkMetadataList) + { + quotationMarkMetadata.ShiftIndices(shiftAmount); + } + } + protected TextSegment CreateTextSegment(UsfmToken token) { TextSegment textSegmentToReturn = null; @@ -165,15 +195,15 @@ protected void CheckForChapterChange(UsfmUpdateBlock block) { if (scriptureRef.ChapterNum != _currentChapterNumber) { - _currentChapterNumber = scriptureRef.ChapterNum; - StartNewChapter(_currentChapterNumber); + StartNewChapter(scriptureRef.ChapterNum); } } } - protected void StartNewChapter(int newChapterNum) + protected void StartNewChapter(int newChapterNumber) { - _currentStrategy = _settings.GetActionForChapter(newChapterNum); + _currentChapterNumber = newChapterNumber; + _currentStrategy = _settings.GetActionForChapter(newChapterNumber); _verseTextQuotationMarkResolver.Reset(); _nextScriptureTextSegmentBuilder = new TextSegment.Builder(); _nextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Chapter); @@ -185,14 +215,14 @@ private void CheckForVerseChange(UsfmUpdateBlock block) { if (scriptureRef.ChapterNum == _currentChapterNumber && scriptureRef.VerseNum != _currentVerseNumber) { - _currentVerseNumber = scriptureRef.VerseNum; - StartNewVerse(); + StartNewVerse(scriptureRef.VerseNum); } } } - private void StartNewVerse() + private void StartNewVerse(int newVerseNumber) { + _currentVerseNumber = newVerseNumber; _nextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Verse); } } 
diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs index eea2f897..dbc29c01 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs @@ -415,7 +415,7 @@ public void IsEnglishQuotationContinuer() ); var categorizerForDenorm = new QuotationMarkCategorizer( - new QuotationMarkUpdateResolutionSettings(standardEnglish, standardEnglish), + new QuotationMarkUpdateResolutionSettings(standardEnglish), resolverState, continuerState ); @@ -720,7 +720,7 @@ public void IsSpanishQuotationContinuer() ); var categorizerForDenorm = new QuotationMarkCategorizer( - new QuotationMarkUpdateResolutionSettings(westernEuropeanQuoteConvention, westernEuropeanQuoteConvention), + new QuotationMarkUpdateResolutionSettings(westernEuropeanQuoteConvention), resolverState, continuerState ); diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkMetadataTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkMetadataTests.cs index 12737c5d..9c195afe 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkMetadataTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkMetadataTests.cs @@ -56,7 +56,7 @@ public void UpdateQuotationMarkWithMultiCharacterQuotationMarks() quotationMarkMetadata.UpdateQuotationMark(GetQuoteConventionByName("typewriter_french")); Assert.That(quotationMarkMetadata.TextSegment.Text, Is.EqualTo("He said to the woman, <?>> + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" + ); + + var observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_french", + "standard_english", + new QuotationMarkUpdateSettings(defaultChapterStrategy: QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void MultiCharacterQuotationMarksInTargetQuoteConvention() + { + var normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + "; + var expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, <?>>" + ); + + var observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "standard_english", + "typewriter_french", + new QuotationMarkUpdateSettings(defaultChapterStrategy: QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + [Test] public void ProcessScriptureElement() { @@ -589,6 +643,128 @@ public void SetPreviousAndNextForSegments() Assert.IsNull(segments[2].NextSegment); } + [Test] + public void UpdateQuotationMarks() + { + QuoteConventionChangingUsfmUpdateBlockHandler multiCharToSingleCharQuoteConventionChanger = + CreateQuoteConventionChangingUsfmUpdateBlockHandler("typewriter_french", "standard_english"); + + TextSegment multiCharacterTextSegment = new TextSegment.Builder() + .SetText("this < >>") + .Build(); + + List multiCharacterQuotationMarks = + [ + new QuotationMarkMetadata( + quotationMark: "<<", + depth: 1, + direction: QuotationMarkDirection.Opening, + textSegment: multiCharacterTextSegment, + startIndex: 5, + endIndex: 7 + ), + new QuotationMarkMetadata( + quotationMark: "<", + depth: 2, + direction: QuotationMarkDirection.Opening, + textSegment: 
multiCharacterTextSegment, + startIndex: 10, + endIndex: 11 + ), + new QuotationMarkMetadata( + quotationMark: ">", + depth: 2, + direction: QuotationMarkDirection.Closing, + textSegment: multiCharacterTextSegment, + startIndex: 25, + endIndex: 26 + ), + new QuotationMarkMetadata( + quotationMark: ">>", + depth: 1, + direction: QuotationMarkDirection.Closing, + textSegment: multiCharacterTextSegment, + startIndex: 27, + endIndex: 29 + ) + ]; + + multiCharToSingleCharQuoteConventionChanger.UpdateQuotationMarks(multiCharacterQuotationMarks); + + Assert.That(multiCharacterTextSegment.Text, Is.EqualTo("this “is ‘a test segment’ ”")); + Assert.That(multiCharacterQuotationMarks[0].StartIndex, Is.EqualTo(5)); + Assert.That(multiCharacterQuotationMarks[0].EndIndex, Is.EqualTo(6)); + Assert.That(multiCharacterQuotationMarks[0].TextSegment, Is.EqualTo(multiCharacterTextSegment)); + Assert.That(multiCharacterQuotationMarks[1].StartIndex, Is.EqualTo(9)); + Assert.That(multiCharacterQuotationMarks[1].EndIndex, Is.EqualTo(10)); + Assert.That(multiCharacterQuotationMarks[1].TextSegment, Is.EqualTo(multiCharacterTextSegment)); + Assert.That(multiCharacterQuotationMarks[2].StartIndex, Is.EqualTo(24)); + Assert.That(multiCharacterQuotationMarks[2].EndIndex, Is.EqualTo(25)); + Assert.That(multiCharacterQuotationMarks[2].TextSegment, Is.EqualTo(multiCharacterTextSegment)); + Assert.That(multiCharacterQuotationMarks[3].StartIndex, Is.EqualTo(26)); + Assert.That(multiCharacterQuotationMarks[3].EndIndex, Is.EqualTo(27)); + Assert.That(multiCharacterQuotationMarks[3].TextSegment, Is.EqualTo(multiCharacterTextSegment)); + + QuoteConventionChangingUsfmUpdateBlockHandler singleCharToMultiCharQuoteConventionChanger = + CreateQuoteConventionChangingUsfmUpdateBlockHandler("typewriter_french", "standard_english"); + + TextSegment singleCharacterTextSegment = new TextSegment.Builder() + .SetText("this “is ‘a test segment’ ”") + .Build(); + + List singleCharacterQuotationMarks = + [ + new 
QuotationMarkMetadata( + quotationMark: "“", + depth: 1, + direction: QuotationMarkDirection.Opening, + textSegment: singleCharacterTextSegment, + startIndex: 5, + endIndex: 6 + ), + new QuotationMarkMetadata( + quotationMark: "‘", + depth: 2, + direction: QuotationMarkDirection.Opening, + textSegment: singleCharacterTextSegment, + startIndex: 9, + endIndex: 10 + ), + new QuotationMarkMetadata( + quotationMark: "’", + depth: 2, + direction: QuotationMarkDirection.Closing, + textSegment: singleCharacterTextSegment, + startIndex: 24, + endIndex: 25 + ), + new QuotationMarkMetadata( + quotationMark: "”", + depth: 1, + direction: QuotationMarkDirection.Closing, + textSegment: singleCharacterTextSegment, + startIndex: 26, + endIndex: 27 + ) + ]; + + singleCharToMultiCharQuoteConventionChanger.UpdateQuotationMarks(singleCharacterQuotationMarks); + + Assert.That(singleCharacterTextSegment.Text, Is.EqualTo("this < >>")); + Assert.That(singleCharacterQuotationMarks[0].StartIndex, Is.EqualTo(5)); + Assert.That(singleCharacterQuotationMarks[0].EndIndex, Is.EqualTo(7)); + Assert.That(singleCharacterQuotationMarks[0].TextSegment, Is.EqualTo(singleCharacterTextSegment)); + Assert.That(singleCharacterQuotationMarks[1].StartIndex, Is.EqualTo(10)); + Assert.That(singleCharacterQuotationMarks[1].EndIndex, Is.EqualTo(11)); + Assert.That(singleCharacterQuotationMarks[1].TextSegment, Is.EqualTo(singleCharacterTextSegment)); + Assert.That(multiCharacterQuotationMarks[2].StartIndex, Is.EqualTo(25)); + Assert.That(singleCharacterQuotationMarks[2].EndIndex, Is.EqualTo(26)); + Assert.That(singleCharacterQuotationMarks[2].TextSegment, Is.EqualTo(singleCharacterTextSegment)); + Assert.That(singleCharacterQuotationMarks[3].StartIndex, Is.EqualTo(27)); + Assert.That(singleCharacterQuotationMarks[3].EndIndex, Is.EqualTo(29)); + Assert.That(singleCharacterQuotationMarks[3].TextSegment, Is.EqualTo(singleCharacterTextSegment)); + } + [Test] public void CheckForChapterChange() { @@ -617,7 +793,7 @@ 
public void StartNewChapter() "standard_english", "standard_english", new QuotationMarkUpdateSettings( - chapterActions: + chapterStrategies: [ QuotationMarkUpdateStrategy.Skip, QuotationMarkUpdateStrategy.ApplyFull, From d209551319d021c522252e462b0777f4e24a2ed6 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 28 Jul 2025 16:16:10 -0400 Subject: [PATCH 10/28] Port better guessing for ambiguous quotation marks --- .../Corpora/FallbackQuotationMarkResolver.cs | 13 +++++++++++++ .../DepthBasedQuotationMarkResolver.cs | 2 ++ 2 files changed, 15 insertions(+) diff --git a/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs b/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs index 7b9b4b4e..f9b4af83 100644 --- a/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs +++ b/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs @@ -64,6 +64,19 @@ public IEnumerable ResolveQuotationMark(QuotationMarkStri } else { + // Make a reasonable guess about the direction of the quotation mark + if (_lastQuotationMark == null || _lastQuotationMark.Direction == QuotationMarkDirection.Closing) + { + QuotationMarkMetadata quotationMark = ResolveOpeningMark(quotationMarkMatch); + if (quotationMark != null) + yield return quotationMark; + } + else + { + QuotationMarkMetadata quotationMark = ResolveClosingMark(quotationMarkMatch); + if (quotationMark != null) + yield return quotationMark; + } _issues.Add(QuotationMarkResolutionIssue.AmbiguousQuotationMark); } } diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs index ccc59278..1ca3fbcc 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs @@ -210,6 +210,8 @@ QuotationMarkStringMatch nextMatch private bool MeetsQuoteContinuerPrerequisites(QuotationMarkStringMatch quotationMarkMatch) 
{ + if (_quoteContinuerState.CurrentDepth >= _quotationMarkResolverState.CurrentDepth) + return false; if ( _settings.ShouldRelyOnParagraphMarkers() && !quotationMarkMatch.TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph) From 347e14cae44d5d2db2304e6aaf4ebe0bd44086a3 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 28 Jul 2025 16:51:09 -0400 Subject: [PATCH 11/28] Remove TODOs --- .../DepthBasedQuotationMarkResolver.cs | 2 +- .../Corpora/QuotationMarkUpdateFirstPass.cs | 2 +- .../DepthBasedQuotationMarkResolverTests.cs | 3 +-- .../QuotationMarkStringMatchTests.cs | 2 +- .../QuoteConventionTests.cs | 20 ------------------- .../PunctuationAnalysis/TextSegmentTests.cs | 4 ++-- .../Corpora/QuotationDenormalizationTests.cs | 2 +- 7 files changed, 7 insertions(+), 28 deletions(-) diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs index 1ca3fbcc..a7482bb2 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs @@ -364,7 +364,7 @@ public bool IsApostrophe(QuotationMarkStringMatch quotationMarkMatch, QuotationM } } - // For languages that use apostrophes at teh start and end of words //TODO misspelled comment + // For languages that use apostrophes at the start and end of words if ( !_quotationMarkResolverState.HasOpenQuotationMark && quotationMarkMatch.QuotationMark == "'" || _quotationMarkResolverState.HasOpenQuotationMark diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs index 1a764376..dbc2a6fb 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs @@ -45,7 +45,7 @@ QuoteConvention targetQuoteConvention } if (depth <= targetQuoteConvention.NumLevels) { - 
marks.Add(targetQuoteConvention.GetClosingQuotationMarkAtDepth(depth)); //TODO could cut this loop short + marks.Add(targetQuoteConvention.GetClosingQuotationMarkAtDepth(depth)); } } diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs index dbc29c01..7775e9a4 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs @@ -1239,7 +1239,7 @@ public void IsOpeningQuote() standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) ) - ); //TODO 0,1 not 1,2? + ); Assert.IsTrue( standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 1, 2) @@ -3145,7 +3145,6 @@ public void TooDeepNestingIssue() new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, textSegment, 6, 7), new QuotationMarkMetadata("\u201c", 3, QuotationMarkDirection.Opening, textSegment, 10, 11), new QuotationMarkMetadata("\u2018", 4, QuotationMarkDirection.Opening, textSegment, 13, 14), - // new QuotationMarkMetadata("\u201c", 5, QuotationMarkDirection.Opening, textSegment, 20, 21), //TODO Why commented out? 
] ) ); diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkStringMatchTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkStringMatchTests.cs index 60f48199..d0002f08 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkStringMatchTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkStringMatchTests.cs @@ -267,7 +267,7 @@ public void DoesTrailingSubstringMatch() new TextSegment.Builder().SetText("sample text").Build(), 11, 11 - ); //TODO 12 does not exist? + ); Assert.IsFalse(quotationMarkStringMatch.TrailingSubstringMatches(new Regex(@".+", RegexOptions.Compiled))); quotationMarkStringMatch = new QuotationMarkStringMatch( diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionTests.cs index a6c11005..53f8c532 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionTests.cs @@ -419,24 +419,4 @@ public void Normalize() Is.EqualTo("\"") ); } - - // [Test] - // public void PrintSummary() - // { - // var quoteConvention = new QuoteConvention( - // "test-quote-convention", //TODO why kebab case? 
- // [ - // new SingleLevelQuoteConvention("\u201c", "\u201D"), - // new SingleLevelQuoteConvention("\u2018", "\u2019"), - // new SingleLevelQuoteConvention("\u201D", "\u201D"), - // ] - // ); - // var expectedSummaryMessage = ( - // "test-quote-convention\n" - // + "\u201CFirst-level quote\u201D\n" - // + "\u2018Second-level quote\u2019\n" - // + "\u201DThird-level quote\u201D\n" - // ); - // Assert.That(quoteConvention.ToString(), Is.EqualTo(expectedSummaryMessage)); - // } } diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/TextSegmentTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/TextSegmentTests.cs index 1722feaa..d96d8cbd 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/TextSegmentTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/TextSegmentTests.cs @@ -92,8 +92,8 @@ public void Equals() var sameTextSegment = new TextSegment.Builder().SetText("text1").Build(); var differentTextSegment = new TextSegment.Builder().SetText("different text").Build(); - // Assert.That(basicSegment, Is.EqualTo(basicSegment)); //TODO fix - // Assert.That(basicSegment , Is.Not.EqualTo(new UsfmToken("text1"))); //TODO also here + // Assert.That(basicSegment, Is.EqualTo(basicSegment)); + // Assert.That(basicSegment , Is.Not.EqualTo(new UsfmToken("text1"))); Assert.That(basicSegment, Is.EqualTo(sameTextSegment)); Assert.That(basicSegment, Is.Not.EqualTo(differentTextSegment)); diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs index 64e854a2..4f2fd136 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs @@ -55,6 +55,6 @@ of the field which Yahweh God had made. 
var actualDenormalizedUsfm = updater.GetUsfm(); - Assert.That(actualDenormalizedUsfm, Is.EqualTo(expectedDenormalizedUsfm).IgnoreLineEndings()); //TODO use ignore_line_endings + Assert.That(actualDenormalizedUsfm, Is.EqualTo(expectedDenormalizedUsfm).IgnoreLineEndings()); } } From 6e4798b6438e18f96571b155323ec349828f4154 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 28 Jul 2025 18:33:17 -0400 Subject: [PATCH 12/28] Passing tests --- .../Corpora/FallbackQuotationMarkResolver.cs | 44 +- .../QuotationMarkTabulator.cs | 4 +- .../FallbackQuotationMarkResolverTests.cs | 476 ++++++++++++++++++ ...cs => QuotationConventionDetectorTests.cs} | 2 +- ...tionChangingUsfmBlockUpdateHandlerTests.cs | 4 +- 5 files changed, 504 insertions(+), 26 deletions(-) create mode 100644 tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs rename tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/{QuoteConventionDetectorTests.cs => QuotationConventionDetectorTests.cs} (99%) diff --git a/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs b/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs index f9b4af83..65c753c8 100644 --- a/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs +++ b/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs @@ -7,20 +7,20 @@ namespace SIL.Machine.Corpora public class FallbackQuotationMarkResolver : IQuotationMarkResolver { private readonly IQuotationMarkResolutionSettings _settings; - private QuotationMarkMetadata _lastQuotationMark; - private readonly HashSet _issues; + public QuotationMarkMetadata LastQuotationMark { get; set; } + public HashSet Issues { get; } public FallbackQuotationMarkResolver(IQuotationMarkResolutionSettings settings) { _settings = settings; - _lastQuotationMark = null; - _issues = new HashSet(); + LastQuotationMark = null; + Issues = new HashSet(); } public void Reset() { - _lastQuotationMark = null; - _issues.Clear(); + LastQuotationMark = null; + Issues.Clear(); } public IEnumerable 
ResolveQuotationMarks( @@ -47,7 +47,7 @@ public IEnumerable ResolveQuotationMark(QuotationMarkStri } else { - _issues.Add(QuotationMarkResolutionIssue.UnexpectedQuotationMark); + Issues.Add(QuotationMarkResolutionIssue.UnexpectedQuotationMark); } } else if (IsClosingQuotationMark(quotationMarkMatch)) @@ -59,13 +59,13 @@ public IEnumerable ResolveQuotationMark(QuotationMarkStri } else { - _issues.Add(QuotationMarkResolutionIssue.UnexpectedQuotationMark); + Issues.Add(QuotationMarkResolutionIssue.UnexpectedQuotationMark); } } else { // Make a reasonable guess about the direction of the quotation mark - if (_lastQuotationMark == null || _lastQuotationMark.Direction == QuotationMarkDirection.Closing) + if (LastQuotationMark == null || LastQuotationMark.Direction == QuotationMarkDirection.Closing) { QuotationMarkMetadata quotationMark = ResolveOpeningMark(quotationMarkMatch); if (quotationMark != null) @@ -77,11 +77,11 @@ public IEnumerable ResolveQuotationMark(QuotationMarkStri if (quotationMark != null) yield return quotationMark; } - _issues.Add(QuotationMarkResolutionIssue.AmbiguousQuotationMark); + Issues.Add(QuotationMarkResolutionIssue.AmbiguousQuotationMark); } } - private bool IsOpeningQuotationMark(QuotationMarkStringMatch match) + public bool IsOpeningQuotationMark(QuotationMarkStringMatch match) { if (_settings.IsValidOpeningQuotationMark(match) && _settings.IsValidClosingQuotationMark(match)) { @@ -100,19 +100,19 @@ private bool IsOpeningQuotationMark(QuotationMarkStringMatch match) return false; } - private bool DoesMostRecentOpeningMarkImmediatelyPrecede(QuotationMarkStringMatch match) + public bool DoesMostRecentOpeningMarkImmediatelyPrecede(QuotationMarkStringMatch match) { - if (_lastQuotationMark == null || _lastQuotationMark.Direction != QuotationMarkDirection.Opening) + if (LastQuotationMark == null || LastQuotationMark.Direction != QuotationMarkDirection.Opening) { return false; } - return _lastQuotationMark.TextSegment == match.TextSegment - && 
_lastQuotationMark.EndIndex == match.StartIndex; + return LastQuotationMark.TextSegment.Equals(match.TextSegment) + && LastQuotationMark.EndIndex == match.StartIndex; } - private bool IsClosingQuotationMark(QuotationMarkStringMatch match) + public bool IsClosingQuotationMark(QuotationMarkStringMatch match) { - if (_settings.IsValidClosingQuotationMark(match) && _settings.IsValidClosingQuotationMark(match)) + if (_settings.IsValidOpeningQuotationMark(match) && _settings.IsValidClosingQuotationMark(match)) { return (match.HasTrailingWhitespace() || match.HasTrailingPunctuation() || match.IsAtEndOfSegment) && !match.HasLeadingWhitespace(); @@ -125,7 +125,7 @@ private bool IsClosingQuotationMark(QuotationMarkStringMatch match) return false; } - private QuotationMarkMetadata ResolveOpeningMark(QuotationMarkStringMatch quotationMarkMatch) + public QuotationMarkMetadata ResolveOpeningMark(QuotationMarkStringMatch quotationMarkMatch) { HashSet possibleDepths = _settings.GetPossibleDepths( quotationMarkMatch.QuotationMark, @@ -138,11 +138,11 @@ private QuotationMarkMetadata ResolveOpeningMark(QuotationMarkStringMatch quotat possibleDepths.Min(), QuotationMarkDirection.Opening ); - _lastQuotationMark = quotationMark; + LastQuotationMark = quotationMark; return quotationMark; } - private QuotationMarkMetadata ResolveClosingMark(QuotationMarkStringMatch quotationMarkMatch) + public QuotationMarkMetadata ResolveClosingMark(QuotationMarkStringMatch quotationMarkMatch) { HashSet possibleDepths = _settings.GetPossibleDepths( quotationMarkMatch.QuotationMark, @@ -155,13 +155,13 @@ private QuotationMarkMetadata ResolveClosingMark(QuotationMarkStringMatch quotat possibleDepths.Min(), QuotationMarkDirection.Closing ); - _lastQuotationMark = quote; + LastQuotationMark = quote; return quote; } public HashSet GetIssues() { - return _issues; + return Issues; } } } diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs 
b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs index 995cc35c..7fd91b27 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs @@ -108,7 +108,9 @@ private bool DepthAndDirectionObserved(int depth, QuotationMarkDirection directi int totalOpeningCount ) FindMostCommonQuotationMarkWithDepthAndDirection(int depth, QuotationMarkDirection direction) { - return _quotationCountsByDepthAndDirection[(depth, direction)].FindBestQuotationMarkProportion(); + return _quotationCountsByDepthAndDirection.TryGetValue((depth, direction), out QuotationMarkCounts counts) + ? counts.FindBestQuotationMarkProportion() + : (null, 0, 0); } public string GetSummaryMessage() diff --git a/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs b/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs new file mode 100644 index 00000000..ac0bd815 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs @@ -0,0 +1,476 @@ +using NUnit.Framework; +using SIL.Machine.Corpora.PunctuationAnalysis; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class FallbackQuotationMarkResolverTests +{ + [Test] + public void Reset() + { + var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + "standard_english" + ); + Assert.IsNotNull(englishQuoteConvention); + + var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( + new QuotationMarkUpdateResolutionSettings(englishQuoteConvention) + ); + + basicQuotationMarkResolver.LastQuotationMark = new QuotationMarkMetadata( + "\"", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().SetText("\"'test text\"").Build(), + 0, + 1 + ); + basicQuotationMarkResolver.Issues.Add(QuotationMarkResolutionIssue.UnexpectedQuotationMark); + + basicQuotationMarkResolver.Reset(); + 
Assert.IsNull(basicQuotationMarkResolver.LastQuotationMark); + Assert.That(basicQuotationMarkResolver.Issues.Count, Is.EqualTo(0)); + } + + [Test] + public void SimpleQuotationMarkResolutionWithNoPreviousMark() + { + var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + "standard_english" + ); + Assert.IsNotNull(englishQuoteConvention); + + var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( + new QuotationMarkUpdateResolutionSettings(englishQuoteConvention.Normalize()) + ); + + var actualResolvedQuotationMarks = basicQuotationMarkResolver + .ResolveQuotationMarks( + [new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test \" text").Build(), 5, 6),] + ) + .ToList(); + List expectedResolvedQuotationMarks = + [ + new QuotationMarkMetadata( + "\"", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().SetText("test \" text").Build(), + 5, + 6 + ) + ]; + + AssertResolvedQuotationMarksEqual(actualResolvedQuotationMarks, expectedResolvedQuotationMarks); + } + + [Test] + public void SimpleQuotationMarkResolutionWithPreviousOpeningMark() + { + var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + "standard_english" + ); + Assert.IsNotNull(englishQuoteConvention); + + var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( + new QuotationMarkUpdateResolutionSettings(englishQuoteConvention.Normalize()) + ); + + var actualResolvedQuotationMarks = basicQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"test \" text").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"test \" text").Build(), 6, 7), + ] + ) + .ToList(); + List expectedResolvedQuotationMarks = + [ + new QuotationMarkMetadata( + "\"", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().SetText("\"test \" text").Build(), + 0, + 1 + ), + new 
QuotationMarkMetadata( + "\"", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().SetText("\"test \" text").Build(), + 6, + 7 + ), + ]; + + AssertResolvedQuotationMarksEqual(actualResolvedQuotationMarks, expectedResolvedQuotationMarks); + } + + [Test] + public void SimpleQuotationMarkResolutionWithPreviousClosingMark() + { + var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + "standard_english" + ); + Assert.IsNotNull(englishQuoteConvention); + + var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( + new QuotationMarkUpdateResolutionSettings(englishQuoteConvention.Normalize()) + ); + + var actualResolvedQuotationMarks = basicQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test\" \" text").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test\" \" text").Build(), 6, 7), + ] + ) + .ToList(); + List expectedResolvedQuotationMarks = + [ + new QuotationMarkMetadata( + "\"", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().SetText("test\" \" text").Build(), + 4, + 5 + ), + new QuotationMarkMetadata( + "\"", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().SetText("test\" \" text").Build(), + 6, + 7 + ) + ]; + + AssertResolvedQuotationMarksEqual(actualResolvedQuotationMarks, expectedResolvedQuotationMarks); + } + + [Test] + public void IsOpeningQuote() + { + var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + "standard_english" + ); + Assert.IsNotNull(englishQuoteConvention); + + var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( + new QuotationMarkUpdateResolutionSettings(englishQuoteConvention.Normalize()) + ); + + // valid opening quote at start of segment + var quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"test text\"").Build(), 0, 1); + 
Assert.IsTrue(basicQuotationMarkResolver.IsOpeningQuotationMark(quoteMatch)); + + // opening quote with leading whitespace + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test \"text\"").Build(), 5, 6); + Assert.IsTrue(basicQuotationMarkResolver.IsOpeningQuotationMark(quoteMatch)); + + // opening quote with quote introducer + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test:\"text\"").Build(), 5, 6); + Assert.IsTrue(basicQuotationMarkResolver.IsOpeningQuotationMark(quoteMatch)); + + // QuotationMarkStringMatch indices don't indicate a quotation mark + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test \"text\"").Build(), 0, 1); + Assert.IsFalse(basicQuotationMarkResolver.IsOpeningQuotationMark(quoteMatch)); + + // the quotation mark is not valid under the current quote convention + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("").Build(), 10, 11); + Assert.IsFalse(basicQuotationMarkResolver.IsClosingQuotationMark(quoteMatch)); + + // no trailing whitespace after quotation mark + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"test\"text").Build(), 5, 6); + Assert.IsFalse(basicQuotationMarkResolver.IsClosingQuotationMark(quoteMatch)); + + // opening quote at the start of the segment + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"test text\"").Build(), 0, 1); + Assert.IsFalse(basicQuotationMarkResolver.IsClosingQuotationMark(quoteMatch)); + + // opening quote with leading whitespace + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test \"text\"").Build(), 5, 6); + Assert.IsFalse(basicQuotationMarkResolver.IsClosingQuotationMark(quoteMatch)); + } + + [Test] + public void IsClosingQuoteWithUnambiguousQuoteConvention() + { + var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + "standard_english" + ); + 
Assert.IsNotNull(englishQuoteConvention); + + var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( + new QuoteConventionDetectionResolutionSettings(new QuoteConventionSet([englishQuoteConvention])) + ); + + // unambiguous closing quote at end of segment + var quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("“test text”").Build(), 10, 11); + Assert.IsTrue(basicQuotationMarkResolver.IsClosingQuotationMark(quoteMatch)); + + // unambiguous closing quote with trailing whitespace + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("“test” text").Build(), 5, 6); + Assert.IsTrue(basicQuotationMarkResolver.IsClosingQuotationMark(quoteMatch)); + + // unambiguous closing quote without the "correct" context + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("“test”text").Build(), 5, 6); + Assert.IsTrue(basicQuotationMarkResolver.IsClosingQuotationMark(quoteMatch)); + + // unambiguous opening quote + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test “text”").Build(), 5, 6); + Assert.IsFalse(basicQuotationMarkResolver.IsClosingQuotationMark(quoteMatch)); + } + + [Test] + public void ResolveOpeningQuote() + { + var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + "standard_english" + ); + Assert.IsNotNull(englishQuoteConvention); + + var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( + new QuotationMarkUpdateResolutionSettings(englishQuoteConvention.Normalize()) + ); + + var expectedResolvedQuotationMark = new QuotationMarkMetadata( + "\"", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().SetText("\"test text\"").Build(), + 0, + 1 + ); + var actualResolvedQuotationMark = basicQuotationMarkResolver.ResolveOpeningMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"test text\"").Build(), 0, 1) + ); + Assert.That(actualResolvedQuotationMark, 
Is.EqualTo(expectedResolvedQuotationMark)); + Assert.That(basicQuotationMarkResolver.LastQuotationMark, Is.EqualTo(actualResolvedQuotationMark)); + } + + [Test] + public void ResolveClosingQuote() + { + var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + "standard_english" + ); + Assert.IsNotNull(englishQuoteConvention); + + var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( + new QuotationMarkUpdateResolutionSettings(englishQuoteConvention.Normalize()) + ); + + var expectedResolvedQuotationMark = new QuotationMarkMetadata( + "\"", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().SetText("\"test text\"").Build(), + 10, + 11 + ); + var actualResolvedQuotationMark = basicQuotationMarkResolver.ResolveClosingMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"test text\"").Build(), 10, 11) + ); + Assert.That(actualResolvedQuotationMark, Is.EqualTo(expectedResolvedQuotationMark)); + } + + public void AssertResolvedQuotationMarksEqual( + List actualResolvedQuotationMarks, + List expectedResolvedQuotationMarks + ) + { + Assert.That(actualResolvedQuotationMarks.Count, Is.EqualTo(expectedResolvedQuotationMarks.Count)); + foreach ( + (QuotationMarkMetadata actualMark, QuotationMarkMetadata expectedMark) in actualResolvedQuotationMarks.Zip( + expectedResolvedQuotationMarks + ) + ) + { + Assert.That(actualMark, Is.EqualTo(expectedMark)); + } + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionDetectorTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationConventionDetectorTests.cs similarity index 99% rename from tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionDetectorTests.cs rename to tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationConventionDetectorTests.cs index d54e9412..9891b6af 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionDetectorTests.cs +++ 
b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationConventionDetectorTests.cs @@ -3,7 +3,7 @@ namespace SIL.Machine.Corpora.PunctuationAnalysis; [TestFixture] -public class QuoteConventionDetectorTests +public class QuotationConventionDetectorTests { // Text comes from the World English Bible, which is in the public domain. [Test] diff --git a/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs index 4129312a..3a2284ad 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs @@ -706,7 +706,7 @@ public void UpdateQuotationMarks() Assert.That(multiCharacterQuotationMarks[3].TextSegment, Is.EqualTo(multiCharacterTextSegment)); QuoteConventionChangingUsfmUpdateBlockHandler singleCharToMultiCharQuoteConventionChanger = - CreateQuoteConventionChangingUsfmUpdateBlockHandler("typewriter_french", "standard_english"); + CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "typewriter_french"); TextSegment singleCharacterTextSegment = new TextSegment.Builder() .SetText("this “is ‘a test segment’ ”") @@ -757,7 +757,7 @@ public void UpdateQuotationMarks() Assert.That(singleCharacterQuotationMarks[1].StartIndex, Is.EqualTo(10)); Assert.That(singleCharacterQuotationMarks[1].EndIndex, Is.EqualTo(11)); Assert.That(singleCharacterQuotationMarks[1].TextSegment, Is.EqualTo(singleCharacterTextSegment)); - Assert.That(multiCharacterQuotationMarks[2].StartIndex, Is.EqualTo(25)); + Assert.That(singleCharacterQuotationMarks[2].StartIndex, Is.EqualTo(25)); Assert.That(singleCharacterQuotationMarks[2].EndIndex, Is.EqualTo(26)); Assert.That(singleCharacterQuotationMarks[2].TextSegment, Is.EqualTo(singleCharacterTextSegment)); Assert.That(singleCharacterQuotationMarks[3].StartIndex, Is.EqualTo(27)); From 
72d82a9b18c4230f0c902d0d12d0008ef3e99ab7 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 28 Jul 2025 18:44:58 -0400 Subject: [PATCH 13/28] Add regions --- .../PreliminaryQuotationMarkAnalyzerTests.cs | 27 +++++++++++-------- .../QuotationMarkTabulatorTests.cs | 4 --- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs index d683399b..97e23bb2 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs @@ -5,7 +5,7 @@ namespace SIL.Machine.Corpora.PunctuationAnalysis; [TestFixture] public class PreliminaryQuotationMarkAnalyzerTests { - // ApostropheProportionStatistics tests + # region ApostropheProportionStatistics [Test] public void ApostropheProportionStatisticsReset() { @@ -39,10 +39,11 @@ public void IsApostropheProportionGreaterThan() apostropheProportionStatistics.CountCharacters(new TextSegment.Builder().SetText("ef").Build()); Assert.IsTrue(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.3)); Assert.IsFalse(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.4)); - - // QuotationMarkWordPosition tests } + #endregion + #region QuotationMarkWordPosition + [Test] public void IsMarkRarelyInitial() { @@ -179,9 +180,9 @@ public void QuotationMarkWordPositionsReset() quotationMarkWordPositions.Reset(); Assert.IsFalse(quotationMarkWordPositions.IsMarkCommonlyMidWord("\u201d")); - - // QuotationMarkSequence tests } + #endregion + #region QuotationMarkSequence [Test] public void IsMarkMuchMoreCommonEarlier() @@ -267,10 +268,11 @@ public void IsMarkCommonEarlyAndLate() quotationMarkSequences.CountLaterQuotationMark("\""); quotationMarkSequences.CountLaterQuotationMark("\""); 
Assert.IsFalse(quotationMarkSequences.AreEarlyAndLateMarkRatesSimilar("\"")); - - // QuotationMarkGrouper tests } + #endregion + #region QuotationMarkGrouper + [Test] public void GetQuotationMarkPairs() { @@ -523,10 +525,11 @@ [new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build( new QuoteConventionSet([typewriterEnglishQuoteConvention]) ); Assert.IsFalse(quotationMarkGrouper.HasDistinctPairedQuotationMark("\"")); - - // PreliminaryApostropheAnalyzer tests } + #endregion + #region PreliminaryApostropheAnalyzer + [Test] public void ThatTheMarkMustBeAnApostrophe() { @@ -890,10 +893,11 @@ [new TextSegment.Builder().SetText("Very short text").Build(),], ] ); Assert.IsTrue(positivePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); - - // PreliminaryQuotationMarkAnalyzer tests } + #endregion + #region PreliminaryQuotationMarkAnalyzer + [Test] public void ThatQuotationMarkSequenceIsUsedToDetermineOpeningAndClosingQuotes() { @@ -1171,4 +1175,5 @@ public void ThatApostrophesNotConsideredAsQuotationMarks() Is.EqualTo(new QuoteConventionSet([standardEnglishQuoteConvention])) ); } + #endregion } diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkTabulatorTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkTabulatorTests.cs index 74358d23..9c52899b 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkTabulatorTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkTabulatorTests.cs @@ -60,8 +60,6 @@ public void CalculateNumDifferences() Assert.That(counts.CalculateNumDifferences("\""), Is.EqualTo(2)); Assert.That(counts.CalculateNumDifferences("'"), Is.EqualTo(2)); Assert.That(counts.CalculateNumDifferences("\u201c"), Is.EqualTo(4)); - - // QuotationMarkTabulator tests } [Test] @@ -209,7 +207,5 @@ public void CalculateSimilarity() ), Is.EqualTo(0.33333333333333).Within(1e-9) ); - // - // } } From e7bd322a59ba6a9348851f0fba39447125969d60 Mon Sep 17 
00:00:00 2001 From: Enkidu93 Date: Thu, 31 Jul 2025 10:30:50 -0400 Subject: [PATCH 14/28] Port Ben's most recent test-related changes --- .../Corpora/QuotationMarkUpdateFirstPass.cs | 25 +++++++------ .../QuotationMarkStringMatchTests.cs | 2 +- .../QuoteConventionTests.cs | 36 +++++++++---------- .../PunctuationAnalysis/TextSegmentTests.cs | 9 +++-- 4 files changed, 37 insertions(+), 35 deletions(-) diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs index dbc2a6fb..e6a776e5 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs @@ -1,3 +1,4 @@ +using System; using System.Collections.Generic; using System.Linq; using SIL.Machine.Corpora.PunctuationAnalysis; @@ -34,22 +35,20 @@ public bool CheckWhetherFallbackModeWillWork( QuoteConvention targetQuoteConvention ) { - var targetMarksBySourceMarks = new Dictionary>(); - foreach (int depth in Enumerable.Range(1, sourceQuoteConvention.NumLevels)) + var sourceMarks = new HashSet(); + foreach ( + int depth in Enumerable.Range( + 1, + Math.Min(sourceQuoteConvention.NumLevels, targetQuoteConvention.NumLevels) + 1 + ) + ) { string openingQuotationMark = sourceQuoteConvention.GetOpeningQuotationMarkAtDepth(depth); - if (!targetMarksBySourceMarks.TryGetValue(openingQuotationMark, out HashSet marks)) - { - marks = new HashSet(); - targetMarksBySourceMarks[openingQuotationMark] = marks; - } - if (depth <= targetQuoteConvention.NumLevels) - { - marks.Add(targetQuoteConvention.GetClosingQuotationMarkAtDepth(depth)); - } + if (sourceMarks.Contains(openingQuotationMark)) + return false; + sourceMarks.Add(openingQuotationMark); } - - return !targetMarksBySourceMarks.Keys.Any(sourceMark => targetMarksBySourceMarks[sourceMark].Count > 1); + return true; } public List FindBestChapterStrategies() diff --git 
a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkStringMatchTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkStringMatchTests.cs index d0002f08..d4a1fa3e 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkStringMatchTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkStringMatchTests.cs @@ -265,7 +265,7 @@ public void DoesTrailingSubstringMatch() quotationMarkStringMatch = new QuotationMarkStringMatch( new TextSegment.Builder().SetText("sample text").Build(), - 11, + 10, 11 ); Assert.IsFalse(quotationMarkStringMatch.TrailingSubstringMatches(new Regex(@".+", RegexOptions.Compiled))); diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionTests.cs index 53f8c532..52e54785 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionTests.cs @@ -92,23 +92,23 @@ public void SingleLevelQuoteConventionNormalize() [Test] public void GetNumLevels() { - var emptyQuoteConvention = new QuoteConvention("empty-quote-convention", []); + var emptyQuoteConvention = new QuoteConvention("empty_quote_convention", []); Assert.That(emptyQuoteConvention.NumLevels, Is.EqualTo(0)); var oneLevelQuoteConvention = new QuoteConvention( - "one-level-quote-convention", + "one_level_quote_convention", [new SingleLevelQuoteConvention("\u201c", "\u201d")] ); Assert.That(oneLevelQuoteConvention.NumLevels, Is.EqualTo(1)); var twoLevelQuoteConvention = new QuoteConvention( - "two-level-quote-convention", + "two_level_quote_convention", [new SingleLevelQuoteConvention("\u201c", "\u201d"), new SingleLevelQuoteConvention("\u2018", "\u2019"),] ); Assert.That(twoLevelQuoteConvention.NumLevels, Is.EqualTo(2)); var threeLevelQuoteConvention = new QuoteConvention( - "three-level-quote-convention", + 
"three_level_quote_convention", [ new SingleLevelQuoteConvention("\u201c", "\u201d"), new SingleLevelQuoteConvention("\u2018", "\u2019"), @@ -122,7 +122,7 @@ [new SingleLevelQuoteConvention("\u201c", "\u201d")] public void GetOpeningQuoteAtLevel() { var quoteConvention = new QuoteConvention( - "test-quote-convention", + "test_quote_convention", [ new SingleLevelQuoteConvention("\u201c", "\u201d"), new SingleLevelQuoteConvention("\u2018", "\u2019"), @@ -138,7 +138,7 @@ public void GetOpeningQuoteAtLevel() public void GetClosingQuoteAtLevel() { var quoteConvention = new QuoteConvention( - "test-quote-convention", + "test_quote_convention", [ new SingleLevelQuoteConvention("\u201c", "\u201d"), new SingleLevelQuoteConvention("\u2018", "\u2019"), @@ -154,7 +154,7 @@ public void GetClosingQuoteAtLevel() public void GetExpectedQuotationMark() { var quoteConvention = new QuoteConvention( - "test-quote-convention", + "test_quote_convention", [ new SingleLevelQuoteConvention("\u201c", "\u201d"), new SingleLevelQuoteConvention("\u2018", "\u2019"), @@ -271,7 +271,7 @@ [new SingleLevelQuoteConvention("\u201d", "\u201c")] public void GetPossibleDepths() { var quoteConvention = new QuoteConvention( - "test-quote-convention", + "test_quote_convention", [ new SingleLevelQuoteConvention("\u201c", "\u201d"), new SingleLevelQuoteConvention("\u2018", "\u2019"), @@ -295,7 +295,7 @@ public void GetPossibleDepths() public void IsCompatibleWithObservedQuotationMarks() { var quoteConvention = new QuoteConvention( - "test-quote-convention", + "test_quote_convention", [ new SingleLevelQuoteConvention("\u201c", "\u201d"), new SingleLevelQuoteConvention("\u2018", "\u2019"), @@ -322,7 +322,7 @@ public void IsCompatibleWithObservedQuotationMarks() quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c", "\u2018"], ["\u201d", "\u201f"]) ); - // must have observed the first-level quotes + // must have observed the first_level quotes 
Assert.IsFalse(quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u2018"], ["\u201d"])); Assert.IsFalse(quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c", "\u2018"], ["\u00ab"])); } @@ -330,13 +330,13 @@ public void IsCompatibleWithObservedQuotationMarks() [Test] public void Normalize() { - var emptyQuoteConvention = new QuoteConvention("empty-quote-convention", []); + var emptyQuoteConvention = new QuoteConvention("empty_quote_convention", []); var normalizedEmptyQuoteConvention = emptyQuoteConvention.Normalize(); - Assert.That(normalizedEmptyQuoteConvention.Name, Is.EqualTo("empty-quote-convention_normalized")); + Assert.That(normalizedEmptyQuoteConvention.Name, Is.EqualTo("empty_quote_convention_normalized")); Assert.That(normalizedEmptyQuoteConvention.NumLevels, Is.EqualTo(0)); var standardEnglishQuoteConvention = new QuoteConvention( - "standard-english-quote-convention", + "standard_english_quote_convention", [ new SingleLevelQuoteConvention("\u201c", "\u201d"), new SingleLevelQuoteConvention("\u2018", "\u2019"), @@ -347,7 +347,7 @@ public void Normalize() var normalizedStandardEnglishQuoteConvention = standardEnglishQuoteConvention.Normalize(); Assert.That( normalizedStandardEnglishQuoteConvention.Name, - Is.EqualTo("standard-english-quote-convention_normalized") + Is.EqualTo("standard_english_quote_convention_normalized") ); Assert.That(normalizedStandardEnglishQuoteConvention.NumLevels, Is.EqualTo(4)); Assert.That(normalizedStandardEnglishQuoteConvention.GetOpeningQuotationMarkAtDepth(1), Is.EqualTo("\"")); @@ -360,7 +360,7 @@ public void Normalize() Assert.That(normalizedStandardEnglishQuoteConvention.GetClosingQuotationMarkAtDepth(4), Is.EqualTo("'")); var westernEuropeanQuoteConvention = new QuoteConvention( - "test-quote-convention", + "test_quote_convention", [ new SingleLevelQuoteConvention("\u201c", "\u201d"), new SingleLevelQuoteConvention("\u00ab", "\u00bb"), @@ -368,7 +368,7 @@ public void Normalize() ] ); var 
normalizedWesternEuropeanQuoteConvention = westernEuropeanQuoteConvention.Normalize(); - Assert.That(normalizedWesternEuropeanQuoteConvention.Name, Is.EqualTo("test-quote-convention_normalized")); + Assert.That(normalizedWesternEuropeanQuoteConvention.Name, Is.EqualTo("test_quote_convention_normalized")); Assert.That(normalizedWesternEuropeanQuoteConvention.NumLevels, Is.EqualTo(3)); Assert.That(normalizedWesternEuropeanQuoteConvention.GetOpeningQuotationMarkAtDepth(1), Is.EqualTo("\"")); Assert.That(normalizedWesternEuropeanQuoteConvention.GetClosingQuotationMarkAtDepth(1), Is.EqualTo("\"")); @@ -378,7 +378,7 @@ public void Normalize() Assert.That(normalizedWesternEuropeanQuoteConvention.GetClosingQuotationMarkAtDepth(3), Is.EqualTo("'")); var hybridBritishTypewriterEnglishQuoteConvention = new QuoteConvention( - "hybrid-british-typewriter-english-quote-convention", + "hybrid_british_typewriter_english_quote_convention", [ new SingleLevelQuoteConvention("\u00ab", "\u00bb"), new SingleLevelQuoteConvention("'", "'"), @@ -391,7 +391,7 @@ public void Normalize() ); Assert.IsTrue( normalizedHybridBritishTypewriterEnglishQuoteConvention.Name - == "hybrid-british-typewriter-english-quote-convention_normalized" + == "hybrid_british_typewriter_english_quote_convention_normalized" ); Assert.That(normalizedHybridBritishTypewriterEnglishQuoteConvention.NumLevels, Is.EqualTo(3)); Assert.That( diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/TextSegmentTests.cs b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/TextSegmentTests.cs index d96d8cbd..d1a6362c 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/TextSegmentTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/TextSegmentTests.cs @@ -91,9 +91,12 @@ public void Equals() var basicSegment = new TextSegment.Builder().SetText("text1").Build(); var sameTextSegment = new TextSegment.Builder().SetText("text1").Build(); var differentTextSegment = new 
TextSegment.Builder().SetText("different text").Build(); - - // Assert.That(basicSegment, Is.EqualTo(basicSegment)); - // Assert.That(basicSegment , Is.Not.EqualTo(new UsfmToken("text1"))); +#pragma warning disable NUnit2009 // The same value has been provided as both the actual and the expected argument + Assert.That(basicSegment, Is.EqualTo(basicSegment)); +#pragma warning restore NUnit2009 // The same value has been provided as both the actual and the expected argument +#pragma warning disable NUnit2021 // Incompatible types for EqualTo constraint + Assert.That(basicSegment, Is.Not.EqualTo(new UsfmToken("text1"))); +#pragma warning restore NUnit2021 // Incompatible types for EqualTo constraint Assert.That(basicSegment, Is.EqualTo(sameTextSegment)); Assert.That(basicSegment, Is.Not.EqualTo(differentTextSegment)); From 832cd6ac86b6ee9c1ec441d15a9dcdfeed1bc850 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 31 Jul 2025 11:11:21 -0400 Subject: [PATCH 15/28] Fix typo --- .../Corpora/QuotationMarkUpdateFirstPass.cs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs index e6a776e5..49d55e52 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs @@ -35,18 +35,27 @@ public bool CheckWhetherFallbackModeWillWork( QuoteConvention targetQuoteConvention ) { - var sourceMarks = new HashSet(); + var targetMarkBySourceMark = new Dictionary(); foreach ( int depth in Enumerable.Range( 1, - Math.Min(sourceQuoteConvention.NumLevels, targetQuoteConvention.NumLevels) + 1 + Math.Min(sourceQuoteConvention.NumLevels, targetQuoteConvention.NumLevels) ) ) { string openingQuotationMark = sourceQuoteConvention.GetOpeningQuotationMarkAtDepth(depth); - if (sourceMarks.Contains(openingQuotationMark)) + string closingQuotationMark = 
targetQuoteConvention.GetClosingQuotationMarkAtDepth(depth); + if ( + targetMarkBySourceMark.TryGetValue( + openingQuotationMark, + out string correspondingClosingQuotationMark + ) + && correspondingClosingQuotationMark != closingQuotationMark + ) + { return false; - sourceMarks.Add(openingQuotationMark); + } + targetMarkBySourceMark[openingQuotationMark] = closingQuotationMark; } return true; } From 837488d5208e61c45dbb2f182c06bbf052acb955 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Sat, 2 Aug 2025 12:41:05 -0400 Subject: [PATCH 16/28] Add paratext zip quotation convention detector --- .../ParatextProjectQuoteConventionDetector.cs | 56 +++++++++++++ .../Corpora/ParatextProjectSettings.cs | 13 ++- .../QuotationMarkFinder.cs | 4 +- ...pParatextProjectQuoteConventionDetector.cs | 29 +++++++ ...ryParatextProjectQuoteConvetionDetector.cs | 23 ++++++ ...atextProjectQuoteConvetionDetectorTests.cs | 82 +++++++++++++++++++ 6 files changed, 204 insertions(+), 3 deletions(-) create mode 100644 src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs create mode 100644 src/SIL.Machine/Corpora/ZipParatextProjectQuoteConventionDetector.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/MemoryParatextProjectQuoteConvetionDetector.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs diff --git a/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs b/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs new file mode 100644 index 00000000..5f24c6a0 --- /dev/null +++ b/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs @@ -0,0 +1,56 @@ +using System; +using System.IO; +using System.Text; +using SIL.Machine.Corpora.PunctuationAnalysis; + +namespace SIL.Machine.Corpora +{ + public abstract class ParatextProjectQuoteConventionDetector + { + private readonly ParatextProjectSettings _settings; + + protected ParatextProjectQuoteConventionDetector(ParatextProjectSettings settings) + { + 
_settings = settings; + } + + protected ParatextProjectQuoteConventionDetector(ParatextProjectSettingsParserBase settingsParser) + { + _settings = settingsParser.Parse(); + } + + public QuoteConventionAnalysis GetQuoteConventionAnalysis() + { + var handler = new QuoteConventionDetector(); + foreach (string fileName in _settings.GetAllBookFileNames()) + { + if (!Exists(fileName)) + continue; + + string usfm; + using (var reader = new StreamReader(Open(fileName))) + { + usfm = reader.ReadToEnd(); + } + + try + { + UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification); + } + catch (Exception ex) + { + var sb = new StringBuilder(); + sb.Append($"An error occurred while parsing the usfm for '{fileName}`"); + if (!string.IsNullOrEmpty(_settings.Name)) + sb.Append($" in project '{_settings.Name}'"); + sb.Append($". Error: '{ex.Message}'"); + throw new InvalidOperationException(sb.ToString(), ex); + } + } + return handler.DetectQuotationConvention(); + } + + protected abstract bool Exists(string fileName); + protected abstract Stream Open(string fileName); + } +} diff --git a/src/SIL.Machine/Corpora/ParatextProjectSettings.cs b/src/SIL.Machine/Corpora/ParatextProjectSettings.cs index 6781a8ad..68ab4179 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectSettings.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectSettings.cs @@ -1,4 +1,5 @@ -using System.Globalization; +using System.Collections.Generic; +using System.Globalization; using System.Text; using SIL.Scripture; @@ -103,6 +104,16 @@ public string GetBookFileName(string bookId) return FileNamePrefix + bookPart + FileNameSuffix; } + public IEnumerable GetAllBookFileNames() + { + BookSet scriptureBooks = Canon.ScriptureBooks; + scriptureBooks.SelectAll(); + foreach (string bookId in scriptureBooks.SelectedBookIds) + { + yield return GetBookFileName(bookId); + } + } + private static string GetBookFileNameDigits(string bookId) { int bookNum = Canon.BookIdToNumber(bookId); diff --git 
a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkFinder.cs b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkFinder.cs index e4ddd74b..85d89d19 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkFinder.cs +++ b/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkFinder.cs @@ -49,8 +49,8 @@ public List FindAllPotentialQuotationMarksInTextSegmen || QuotationMarkStringMatch.HasUnicodeProperty(currentCharacterString, "APOSTROPHE") ) && ( - _quoteConventions.IsValidOpeningQuotationMark(charactersEnumerator.Current.ToString()) - || _quoteConventions.IsValidClosingQuotationMark(charactersEnumerator.Current.ToString()) + _quoteConventions.IsValidOpeningQuotationMark(currentCharacterString) + || _quoteConventions.IsValidClosingQuotationMark(currentCharacterString) ) ) { diff --git a/src/SIL.Machine/Corpora/ZipParatextProjectQuoteConventionDetector.cs b/src/SIL.Machine/Corpora/ZipParatextProjectQuoteConventionDetector.cs new file mode 100644 index 00000000..91736056 --- /dev/null +++ b/src/SIL.Machine/Corpora/ZipParatextProjectQuoteConventionDetector.cs @@ -0,0 +1,29 @@ +using System.IO; +using System.IO.Compression; + +namespace SIL.Machine.Corpora +{ + public class ZipParatextProjectQuoteConventionDetector : ParatextProjectQuoteConventionDetector + { + private readonly ZipArchive _archive; + + public ZipParatextProjectQuoteConventionDetector(ZipArchive archive) + : base(new ZipParatextProjectSettingsParser(archive)) + { + _archive = archive; + } + + protected override bool Exists(string fileName) + { + return _archive.GetEntry(fileName) != null; + } + + protected override Stream Open(string fileName) + { + ZipArchiveEntry entry = _archive.GetEntry(fileName); + if (entry == null) + return null; + return entry.Open(); + } + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/MemoryParatextProjectQuoteConvetionDetector.cs b/tests/SIL.Machine.Tests/Corpora/MemoryParatextProjectQuoteConvetionDetector.cs new file mode 100644 index 
00000000..01d959d8 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/MemoryParatextProjectQuoteConvetionDetector.cs @@ -0,0 +1,23 @@ +using System.Text; + +namespace SIL.Machine.Corpora; + +public class MemoryParatextProjectQuoteConventionDetector( + ParatextProjectSettings settings, + IDictionary files +) : ParatextProjectQuoteConventionDetector(settings) +{ + public IDictionary Files { get; } = files; + + protected override bool Exists(string fileName) + { + return Files.ContainsKey(fileName); + } + + protected override Stream? Open(string fileName) + { + if (!Files.TryGetValue(fileName, out string? contents)) + return null; + return new MemoryStream(Encoding.UTF8.GetBytes(contents)); + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs b/tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs new file mode 100644 index 00000000..09fff927 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs @@ -0,0 +1,82 @@ +using System.Text; +using NUnit.Framework; +using SIL.Machine.Corpora.PunctuationAnalysis; +using SIL.Scripture; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class ParatextProjectQuoteConventionDetectorTests +{ + [Test] + public void TestGetQuotationAnalysis() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "41MATTest.SFM", + @"\id MAT +\c 1 +\v 1 Someone said, “This is something I am saying! +\v 2 This is also something I am saying” (that is, “something I am speaking”). +\p +\v 3 Other text, and someone else said, +\q1 +\v 4 “Things +\q2 someone else said! 
+\q3 and more things someone else said.” +\m That is why he said “things someone else said.” +\v 5 Then someone said, “More things someone said.”" + } + } + ); + QuoteConventionAnalysis analysis = env.GetQuoteConvention(); + Assert.That(analysis, Is.Not.Null); + Assert.That(analysis.BestQuoteConventionScore, Is.GreaterThan(0.8)); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_english")); + } + + private class TestEnvironment(ParatextProjectSettings? settings = null, Dictionary? files = null) + { + public ParatextProjectQuoteConventionDetector Detector { get; } = + new MemoryParatextProjectQuoteConventionDetector( + settings ?? new DefaultParatextProjectSettings(), + files ?? new() + ); + + public QuoteConventionAnalysis GetQuoteConvention() + { + return Detector.GetQuoteConventionAnalysis(); + } + } + + private class DefaultParatextProjectSettings( + string name = "Test", + string fullName = "TestProject", + Encoding? encoding = null, + ScrVers? versification = null, + UsfmStylesheet? stylesheet = null, + string fileNamePrefix = "", + string fileNameForm = "41MAT", + string fileNameSuffix = "Test.SFM", + string biblicalTermsListType = "Project", + string biblicalTermsProjectName = "Test", + string biblicalTermsFileName = "ProjectBiblicalTerms.xml", + string languageCode = "en" + ) + : ParatextProjectSettings( + name, + fullName, + encoding ?? Encoding.UTF8, + versification ?? ScrVers.English, + stylesheet ?? 
new UsfmStylesheet("usfm.sty"), + fileNamePrefix, + fileNameForm, + fileNameSuffix, + biblicalTermsListType, + biblicalTermsProjectName, + biblicalTermsFileName, + languageCode + ) { } +} From fb2ce792822ee2bb211a41c898471477379a17fc Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 4 Aug 2025 11:32:49 -0400 Subject: [PATCH 17/28] Make convention detector operable on multiple zips --- .../Corpora/ParatextProjectQuoteConventionDetector.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs b/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs index 5f24c6a0..a907aaa2 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs @@ -19,9 +19,9 @@ protected ParatextProjectQuoteConventionDetector(ParatextProjectSettingsParserBa _settings = settingsParser.Parse(); } - public QuoteConventionAnalysis GetQuoteConventionAnalysis() + public QuoteConventionAnalysis GetQuoteConventionAnalysis(QuoteConventionDetector handler = null) { - var handler = new QuoteConventionDetector(); + handler = handler ?? 
new QuoteConventionDetector(); foreach (string fileName in _settings.GetAllBookFileNames()) { if (!Exists(fileName)) From b0f6d9286b3708c8104ef7b85a3fe658561fea40 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 5 Aug 2025 16:00:09 -0400 Subject: [PATCH 18/28] Rename function --- .../Corpora/ParatextProjectQuoteConventionDetector.cs | 2 +- src/SIL.Machine/Corpora/ParatextProjectSettings.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs b/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs index a907aaa2..786352a0 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs @@ -22,7 +22,7 @@ protected ParatextProjectQuoteConventionDetector(ParatextProjectSettingsParserBa public QuoteConventionAnalysis GetQuoteConventionAnalysis(QuoteConventionDetector handler = null) { handler = handler ?? new QuoteConventionDetector(); - foreach (string fileName in _settings.GetAllBookFileNames()) + foreach (string fileName in _settings.GetAllScriptureBookFileNames()) { if (!Exists(fileName)) continue; diff --git a/src/SIL.Machine/Corpora/ParatextProjectSettings.cs b/src/SIL.Machine/Corpora/ParatextProjectSettings.cs index 68ab4179..286c6c27 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectSettings.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectSettings.cs @@ -104,7 +104,7 @@ public string GetBookFileName(string bookId) return FileNamePrefix + bookPart + FileNameSuffix; } - public IEnumerable GetAllBookFileNames() + public IEnumerable GetAllScriptureBookFileNames() { BookSet scriptureBooks = Canon.ScriptureBooks; scriptureBooks.SelectAll(); From 27d8573809b5ec7af2a847cb96a4cd8596d3eb81 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 5 Aug 2025 16:53:19 -0400 Subject: [PATCH 19/28] Fix remark adding when textBehavior is PreferExisting --- .../Corpora/UpdateUsfmParserHandler.cs | 32 
++++++++++---- .../Corpora/UpdateUsfmParserHandlerTests.cs | 42 +++++++++++++++++-- 2 files changed, 62 insertions(+), 12 deletions(-) diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index 0da338ef..21c0f6fd 100644 --- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -89,14 +89,6 @@ public override void StartBook(UsfmParserState state, string marker, string code var startBookTokens = new List(); if (_idText != null) startBookTokens.Add(new UsfmToken(_idText + " ")); - if (_remarks.Count() > 0) - { - foreach (string remark in _remarks) - { - startBookTokens.Add(new UsfmToken(UsfmTokenType.Paragraph, "rem", null, null)); - startBookTokens.Add(new UsfmToken(remark)); - } - } PushUpdatedText(startBookTokens); base.StartBook(state, marker, code); @@ -366,7 +358,29 @@ public string GetUsfm(string stylesheetFileName = "usfm.sty") public string GetUsfm(UsfmStylesheet stylesheet) { var tokenizer = new UsfmTokenizer(stylesheet); - return tokenizer.Detokenize(_tokens); + List tokens = new List(_tokens); + if (_remarks.Count() > 0) + { + var remarkTokens = new List(); + foreach (string remark in _remarks) + { + remarkTokens.Add(new UsfmToken(UsfmTokenType.Paragraph, "rem", null, null)); + remarkTokens.Add(new UsfmToken(remark)); + } + + if (tokens.Count > 0 && tokens[0].Marker == "id") + { + if (tokens.Count > 1 && tokens[1].Type == UsfmTokenType.Text) + { + tokens.InsertRange(2, remarkTokens); + } + else + { + tokens.InsertRange(1, remarkTokens); + } + } + } + return tokenizer.Detokenize(tokens); } private IReadOnlyList AdvanceRows(IReadOnlyList segScrRefs) diff --git a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index 9430621b..0372126b 100644 --- a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs +++ 
b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -1224,6 +1224,39 @@ public void GetUsfm_HeaderReferenceParagraphs() AssertUsfmEquals(target, resultP); } + [Test] + public void GetUsfm_PreferExisting_AddRemark() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 1:1"), "Update 1"), + (ScrRef("MAT 1:2"), "Update 2"), + }; + var usfm = + @"\id MAT - Test +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +"; + string target = UpdateUsfm( + rows, + usfm, + textBehavior: UpdateUsfmTextBehavior.PreferExisting, + remarks: ["New remark"] + ); + var result = + @"\id MAT - Test +\rem New remark +\c 1 +\v 1 Some text +\v 2 Update 2 +\v 3 Other text +"; + + AssertUsfmEquals(target, result); + } + private static ScriptureRef[] ScrRef(params string[] refs) { return refs.Select(r => ScriptureRef.Parse(r)).ToArray(); @@ -1238,7 +1271,8 @@ private static string UpdateUsfm( UpdateUsfmMarkerBehavior embedBehavior = UpdateUsfmMarkerBehavior.Preserve, UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip, IEnumerable? preserveParagraphStyles = null, - IEnumerable? usfmUpdateBlockHandlers = null + IEnumerable? usfmUpdateBlockHandlers = null, + IEnumerable? 
remarks = null ) { if (source is null) @@ -1253,7 +1287,8 @@ private static string UpdateUsfm( embedBehavior, styleBehavior, preserveParagraphStyles, - usfmUpdateBlockHandlers + usfmUpdateBlockHandlers, + remarks ); } else @@ -1267,7 +1302,8 @@ private static string UpdateUsfm( embedBehavior, styleBehavior, preserveParagraphStyles, - usfmUpdateBlockHandlers + usfmUpdateBlockHandlers, + remarks ); UsfmParser.Parse(source, updater); return updater.GetUsfm(); From 0b8c14b8b183c83df4c1136c52821c14d4a98698 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 6 Aug 2025 10:49:07 -0400 Subject: [PATCH 20/28] Port add metadata to update block and marker behavior metadata --- .../Corpora/ParatextProjectTextUpdaterBase.cs | 2 +- .../PlaceMarkersUsfmUpdateBlockHandler.cs | 49 +- .../Corpora/UpdateUsfmParserHandler.cs | 45 +- src/SIL.Machine/Corpora/UsfmUpdateBlock.cs | 18 +- ...PlaceMarkersUsfmUpdateBlockHandlerTests.cs | 533 ++++++++++++------ .../Corpora/UpdateUsfmParserHandlerTests.cs | 231 ++++---- .../Corpora/UsfmManualTests.cs | 24 +- 7 files changed, 560 insertions(+), 342 deletions(-) diff --git a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs index 02cf07e3..65273298 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs @@ -21,7 +21,7 @@ protected ParatextProjectTextUpdaterBase(ParatextProjectSettingsParserBase setti public string UpdateUsfm( string bookId, - IReadOnlyList<(IReadOnlyList, string)> rows, + IReadOnlyList rows, string fullName = null, UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting, UpdateUsfmMarkerBehavior paragraphBehavior = UpdateUsfmMarkerBehavior.Preserve, diff --git a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs index 93446437..4cfbeedb 100644 --- 
a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs @@ -8,49 +8,66 @@ namespace SIL.Machine.Corpora { public class PlaceMarkersAlignmentInfo { - public IReadOnlyList Refs { get; } + public const string MetadataKey = "alignment_info"; + public IReadOnlyList SourceTokens { get; } public IReadOnlyList TranslationTokens { get; } public WordAlignmentMatrix Alignment { get; } + public UpdateUsfmMarkerBehavior ParagraphBehavior { get; } + public UpdateUsfmMarkerBehavior StyleBehavior { get; } public PlaceMarkersAlignmentInfo( - IReadOnlyList refs, IReadOnlyList sourceTokens, IReadOnlyList translationTokens, - WordAlignmentMatrix alignment + WordAlignmentMatrix alignment, + UpdateUsfmMarkerBehavior paragraphBehavior, + UpdateUsfmMarkerBehavior styleBehavior ) { - Refs = refs; SourceTokens = sourceTokens; TranslationTokens = translationTokens; Alignment = alignment; + ParagraphBehavior = paragraphBehavior; + StyleBehavior = styleBehavior; } } public class PlaceMarkersUsfmUpdateBlockHandler : IUsfmUpdateBlockHandler { - private readonly IDictionary _alignmentInfo; - - public PlaceMarkersUsfmUpdateBlockHandler(IEnumerable alignmentInfo) - { - _alignmentInfo = alignmentInfo.ToDictionary(info => info.Refs.First(), info => info); - } - public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block) { string reference = block.Refs.FirstOrDefault().ToString(); var elements = block.Elements.ToList(); // Nothing to do if there are no markers to place or no alignment to use + if (!block.Metadata.TryGetValue(PlaceMarkersAlignmentInfo.MetadataKey, out object alignmentObject)) + { + return block; + } + if (!(alignmentObject is PlaceMarkersAlignmentInfo alignmentInfo)) + { + return block; + } if ( elements.Count == 0 - || !_alignmentInfo.TryGetValue(reference, out PlaceMarkersAlignmentInfo alignmentInfo) || alignmentInfo.Alignment.RowCount == 0 || alignmentInfo.Alignment.ColumnCount == 0 || !elements.Any(e => 
+<<<<<<< HEAD e.Type.IsOneOf(UsfmUpdateBlockElementType.Paragraph, UsfmUpdateBlockElementType.Style) && !e.MarkedForRemoval && e.Tokens.Count == 1 +======= + ( + e.Type == UsfmUpdateBlockElementType.Paragraph + && alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Preserve + && e.Tokens.Count == 1 + ) + || ( + e.Type == UsfmUpdateBlockElementType.Style + && alignmentInfo.StyleBehavior == UpdateUsfmMarkerBehavior.Preserve + ) +>>>>>>> 55d91e41 (Port add metadata to update block and marker behavior metadata) ) ) { @@ -112,7 +129,13 @@ public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block) { if (element.Type == UsfmUpdateBlockElementType.Text) { - if (element.MarkedForRemoval) + if ( + element.MarkedForRemoval + || ( + element.Type == UsfmUpdateBlockElementType.Paragraph + && alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Strip + ) + ) { string text = element.Tokens[0].ToUsfm(); sourceSentence += text; diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index 21c0f6fd..6883b10d 100644 --- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -17,13 +17,31 @@ public enum UpdateUsfmMarkerBehavior Strip, } + public class UpdateUsfmRow + { + public IReadOnlyList Refs { get; } + public string Text { get; } + public IReadOnlyDictionary Metadata { get; } + + public UpdateUsfmRow( + IReadOnlyList refs, + string text, + IReadOnlyDictionary metadata = null + ) + { + Refs = refs; + Text = text; + Metadata = metadata ?? new Dictionary(); + } + } + /*** * This is a USFM parser handler that can be used to replace the existing text in a USFM file with the specified * text. 
*/ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase { - private readonly IReadOnlyList<(IReadOnlyList, string)> _rows; + private readonly IReadOnlyList _rows; private readonly List _tokens; private readonly List _updatedText; private readonly List _embedTokens; @@ -41,7 +59,7 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase private int _tokenIndex; public UpdateUsfmParserHandler( - IReadOnlyList<(IReadOnlyList, string)> rows = null, + IReadOnlyList rows = null, string idText = null, UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting, UpdateUsfmMarkerBehavior paragraphBehavior = UpdateUsfmMarkerBehavior.Preserve, @@ -52,7 +70,7 @@ public UpdateUsfmParserHandler( IEnumerable remarks = null ) { - _rows = rows ?? Array.Empty<(IReadOnlyList, string)>(); + _rows = rows ?? Array.Empty(); _tokens = new List(); _updatedText = new List(); _updateBlocks = new Stack(); @@ -383,16 +401,24 @@ public string GetUsfm(UsfmStylesheet stylesheet) return tokenizer.Detokenize(tokens); } - private IReadOnlyList AdvanceRows(IReadOnlyList segScrRefs) + private (IReadOnlyList RowTexts, Dictionary Metadata) AdvanceRows( + IReadOnlyList segScrRefs + ) { var rowTexts = new List(); + Dictionary rowMetadata = null; int sourceIndex = 0; // search the sorted rows with updated text, starting from where we left off last. 
while (_rowIndex < _rows.Count && sourceIndex < segScrRefs.Count) { // get the set of references for the current row int compare = 0; - (IReadOnlyList rowScrRefs, string text) = _rows[_rowIndex]; + UpdateUsfmRow row = _rows[_rowIndex]; + (IReadOnlyList rowScrRefs, string text, IReadOnlyDictionary metadata) = ( + row.Refs, + row.Text, + row.Metadata + ); foreach (ScriptureRef rowScrRef in rowScrRefs) { while (sourceIndex < segScrRefs.Count) @@ -409,6 +435,7 @@ private IReadOnlyList AdvanceRows(IReadOnlyList segScrRefs // source and row match // grab the text - both source and row will be incremented in due time... rowTexts.Add(text); + rowMetadata = metadata.ToDictionary(kvp => kvp.Key, kvp => kvp.Value); break; } } @@ -418,7 +445,7 @@ private IReadOnlyList AdvanceRows(IReadOnlyList segScrRefs _rowIndex++; } } - return rowTexts; + return (rowTexts, rowMetadata); } private void CollectUpdatableTokens(UsfmParserState state) @@ -522,8 +549,10 @@ private bool HasNewText() private void StartUpdateBlock(IReadOnlyList scriptureRefs) { - _updateBlocks.Push(new UsfmUpdateBlock(scriptureRefs)); - IReadOnlyList rowTexts = AdvanceRows(scriptureRefs); + (IReadOnlyList rowTexts, Dictionary metadata) = AdvanceRows(scriptureRefs); + _updateBlocks.Push( + new UsfmUpdateBlock(scriptureRefs, metadata: metadata ?? 
new Dictionary()) + ); PushUpdatedText(rowTexts.Select(t => new UsfmToken(t + " "))); } diff --git a/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs b/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs index 6640e96a..22140729 100644 --- a/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs +++ b/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs @@ -13,17 +13,24 @@ public IReadOnlyList Elements { get => _elements; } + public IReadOnlyDictionary Metadata + { + get => _metadata; + } private readonly List _refs; private readonly List _elements; + private readonly Dictionary _metadata; public UsfmUpdateBlock( IEnumerable refs = null, - IEnumerable elements = null + IEnumerable elements = null, + Dictionary metadata = null ) { - _refs = refs != null ? refs.ToList() : new List(); - _elements = elements != null ? elements.ToList() : new List(); + _refs = refs?.ToList() ?? new List(); + _elements = elements?.ToList() ?? new List(); + _metadata = metadata ?? new Dictionary(); } public void AddText(IEnumerable tokens) @@ -100,7 +107,10 @@ public override bool Equals(object obj) UsfmUpdateBlock other = (UsfmUpdateBlock)obj; - return _refs.SequenceEqual(other._refs) && _elements.SequenceEqual(other._elements); + return _refs.SequenceEqual(other._refs) + && _elements.SequenceEqual(other._elements) + && _metadata.Count == other.Metadata.Count + && !_metadata.Except(other.Metadata).Any(); } public override int GetHashCode() diff --git a/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs index 7c17e7cf..9769e475 100644 --- a/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs @@ -15,7 +15,23 @@ public void UpdateUsfm_ParagraphMarkers() string source = "This is the first paragraph. This text is in English, and this test is for paragraph markers."; string pretranslation = "Este es el primer párrafo. 
Este texto está en inglés y esta prueba es para marcadores de párrafo."; - IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), pretranslation)]; + PlaceMarkersAlignmentInfo alignInfo = new PlaceMarkersAlignmentInfo( + sourceTokens: Tokenizer.Tokenize(source).ToList(), + translationTokens: Tokenizer.Tokenize(pretranslation).ToList(), + alignment: ToWordAlignmentMatrix( + "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" + ), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ); + IReadOnlyList rows = + [ + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + pretranslation, + new Dictionary { { "alignment_info", alignInfo } } + ) + ]; string usfm = @"\id MAT \c 1 @@ -23,23 +39,12 @@ public void UpdateUsfm_ParagraphMarkers() \p This text is in English, \p and this test is for paragraph markers. "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: Tokenizer.Tokenize(source).ToList(), - translationTokens: Tokenizer.Tokenize(pretranslation).ToList(), - alignment: ToWordAlignmentMatrix( - "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" - ) - ) - ]; string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -59,29 +64,34 @@ public void UpdateUsfm_StyleMarkers() string source = "This is the first sentence. This text is in English, and this test is for style markers."; string pretranslation = "Esta es la primera oración. 
Este texto está en inglés y esta prueba es para marcadores de estilo."; - IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), pretranslation)]; + PlaceMarkersAlignmentInfo alignInfo = new PlaceMarkersAlignmentInfo( + sourceTokens: Tokenizer.Tokenize(source).ToList(), + translationTokens: Tokenizer.Tokenize(pretranslation).ToList(), + alignment: ToWordAlignmentMatrix( + "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" + ), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Preserve + ); + IReadOnlyList rows = + [ + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + pretranslation, + new Dictionary { { "alignment_info", alignInfo } } + ) + ]; string usfm = @"\id MAT \c 1 \v 1 This is the \w first\w* sentence. This text is in \w English\w*, and this test is \w for\w* style markers. "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: Tokenizer.Tokenize(source).ToList(), - translationTokens: Tokenizer.Tokenize(pretranslation).ToList(), - alignment: ToWordAlignmentMatrix( - "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" - ) - ) - ]; string target = UpdateUsfm( rows, usfm, styleBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -92,11 +102,29 @@ public void UpdateUsfm_StyleMarkers() AssertUsfmEquals(target, result); + alignInfo = new PlaceMarkersAlignmentInfo( + sourceTokens: Tokenizer.Tokenize(source).ToList(), + translationTokens: Tokenizer.Tokenize(pretranslation).ToList(), + alignment: ToWordAlignmentMatrix( + "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" + ), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: 
UpdateUsfmMarkerBehavior.Strip + ); + rows = + [ + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + pretranslation, + new Dictionary { { "alignment_info", alignInfo } } + ) + ]; + target = UpdateUsfm( rows, usfm, styleBehavior: UpdateUsfmMarkerBehavior.Strip, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); result = @@ -112,16 +140,16 @@ public void UpdateUsfm_StyleMarkers() [Test] public void UpdateUsfm_EmbedMarkers() { - IReadOnlyList<(IReadOnlyList, string)> rows = + IReadOnlyList rows = [ - (ScrRef("MAT 1:1"), "New verse 1"), - (ScrRef("MAT 1:2"), "New verse 2"), - (ScrRef("MAT 1:3"), "New verse 3"), - (ScrRef("MAT 1:4"), "New verse 4"), - (ScrRef("MAT 1:4/1:f"), "New embed text"), - (ScrRef("MAT 1:5"), "New verse 5"), - (ScrRef("MAT 1:6"), "New verse 6"), - (ScrRef("MAT 1:6/1:f"), "New verse 6 embed text") + new UpdateUsfmRow(ScrRef("MAT 1:1"), "New verse 1"), + new UpdateUsfmRow(ScrRef("MAT 1:2"), "New verse 2"), + new UpdateUsfmRow(ScrRef("MAT 1:3"), "New verse 3"), + new UpdateUsfmRow(ScrRef("MAT 1:4"), "New verse 4"), + new UpdateUsfmRow(ScrRef("MAT 1:4/1:f"), "New embed text"), + new UpdateUsfmRow(ScrRef("MAT 1:5"), "New verse 5"), + new UpdateUsfmRow(ScrRef("MAT 1:6"), "New verse 6"), + new UpdateUsfmRow(ScrRef("MAT 1:6/1:f"), "New verse 6 embed text") ]; string usfm = @"\id MAT @@ -133,13 +161,12 @@ public void UpdateUsfm_EmbedMarkers() \v 5 Embed with style markers \f \fr 1.5 \ft A \+w stylish\+w* note \f* \v 6 Updated embed with style markers \f \fr 1.6 \ft Another \+w stylish\+w* note \f* "; - IReadOnlyList alignInfo = []; string target = UpdateUsfm( rows, usfm, embedBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -159,7 +186,7 @@ public void UpdateUsfm_EmbedMarkers() rows, usfm, 
embedBehavior: UpdateUsfmMarkerBehavior.Strip, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); result = @@ -179,7 +206,21 @@ public void UpdateUsfm_EmbedMarkers() [Test] public void UpdateUsfm_TrailingEmptyParagraphs() { - IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), "New verse 1")]; + PlaceMarkersAlignmentInfo alignInfo = new PlaceMarkersAlignmentInfo( + sourceTokens: ["Verse", "1"], + translationTokens: ["New", "verse", "1"], + alignment: ToWordAlignmentMatrix("0-1 1-2"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ); + IReadOnlyList rows = + [ + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "New verse 1", + new Dictionary { { "alignment_info", alignInfo } } + ) + ]; string usfm = @"\id MAT \c 1 @@ -188,21 +229,12 @@ public void UpdateUsfm_TrailingEmptyParagraphs() \b \q1 \f embed 2 \f* "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: ["Verse", "1"], - translationTokens: ["New", "verse", "1"], - alignment: ToWordAlignmentMatrix("0-1 1-2") - ) - ]; string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -219,12 +251,44 @@ public void UpdateUsfm_TrailingEmptyParagraphs() [Test] public void UpdateUsfm_Headers() { - IReadOnlyList<(IReadOnlyList, string)> rows = + IReadOnlyList rows = [ - (ScrRef("MAT 1:1"), "X Y Z"), - (ScrRef("MAT 1:2"), "X"), - (ScrRef("MAT 1:3"), "Y"), - (ScrRef("MAT 1:3/1:s1"), "Updated header") + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "X Y Z", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["A", "B", "C"], + translationTokens: ["X", "Y", "Z"], + alignment: 
ToWordAlignmentMatrix("0-0 1-1 2-2"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + ), + new UpdateUsfmRow( + ScrRef("MAT 1:2"), + "X", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["A"], + translationTokens: ["X"], + alignment: ToWordAlignmentMatrix("0-0"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + ), + new UpdateUsfmRow(ScrRef("MAT 1:3"), "Y"), + new UpdateUsfmRow(ScrRef("MAT 1:3/1:s1"), "Updated header") ]; string usfm = @"\id MAT @@ -248,27 +312,12 @@ public void UpdateUsfm_Headers() \v 3 B \s1 Header to be updated "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: ["A", "B", "C"], - translationTokens: ["X", "Y", "Z"], - alignment: ToWordAlignmentMatrix("0-0 1-1 2-2") - ), - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:2"], - sourceTokens: ["A"], - translationTokens: ["X"], - alignment: ToWordAlignmentMatrix("0-0") - ) - ]; string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -300,29 +349,39 @@ public void UpdateUsfm_Headers() [Test] public void UpdateUsfm_ConsecutiveMarkers() { - IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), "New verse 1 WORD"),]; + IReadOnlyList rows = + [ + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "New verse 1 WORD", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["Old", "verse", "1", "word"], + translationTokens: ["New", "verse", "1", "WORD"], + alignment: ToWordAlignmentMatrix("0-0 1-1 2-2 3-3"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Preserve + ) + } + } + 
), + ]; string usfm = @"\id MAT \c 1 \v 1 Old verse 1 \p \qt \+w word\+w*\qt* "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: ["Old", "verse", "1", "word"], - translationTokens: ["New", "verse", "1", "WORD"], - alignment: ToWordAlignmentMatrix("0-0 1-1 2-2 3-3") - ) - ]; string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, styleBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -338,34 +397,38 @@ public void UpdateUsfm_ConsecutiveMarkers() [Test] public void UpdateUsfm_VerseRanges() { - IReadOnlyList<(IReadOnlyList, string)> rows = - [ - ( - Enumerable.Range(1, 6).Select(i => ScriptureRef.Parse($"MAT 1:{i}")).ToList(), - "New verse range text new paragraph 2" - ) - ]; + IReadOnlyList rows = Enumerable + .Range(1, 6) + .Select(i => new UpdateUsfmRow( + [ScriptureRef.Parse($"MAT 1:{i}")], + "New verse range text new paragraph 2", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["Verse", "range", "old", "paragraph", "2"], + translationTokens: ["New", "verse", "range", "text", "new", "paragraph", "2"], + alignment: ToWordAlignmentMatrix("0-1 1-2 2-4 3-5 4-6"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + )) + .ToList(); string usfm = @"\id MAT \c 1 \v 1-5 Verse range \p old paragraph 2 "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: Enumerable.Range(1, 6).Select(i => ScriptureRef.Parse($"MAT 1:{i}").ToString()).ToList(), - sourceTokens: ["Verse", "range", "old", "paragraph", "2"], - translationTokens: ["New", "verse", "range", "text", "new", "paragraph", "2"], - alignment: ToWordAlignmentMatrix("0-1 1-2 2-4 3-5 4-6") - ) - ]; string target = UpdateUsfm( rows, usfm, 
paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -381,9 +444,26 @@ public void UpdateUsfm_VerseRanges() [Test] public void UpdateUsfm_NoUpdate() { - IReadOnlyList<(IReadOnlyList, string)> rows = + //Strip paragraphs + IReadOnlyList rows = [ - (ScrRef("MAT 1:1"), "New paragraph 1 New paragraph 2"), + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "New paragraph 1 New paragraph 2", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["Old", "paragraph", "1", "Old", "paragraph", "2"], + translationTokens: ["New", "paragraph", "1", "New", "paragraph", "2"], + alignment: ToWordAlignmentMatrix("0-0 1-1 2-2 3-3 4-4 5-5"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Strip, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + ), ]; string usfm = @"\id MAT @@ -392,22 +472,11 @@ public void UpdateUsfm_NoUpdate() \p Old paragraph 2 "; - //Strip paragraphs - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: ["Old", "paragraph", "1", "Old", "paragraph", "2"], - translationTokens: ["New", "paragraph", "1", "New", "paragraph", "2"], - alignment: ToWordAlignmentMatrix("0-0 1-1 2-2 3-3 4-4 5-5") - ) - ]; - string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Strip, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -419,21 +488,32 @@ public void UpdateUsfm_NoUpdate() AssertUsfmEquals(target, result); //No alignment - alignInfo = + rows = [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: [], - translationTokens: [], - alignment: ToWordAlignmentMatrix("") - ) + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "New paragraph 1 New paragraph 2", + new 
Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: [], + translationTokens: [], + alignment: ToWordAlignmentMatrix(""), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + ), ]; target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); result = @@ -447,12 +527,11 @@ public void UpdateUsfm_NoUpdate() // No text update rows = []; - alignInfo = []; target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); result = @@ -467,9 +546,25 @@ public void UpdateUsfm_NoUpdate() [Test] public void UpdateUsfm_SplitTokens() { - IReadOnlyList<(IReadOnlyList, string)> rows = + IReadOnlyList rows = [ - (ScrRef("MAT 1:1"), "words split words split words split"), + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "words split words split words split", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["words", "split", "words", "split", "words", "split"], + translationTokens: ["words", "split", "words", "split", "words", "split"], + alignment: ToWordAlignmentMatrix("0-0 1-1 2-2 3-3 4-4 5-5"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + ), ]; string usfm = @"\id MAT @@ -479,21 +574,11 @@ public void UpdateUsfm_SplitTokens() \p it words split "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: ["words", "split", "words", "split", "words", "split"], - translationTokens: ["words", "split", "words", "split", "words", "split"], - alignment: ToWordAlignmentMatrix("0-0 1-1 2-2 
3-3 4-4 5-5") - ) - ]; - string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -510,29 +595,38 @@ public void UpdateUsfm_SplitTokens() [Test] public void UpdateUsfm_NoText() { - IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), ""),]; + IReadOnlyList rows = + [ + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: [], + translationTokens: [], + alignment: ToWordAlignmentMatrix(""), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Preserve + ) + } + } + ), + ]; string usfm = @"\id MAT \c 1 \v 1 \w \w* "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: [], - translationTokens: [], - alignment: ToWordAlignmentMatrix("") - ) - ]; - string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, styleBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -547,7 +641,26 @@ public void UpdateUsfm_NoText() [Test] public void UpdateUsfm_ConsecutiveSubstring() { - IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), "string ring"),]; + IReadOnlyList rows = + [ + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "string ring", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["string", "ring"], + translationTokens: ["string", "ring"], + alignment: ToWordAlignmentMatrix("0-0 1-1"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + ), + ]; string usfm = @"\id MAT 
\c 1 @@ -555,21 +668,11 @@ public void UpdateUsfm_ConsecutiveSubstring() \p ring "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: ["string", "ring"], - translationTokens: ["string", "ring"], - alignment: ToWordAlignmentMatrix("0-0 1-1") - ) - ]; - string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -585,10 +688,42 @@ public void UpdateUsfm_ConsecutiveSubstring() [Test] public void UpdateUsfm_VersesOutOfOrder() { - IReadOnlyList<(IReadOnlyList, string)> rows = + IReadOnlyList rows = [ - (ScrRef("MAT 1:1"), "new verse 1 new paragraph 2"), - (ScrRef("MAT 1:2"), "new verse 2") + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "new verse 1 new paragraph 2", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["verse", "1", "paragraph", "2"], + translationTokens: ["new", "verse", "1", "new", "paragraph", "2"], + alignment: ToWordAlignmentMatrix("0-1 1-2 2-4 3-5"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + ), + new UpdateUsfmRow( + ScrRef("MAT 1:2"), + "new verse 2", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["verse", "2"], + translationTokens: ["new", "verse", "2"], + alignment: ToWordAlignmentMatrix("0-1 1-2"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + ) ]; string usfm = @"\id MAT @@ -598,27 +733,13 @@ public void UpdateUsfm_VersesOutOfOrder() \p paragraph 2 "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: ["verse", "1", "paragraph", "2"], - translationTokens: ["new", "verse", "1", "new", "paragraph", "2"], - 
alignment: ToWordAlignmentMatrix("0-1 1-2 2-4 3-5") - ), - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:2"], - sourceTokens: ["verse", "2"], - translationTokens: ["new", "verse", "2"], - alignment: ToWordAlignmentMatrix("0-1 1-2") - ) - ]; + IReadOnlyList alignInfo = []; string target = UpdateUsfm( rows, usfm, textBehavior: UpdateUsfmTextBehavior.StripExisting, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -632,6 +753,58 @@ public void UpdateUsfm_VersesOutOfOrder() AssertUsfmEquals(target, result); } + [Test] + public void UpdateUsfm_StripParagraphsWithHeaders() + { + IReadOnlyList rows = + [ + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "new verse 1 new paragraph 2", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["verse", "1", "paragraph", "2"], + translationTokens: ["new", "verse", "1", "new", "paragraph", "2"], + alignment: ToWordAlignmentMatrix("0-1 1-2 2-4 3-5"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Strip, + styleBehavior: UpdateUsfmMarkerBehavior.Preserve + ) + } + } + ), + ]; + string usfm = + @"\id MAT +\c 1 +\v 1 verse 1 +\s header +\p paragraph 2 +\v 2 verse 2 +"; + + string target = UpdateUsfm( + rows, + usfm, + paragraphBehavior: UpdateUsfmMarkerBehavior.Strip, + styleBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] + ); + + string result = + @"\id MAT +\c 1 +\v 1 new verse 1 new paragraph 2 +\s header +\p +\v 2 verse 2 +"; + + AssertUsfmEquals(target, result); + } + private static ScriptureRef[] ScrRef(params string[] refs) { return refs.Select(r => ScriptureRef.Parse(r)).ToArray(); @@ -653,7 +826,7 @@ private static WordAlignmentMatrix ToWordAlignmentMatrix(string alignment) } private static string UpdateUsfm( - IReadOnlyList<(IReadOnlyList, string)> rows, + IReadOnlyList rows, string source, string? 
idText = null, UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferNew, diff --git a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index 0372126b..4a9407f5 100644 --- a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -9,9 +9,9 @@ public class UpdateUsfmParserHandlerTests [Test] public void GetUsfm_Verse_CharStyle() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:1"), "First verse of the first chapter.") + new UpdateUsfmRow(ScrRef("MAT 1:1"), "First verse of the first chapter.") }; string target = UpdateUsfm(rows); @@ -34,10 +34,10 @@ public void GetUsfm_IdText() [Test] public void GetUsfm_StripAllText() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:1"), "Update 1"), - (ScrRef("MAT 1:3"), "Update 3") + new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), + new UpdateUsfmRow(ScrRef("MAT 1:3"), "Update 3") }; var usfm = @"\id MAT - Test @@ -103,11 +103,11 @@ public void GetUsfm_StripAllText() [Test] public void GetUsfm_StripParagraphs_PreserveParagraphStyles() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:0/1:rem"), "New remark"), - (ScrRef("MAT 1:0/3:ip"), "Another new remark"), - (ScrRef("MAT 1:1"), "Update 1"), + new UpdateUsfmRow(ScrRef("MAT 1:0/1:rem"), "New remark"), + new UpdateUsfmRow(ScrRef("MAT 1:0/3:ip"), "Another new remark"), + new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), }; string usfm = @"\id MAT @@ -157,10 +157,10 @@ public void GetUsfm_StripParagraphs_PreserveParagraphStyles() [Test] public void GetUsfm_PreserveParagraphs() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:0/1:rem"), "Update remark"), - (ScrRef("MAT 1:1"), "Update 1"), + new UpdateUsfmRow(ScrRef("MAT 1:0/1:rem"), "Update remark"), 
+ new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), }; string usfm = @"\id MAT @@ -204,7 +204,7 @@ public void GetUsfm_PreserveParagraphs() [Test] public void GetUsfm_ParagraphInVerse() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1"), }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), }; string usfm = @"\id MAT - Test \c 1 @@ -248,10 +248,10 @@ public void GetUsfm_ParagraphInVerse() [Test] public void GetUsfm_PreferExisting() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:1"), "Update 1"), - (ScrRef("MAT 1:2"), "Update 2"), + new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), + new UpdateUsfmRow(ScrRef("MAT 1:2"), "Update 2"), }; var usfm = @"\id MAT - Test @@ -274,10 +274,10 @@ public void GetUsfm_PreferExisting() [Test] public void GetUsfm_PreferRows() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:6"), "Text 6"), - (ScrRef("MAT 1:7"), "Text 7"), + new UpdateUsfmRow(ScrRef("MAT 1:6"), "Text 6"), + new UpdateUsfmRow(ScrRef("MAT 1:7"), "Text 7"), }; string target = UpdateUsfm(rows, textBehavior: UpdateUsfmTextBehavior.PreferNew); Assert.That(target, Contains.Substring("\\id MAT - Test\r\n")); @@ -288,9 +288,9 @@ public void GetUsfm_PreferRows() [Test] public void GetUsfm_Verse_StripNote() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:1"), "First verse of the second chapter.") + new UpdateUsfmRow(ScrRef("MAT 2:1"), "First verse of the second chapter.") }; string target = UpdateUsfm(rows, embedBehavior: UpdateUsfmMarkerBehavior.Strip); @@ -300,7 +300,7 @@ public void GetUsfm_Verse_StripNote() [Test] public void GetUsfm_Verse_ReplaceWithNote() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "updated text") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "updated text") }; var usfm = @"\id MAT - Test \c 1 @@ -318,9 +318,9 @@ public void 
GetUsfm_Verse_ReplaceWithNote() [Test] public void GetUsfm_Verse_RowVerseSegment() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:1a"), "First verse of the second chapter.") + new UpdateUsfmRow(ScrRef("MAT 2:1a"), "First verse of the second chapter.") }; string target = UpdateUsfm(rows); @@ -335,9 +335,9 @@ public void GetUsfm_Verse_RowVerseSegment() [Test] public void GetUsfm_Verse_UsfmVerseSegment() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:7"), "Seventh verse of the second chapter.") + new UpdateUsfmRow(ScrRef("MAT 2:7"), "Seventh verse of the second chapter.") }; string target = UpdateUsfm(rows); @@ -347,9 +347,9 @@ public void GetUsfm_Verse_UsfmVerseSegment() [Test] public void GetUsfm_Verse_MultipleParas() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:2"), "Second verse of the first chapter.") + new UpdateUsfmRow(ScrRef("MAT 1:2"), "Second verse of the first chapter.") }; string target = UpdateUsfm(rows); @@ -364,9 +364,9 @@ public void GetUsfm_Verse_MultipleParas() [Test] public void GetUsfm_Verse_Table() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:9"), "Ninth verse of the second chapter.") + new UpdateUsfmRow(ScrRef("MAT 2:9"), "Ninth verse of the second chapter.") }; string target = UpdateUsfm(rows); @@ -376,9 +376,9 @@ public void GetUsfm_Verse_Table() [Test] public void GetUsfm_Verse_RangeSingleRowMultipleVerses() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - ( + new UpdateUsfmRow( ScrRef("MAT 2:11", "MAT 2:12"), "Eleventh verse of the second chapter. Twelfth verse of the second chapter." 
) @@ -396,9 +396,9 @@ public void GetUsfm_Verse_RangeSingleRowMultipleVerses() [Test] public void GetUsfm_Verse_RangeSingleRowSingleVerse() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:11"), "Eleventh verse of the second chapter.") + new UpdateUsfmRow(ScrRef("MAT 2:11"), "Eleventh verse of the second chapter.") }; string target = UpdateUsfm(rows); @@ -408,10 +408,10 @@ public void GetUsfm_Verse_RangeSingleRowSingleVerse() [Test] public void GetUsfm_Verse_RangeMultipleRowsSingleVerse() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:11"), "Eleventh verse of the second chapter."), - (ScrRef("MAT 2:12"), "Twelfth verse of the second chapter.") + new UpdateUsfmRow(ScrRef("MAT 2:11"), "Eleventh verse of the second chapter."), + new UpdateUsfmRow(ScrRef("MAT 2:12"), "Twelfth verse of the second chapter.") }; string target = UpdateUsfm(rows); @@ -426,11 +426,11 @@ public void GetUsfm_Verse_RangeMultipleRowsSingleVerse() [Test] public void GetUsfm_MergeVerseSegments() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:2"), "Verse 2."), - (ScrRef("MAT 2:2a"), "Verse 2a."), - (ScrRef("MAT 2:2b"), "Verse 2b.") + new UpdateUsfmRow(ScrRef("MAT 2:2"), "Verse 2."), + new UpdateUsfmRow(ScrRef("MAT 2:2a"), "Verse 2a."), + new UpdateUsfmRow(ScrRef("MAT 2:2b"), "Verse 2b.") }; string target = UpdateUsfm(rows); @@ -440,10 +440,10 @@ public void GetUsfm_MergeVerseSegments() [Test] public void GetUsfm_Verse_OptBreak() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:2"), "Second verse of the second chapter."), - (ScrRef("MAT 2:3"), "Third verse of the second chapter.") + new UpdateUsfmRow(ScrRef("MAT 2:2"), "Second verse of the second chapter."), + new UpdateUsfmRow(ScrRef("MAT 2:3"), "Third verse of the second chapter.") }; string target = UpdateUsfm(rows, embedBehavior: UpdateUsfmMarkerBehavior.Strip); @@ -456,9 +456,9 
@@ public void GetUsfm_Verse_OptBreak() [Test] public void GetUsfm_Verse_Milestone() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:10"), "Tenth verse of the second chapter.") + new UpdateUsfmRow(ScrRef("MAT 2:10"), "Tenth verse of the second chapter.") }; string target = UpdateUsfm(rows); @@ -471,9 +471,9 @@ public void GetUsfm_Verse_Milestone() [Test] public void GetUsfm_Verse_Unmatched() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:3"), "Third verse of the first chapter.") + new UpdateUsfmRow(ScrRef("MAT 1:3"), "Third verse of the first chapter.") }; string target = UpdateUsfm(rows); @@ -483,7 +483,7 @@ public void GetUsfm_Verse_Unmatched() [Test] public void GetUsfm_NonVerse_CharStyle() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 2:0/3:s1"), "The second chapter.") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 2:0/3:s1"), "The second chapter.") }; string target = UpdateUsfm(rows); Assert.That(target, Contains.Substring("\\s1 The second chapter.\r\n")); @@ -492,7 +492,7 @@ public void GetUsfm_NonVerse_CharStyle() [Test] public void GetUsfm_NonVerse_Paragraph() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:0/8:s"), "The first chapter.") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:0/8:s"), "The first chapter.") }; string target = UpdateUsfm(rows); Assert.That(target, Contains.Substring("\\s The first chapter.\r\n")); @@ -501,13 +501,13 @@ public void GetUsfm_NonVerse_Paragraph() [Test] public void GetUsfm_NonVerse_Relaxed() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:0/s"), "The first chapter."), - (ScrRef("MAT 1:1"), "First verse of the first chapter."), - (ScrRef("MAT 2:0/tr/tc1"), "The first cell of the table."), - (ScrRef("MAT 2:0/tr/tc2"), "The second cell of the table."), - (ScrRef("MAT 2:0/tr/tc1"), "The third cell of the table.") + new UpdateUsfmRow(ScrRef("MAT 
1:0/s"), "The first chapter."), + new UpdateUsfmRow(ScrRef("MAT 1:1"), "First verse of the first chapter."), + new UpdateUsfmRow(ScrRef("MAT 2:0/tr/tc1"), "The first cell of the table."), + new UpdateUsfmRow(ScrRef("MAT 2:0/tr/tc2"), "The second cell of the table."), + new UpdateUsfmRow(ScrRef("MAT 2:0/tr/tc1"), "The third cell of the table.") }; string target = UpdateUsfm(rows); @@ -531,9 +531,9 @@ public void GetUsfm_NonVerse_Relaxed() [Test] public void GetUsfm_NonVerse_Sidebar() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:3/1:esb/1:ms"), "The first paragraph of the sidebar.") + new UpdateUsfmRow(ScrRef("MAT 2:3/1:esb/1:ms"), "The first paragraph of the sidebar.") }; string target = UpdateUsfm(rows); @@ -543,10 +543,10 @@ public void GetUsfm_NonVerse_Sidebar() [Test] public void GetUsfm_NonVerse_Table() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:0/1:tr/1:tc1"), "The first cell of the table."), - (ScrRef("MAT 2:0/2:tr/1:tc1"), "The third cell of the table.") + new UpdateUsfmRow(ScrRef("MAT 2:0/1:tr/1:tc1"), "The first cell of the table."), + new UpdateUsfmRow(ScrRef("MAT 2:0/2:tr/1:tc1"), "The third cell of the table.") }; string target = UpdateUsfm(rows); @@ -563,9 +563,9 @@ public void GetUsfm_NonVerse_Table() [Test] public void GetUsfm_NonVerse_OptBreak() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:3/1:esb/2:p"), "The second paragraph of the sidebar.") + new UpdateUsfmRow(ScrRef("MAT 2:3/1:esb/2:p"), "The second paragraph of the sidebar.") }; string target = UpdateUsfm(rows); @@ -575,10 +575,7 @@ public void GetUsfm_NonVerse_OptBreak() [Test] public void GetUsfm_NonVerse_Milestone() { - var rows = new List<(IReadOnlyList, string)> - { - (ScrRef("MAT 2:7a/1:s"), "A new section header.") - }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 2:7a/1:s"), "A new section header.") }; string target = UpdateUsfm(rows); 
Assert.That(target, Contains.Substring("\\s A new section header. \\ts-s\\*\r\n")); @@ -587,10 +584,7 @@ public void GetUsfm_NonVerse_Milestone() [Test] public void GetUsfm_NonVerse_SkipNote() { - var rows = new List<(IReadOnlyList, string)> - { - (ScrRef("MAT 1:0/3:ip"), "The introductory paragraph.") - }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:0/3:ip"), "The introductory paragraph.") }; string target = UpdateUsfm(rows, embedBehavior: UpdateUsfmMarkerBehavior.Strip); Assert.That(target, Contains.Substring("\\ip The introductory paragraph.\r\n")); @@ -599,10 +593,7 @@ public void GetUsfm_NonVerse_SkipNote() [Test] public void GetUsfm_NonVerse_ReplaceWithNote() { - var rows = new List<(IReadOnlyList, string)> - { - (ScrRef("MAT 1:0/3:ip"), "The introductory paragraph.") - }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:0/3:ip"), "The introductory paragraph.") }; string target = UpdateUsfm(rows); Assert.That( @@ -614,9 +605,9 @@ public void GetUsfm_NonVerse_ReplaceWithNote() [Test] public void GetUsfm_Verse_DoubleVaVp() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 3:1"), "Updating later in the book to start.") + new UpdateUsfmRow(ScrRef("MAT 3:1"), "Updating later in the book to start.") }; string target = UpdateUsfm(rows); @@ -630,7 +621,7 @@ public void GetUsfm_Verse_DoubleVaVp() [Test] public void GetUsfm_Verse_LastSegment() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Updating the last verse.") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Updating the last verse.") }; string usfm = @"\id MAT - Test \c 1 @@ -653,14 +644,14 @@ public void GetUsfm_Verse_LastSegment() [Test] public void GetUsfm_Verse_PretranslationsBeforeText() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("GEN 1:1"), "Pretranslations before the start"), - (ScrRef("GEN 1:2"), "Pretranslations before the start"), - (ScrRef("GEN 1:3"), 
"Pretranslations before the start"), - (ScrRef("GEN 1:4"), "Pretranslations before the start"), - (ScrRef("GEN 1:5"), "Pretranslations before the start"), - (ScrRef("MAT 1:0/3:ip"), "The introductory paragraph.") + new UpdateUsfmRow(ScrRef("GEN 1:1"), "Pretranslations before the start"), + new UpdateUsfmRow(ScrRef("GEN 1:2"), "Pretranslations before the start"), + new UpdateUsfmRow(ScrRef("GEN 1:3"), "Pretranslations before the start"), + new UpdateUsfmRow(ScrRef("GEN 1:4"), "Pretranslations before the start"), + new UpdateUsfmRow(ScrRef("GEN 1:5"), "Pretranslations before the start"), + new UpdateUsfmRow(ScrRef("MAT 1:0/3:ip"), "The introductory paragraph.") }; string target = UpdateUsfm(rows); @@ -673,10 +664,10 @@ public void GetUsfm_Verse_PretranslationsBeforeText() [Test] public void GetUsfm_StripParagraphs() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:0/2:p"), "Update Paragraph"), - (ScrRef("MAT 1:1"), "Update Verse 1") + new UpdateUsfmRow(ScrRef("MAT 1:0/2:p"), "Update Paragraph"), + new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update Verse 1") }; var usfm = @@ -721,9 +712,9 @@ public void GetUsfm_StripParagraphs() [Test] public void GetUsfm_PreservationRawStrings() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:1"), @"Update all in one row \f \fr 1.1 \ft Some note \f*") + new UpdateUsfmRow(ScrRef("MAT 1:1"), @"Update all in one row \f \fr 1.1 \ft Some note \f*") }; var usfm = @@ -744,7 +735,7 @@ public void GetUsfm_PreservationRawStrings() [Test] public void GetUsfm_BeginningOfVerseEmbed() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), @"Updated text") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), @"Updated text") }; var usfm = @"\id MAT - Test @@ -764,10 +755,7 @@ public void GetUsfm_BeginningOfVerseEmbed() [Test] public void CrossReferenceDontUpdate() { - var rows = new List<(IReadOnlyList, string)> - { - (ScrRef("MAT 1:1/1:x"), 
"Update the cross reference"), - }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1/1:x"), "Update the cross reference"), }; var usfm = @"\id MAT - Test \c 1 @@ -785,7 +773,7 @@ public void CrossReferenceDontUpdate() [Test] public void PreserveFig() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update"), }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update"), }; var usfm = @"\id MAT - Test \c 1 @@ -803,10 +791,10 @@ public void PreserveFig() [Test] public void NoteExplicitEndMarkers() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:1"), "Update text"), - (ScrRef("MAT 1:1/1:f"), "Update note"), + new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update text"), + new UpdateUsfmRow(ScrRef("MAT 1:1/1:f"), "Update note"), }; var usfm = @"\id MAT - Test @@ -833,7 +821,7 @@ public void NoteExplicitEndMarkers() [Test] public void UpdateBlock_Verse_PreserveParas() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; var usfm = @"\id MAT - Test \c 1 @@ -863,7 +851,7 @@ public void UpdateBlock_Verse_PreserveParas() [Test] public void UpdateBlock_Verse_StripParas() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; var usfm = @"\id MAT - Test \c 1 @@ -893,7 +881,7 @@ public void UpdateBlock_Verse_StripParas() [Test] public void UpdateBlock_Verse_Range() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; var usfm = @"\id MAT - Test \c 1 @@ -921,7 +909,7 @@ public void UpdateBlock_Verse_Range() [Test] public void UpdateBlock_Footnote_PreserveEmbeds() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var rows = new List { new 
UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; var usfm = @"\id MAT - Test \c 1 @@ -951,7 +939,7 @@ public void UpdateBlock_Footnote_PreserveEmbeds() [Test] public void UpdateBlock_Footnote_StripEmbeds() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; var usfm = @"\id MAT - Test \c 1 @@ -981,10 +969,7 @@ public void UpdateBlock_Footnote_StripEmbeds() [Test] public void UpdateBlock_NonVerse() { - var rows = new List<(IReadOnlyList, string)> - { - (ScrRef("MAT 1:0/1:s"), "Updated section Header") - }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:0/1:s"), "Updated section Header") }; var usfm = @"\id MAT - Test \s Section header @@ -1008,7 +993,7 @@ public void UpdateBlock_NonVerse() [Test] public void UpdateBlock_Verse_PreserveStyles() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; var usfm = @"\id MAT - Test \c 1 @@ -1040,7 +1025,7 @@ public void UpdateBlock_Verse_PreserveStyles() [Test] public void UpdateBlock_Verse_StripStyles() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; var usfm = @"\id MAT - Test \c 1 @@ -1072,7 +1057,7 @@ public void UpdateBlock_Verse_StripStyles() [Test] public void UpdateBlock_Verse_SectionHeader() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; var usfm = @"\id MAT - Test \c 1 @@ -1110,7 +1095,7 @@ public void UpdateBlock_Verse_SectionHeader() [Test] public void UpdateBlock_Verse_SectionHeaderInVerse() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; var usfm = 
@"\id MAT - Test \c 1 @@ -1143,7 +1128,7 @@ public void UpdateBlock_Verse_SectionHeaderInVerse() [Test] public void UpdateBlock_NonVerse_ParagraphEndOfVerse() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; var usfm = @"\id MAT - Test \c 1 @@ -1172,13 +1157,13 @@ public void UpdateBlock_NonVerse_ParagraphEndOfVerse() [Test] public void GetUsfm_HeaderReferenceParagraphs() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:1"), "new verse 1"), - (ScrRef("MAT 1:2"), "new verse 2"), - (ScrRef("MAT 1:3"), "new verse 3"), - (ScrRef("MAT 2:1"), "new verse 1"), - (ScrRef("MAT 2:2"), "new verse 2") + new UpdateUsfmRow(ScrRef("MAT 1:1"), "new verse 1"), + new UpdateUsfmRow(ScrRef("MAT 1:2"), "new verse 2"), + new UpdateUsfmRow(ScrRef("MAT 1:3"), "new verse 3"), + new UpdateUsfmRow(ScrRef("MAT 2:1"), "new verse 1"), + new UpdateUsfmRow(ScrRef("MAT 2:2"), "new verse 2") }; var usfm = @@ -1227,10 +1212,10 @@ public void GetUsfm_HeaderReferenceParagraphs() [Test] public void GetUsfm_PreferExisting_AddRemark() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:1"), "Update 1"), - (ScrRef("MAT 1:2"), "Update 2"), + new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), + new UpdateUsfmRow(ScrRef("MAT 1:2"), "Update 2"), }; var usfm = @"\id MAT - Test @@ -1263,7 +1248,7 @@ private static ScriptureRef[] ScrRef(params string[] refs) } private static string UpdateUsfm( - IReadOnlyList<(IReadOnlyList, string)>? rows = null, + IReadOnlyList? rows = null, string? source = null, string? 
idText = null, UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferNew, diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs index 63fe388a..30d8ecf7 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs @@ -28,9 +28,10 @@ public void ParseParallelCorpusAsync() Assert.That(rows, Has.Count.GreaterThan(0)); // insert the source into the target as pretranslations to make sure that USFM generation works - IReadOnlyList<(IReadOnlyList, string)> pretranslations = rows.Select(r => - ((IReadOnlyList)r.SourceRefs.Select(s => (ScriptureRef)s).ToList(), r.SourceText) - ) + IReadOnlyList pretranslations = rows.Select(r => new UpdateUsfmRow( + (IReadOnlyList)r.SourceRefs.Select(s => (ScriptureRef)s).ToList(), + r.SourceText + )) .ToList(); ParatextProjectSettings targetSettings = new FileParatextProjectSettingsParser( @@ -96,20 +97,17 @@ async Task GetUsfmAsync(string projectPath) // Read text from pretranslations file using Stream pretranslationStream = File.OpenRead(PretranslationPath); - (IReadOnlyList, string)[] pretranslations = await JsonSerializer + UpdateUsfmRow[] pretranslations = await JsonSerializer .DeserializeAsyncEnumerable( pretranslationStream, new JsonSerializerOptions { PropertyNamingPolicy = JsonNamingPolicy.CamelCase } ) - .Select(p => - ( - (IReadOnlyList)( - p?.Refs.Select(r => ScriptureRef.Parse(r, settings.Versification).ToRelaxed()).ToArray() - ?? [] - ), - p?.Translation ?? "" - ) - ) + .Select(p => new UpdateUsfmRow( + (IReadOnlyList)( + p?.Refs.Select(r => ScriptureRef.Parse(r, settings.Versification).ToRelaxed()).ToArray() ?? [] + ), + p?.Translation ?? 
"" + )) .ToArrayAsync(); List bookIds = []; ParatextProjectTextUpdaterBase updater; From 480619e7cf028b3725432ea244b0ed537fa0d1d4 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 6 Aug 2025 12:18:45 -0400 Subject: [PATCH 21/28] Move PunctuationAnalysis out of Corpora --- src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs | 2 +- .../Corpora/ParatextProjectQuoteConventionDetector.cs | 2 +- .../Corpora/QuotationMarkDenormalizationFirstPass.cs | 2 +- .../QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs | 2 +- src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs | 2 +- .../Corpora/QuotationMarkUpdateResolutionSettings.cs | 2 +- .../Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs | 2 +- src/SIL.Machine/{Corpora => }/PunctuationAnalysis/Chapter.cs | 2 +- .../PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs | 2 +- .../PunctuationAnalysis/IQuotationMarkResolutionSettings.cs | 2 +- .../PunctuationAnalysis/IQuotationMarkResolver.cs | 2 +- .../PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs | 2 +- .../PunctuationAnalysis/QuotationMarkDirection.cs | 2 +- .../{Corpora => }/PunctuationAnalysis/QuotationMarkFinder.cs | 2 +- .../{Corpora => }/PunctuationAnalysis/QuotationMarkMetadata.cs | 2 +- .../PunctuationAnalysis/QuotationMarkResolutionIssue.cs | 2 +- .../PunctuationAnalysis/QuotationMarkStringMatch.cs | 2 +- .../PunctuationAnalysis/QuotationMarkTabulator.cs | 2 +- .../{Corpora => }/PunctuationAnalysis/QuoteConvention.cs | 2 +- .../QuoteConventionDetectionResolutionSettings.cs | 2 +- .../PunctuationAnalysis/QuoteConventionDetector.cs | 2 +- .../{Corpora => }/PunctuationAnalysis/QuoteConventionSet.cs | 2 +- .../PunctuationAnalysis/StandardQuoteConventions.cs | 2 +- .../{Corpora => }/PunctuationAnalysis/TextSegment.cs | 3 ++- .../{Corpora => }/PunctuationAnalysis/UsfmMarkerType.cs | 2 +- .../PunctuationAnalysis/UsfmStructureExtractor.cs | 3 ++- src/SIL.Machine/{Corpora => }/PunctuationAnalysis/Verse.cs | 2 +- 
.../Corpora/FallbackQuotationMarkResolverTests.cs | 2 +- .../Corpora/ParatextProjectQuoteConvetionDetectorTests.cs | 2 +- .../SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs | 2 +- .../QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs | 2 +- .../Corpora/QuotationMarkUpdateFirstPassTests.cs | 2 +- .../QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs | 2 +- .../{Corpora => }/PunctuationAnalysis/ChapterTests.cs | 2 +- .../DepthBasedQuotationMarkResolverTests.cs | 2 +- .../PreliminaryQuotationMarkAnalyzerTests.cs | 2 +- .../PunctuationAnalysis/QuotationConventionDetectorTests.cs | 3 ++- .../PunctuationAnalysis/QuotationMarkFinderTests.cs | 2 +- .../PunctuationAnalysis/QuotationMarkMetadataTests.cs | 2 +- .../PunctuationAnalysis/QuotationMarkResolverTests.cs | 2 +- .../PunctuationAnalysis/QuotationMarkStringMatchTests.cs | 2 +- .../PunctuationAnalysis/QuotationMarkTabulatorTests.cs | 2 +- .../PunctuationAnalysis/QuoteConventionSetTests.cs | 2 +- .../{Corpora => }/PunctuationAnalysis/QuoteConventionTests.cs | 2 +- .../{Corpora => }/PunctuationAnalysis/TextSegmentTests.cs | 3 ++- .../PunctuationAnalysis/UsfmStructureExtractorTests.cs | 3 ++- .../{Corpora => }/PunctuationAnalysis/VerseTests.cs | 2 +- 47 files changed, 52 insertions(+), 47 deletions(-) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/Chapter.cs (81%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs (99%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/IQuotationMarkResolutionSettings.cs (93%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/IQuotationMarkResolver.cs (85%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs (99%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/QuotationMarkDirection.cs (63%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/QuotationMarkFinder.cs (98%) rename src/SIL.Machine/{Corpora => 
}/PunctuationAnalysis/QuotationMarkMetadata.cs (98%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/QuotationMarkResolutionIssue.cs (81%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/QuotationMarkStringMatch.cs (99%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/QuotationMarkTabulator.cs (99%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/QuoteConvention.cs (99%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/QuoteConventionDetectionResolutionSettings.cs (97%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/QuoteConventionDetector.cs (98%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/QuoteConventionSet.cs (99%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/StandardQuoteConventions.cs (99%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/TextSegment.cs (98%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/UsfmMarkerType.cs (76%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/UsfmStructureExtractor.cs (98%) rename src/SIL.Machine/{Corpora => }/PunctuationAnalysis/Verse.cs (92%) rename tests/SIL.Machine.Tests/{Corpora => }/PunctuationAnalysis/ChapterTests.cs (95%) rename tests/SIL.Machine.Tests/{Corpora => }/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs (99%) rename tests/SIL.Machine.Tests/{Corpora => }/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs (99%) rename tests/SIL.Machine.Tests/{Corpora => }/PunctuationAnalysis/QuotationConventionDetectorTests.cs (99%) rename tests/SIL.Machine.Tests/{Corpora => }/PunctuationAnalysis/QuotationMarkFinderTests.cs (99%) rename tests/SIL.Machine.Tests/{Corpora => }/PunctuationAnalysis/QuotationMarkMetadataTests.cs (98%) rename tests/SIL.Machine.Tests/{Corpora => }/PunctuationAnalysis/QuotationMarkResolverTests.cs (98%) rename tests/SIL.Machine.Tests/{Corpora => }/PunctuationAnalysis/QuotationMarkStringMatchTests.cs (99%) rename tests/SIL.Machine.Tests/{Corpora => 
}/PunctuationAnalysis/QuotationMarkTabulatorTests.cs (99%) rename tests/SIL.Machine.Tests/{Corpora => }/PunctuationAnalysis/QuoteConventionSetTests.cs (99%) rename tests/SIL.Machine.Tests/{Corpora => }/PunctuationAnalysis/QuoteConventionTests.cs (99%) rename tests/SIL.Machine.Tests/{Corpora => }/PunctuationAnalysis/TextSegmentTests.cs (99%) rename tests/SIL.Machine.Tests/{Corpora => }/PunctuationAnalysis/UsfmStructureExtractorTests.cs (99%) rename tests/SIL.Machine.Tests/{Corpora => }/PunctuationAnalysis/VerseTests.cs (97%) diff --git a/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs b/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs index 65c753c8..a1a09321 100644 --- a/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs +++ b/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs @@ -1,6 +1,6 @@ using System.Collections.Generic; using System.Linq; -using SIL.Machine.Corpora.PunctuationAnalysis; +using SIL.Machine.PunctuationAnalysis; namespace SIL.Machine.Corpora { diff --git a/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs b/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs index 786352a0..db2c6a92 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs @@ -1,7 +1,7 @@ using System; using System.IO; using System.Text; -using SIL.Machine.Corpora.PunctuationAnalysis; +using SIL.Machine.PunctuationAnalysis; namespace SIL.Machine.Corpora { diff --git a/src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs b/src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs index 9a8c0050..c90827d5 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs +++ b/src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs @@ -1,4 +1,4 @@ -using SIL.Machine.Corpora.PunctuationAnalysis; +using SIL.Machine.PunctuationAnalysis; namespace SIL.Machine.Corpora { diff --git 
a/src/SIL.Machine/Corpora/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs index 52bd8abb..f5ac923f 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/Corpora/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs @@ -1,4 +1,4 @@ -using SIL.Machine.Corpora.PunctuationAnalysis; +using SIL.Machine.PunctuationAnalysis; namespace SIL.Machine.Corpora { diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs index 49d55e52..c6f6e796 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs @@ -1,7 +1,7 @@ using System; using System.Collections.Generic; using System.Linq; -using SIL.Machine.Corpora.PunctuationAnalysis; +using SIL.Machine.PunctuationAnalysis; namespace SIL.Machine.Corpora { diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs index 33c065f7..7791d048 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs @@ -1,7 +1,7 @@ using System.Collections.Generic; using System.Text.RegularExpressions; -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public class QuotationMarkUpdateResolutionSettings : IQuotationMarkResolutionSettings { diff --git a/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs index d7b9f289..81e1945e 100644 --- a/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs @@ -1,6 +1,6 @@ using System.Collections.Generic; using System.Linq; -using 
SIL.Machine.Corpora.PunctuationAnalysis; +using SIL.Machine.PunctuationAnalysis; namespace SIL.Machine.Corpora { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/Chapter.cs b/src/SIL.Machine/PunctuationAnalysis/Chapter.cs similarity index 81% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/Chapter.cs rename to src/SIL.Machine/PunctuationAnalysis/Chapter.cs index 8e299d69..77ccf26b 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/Chapter.cs +++ b/src/SIL.Machine/PunctuationAnalysis/Chapter.cs @@ -1,6 +1,6 @@ using System.Collections.Generic; -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public class Chapter { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs b/src/SIL.Machine/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs similarity index 99% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs rename to src/SIL.Machine/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs index a7482bb2..22f89991 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs +++ b/src/SIL.Machine/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs @@ -3,7 +3,7 @@ using System.Linq; using System.Text.RegularExpressions; -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public class QuotationMarkResolverState { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/IQuotationMarkResolutionSettings.cs b/src/SIL.Machine/PunctuationAnalysis/IQuotationMarkResolutionSettings.cs similarity index 93% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/IQuotationMarkResolutionSettings.cs rename to src/SIL.Machine/PunctuationAnalysis/IQuotationMarkResolutionSettings.cs index 19064149..4e8f4721 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/IQuotationMarkResolutionSettings.cs +++ 
b/src/SIL.Machine/PunctuationAnalysis/IQuotationMarkResolutionSettings.cs @@ -1,7 +1,7 @@ using System.Collections.Generic; using System.Text.RegularExpressions; -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public interface IQuotationMarkResolutionSettings { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/IQuotationMarkResolver.cs b/src/SIL.Machine/PunctuationAnalysis/IQuotationMarkResolver.cs similarity index 85% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/IQuotationMarkResolver.cs rename to src/SIL.Machine/PunctuationAnalysis/IQuotationMarkResolver.cs index c7112c33..68c1f69c 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/IQuotationMarkResolver.cs +++ b/src/SIL.Machine/PunctuationAnalysis/IQuotationMarkResolver.cs @@ -1,6 +1,6 @@ using System.Collections.Generic; -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public interface IQuotationMarkResolver { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs b/src/SIL.Machine/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs similarity index 99% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs rename to src/SIL.Machine/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs index f7c78a5e..ab39764e 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs +++ b/src/SIL.Machine/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs @@ -4,7 +4,7 @@ using System.Text.RegularExpressions; using SIL.Extensions; -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public class ApostropheProportionStatistics { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkDirection.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkDirection.cs similarity index 63% rename from 
src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkDirection.cs rename to src/SIL.Machine/PunctuationAnalysis/QuotationMarkDirection.cs index 974955a7..52d63b33 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkDirection.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkDirection.cs @@ -1,4 +1,4 @@ -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public enum QuotationMarkDirection { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkFinder.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs similarity index 98% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkFinder.cs rename to src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs index 85d89d19..44db38e7 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkFinder.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs @@ -3,7 +3,7 @@ using System.Linq; using System.Text.RegularExpressions; -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public class QuotationMarkFinder { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkMetadata.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkMetadata.cs similarity index 98% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkMetadata.cs rename to src/SIL.Machine/PunctuationAnalysis/QuotationMarkMetadata.cs index 991f546d..7114105f 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkMetadata.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkMetadata.cs @@ -1,4 +1,4 @@ -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public class QuotationMarkMetadata { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkResolutionIssue.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkResolutionIssue.cs similarity index 81% rename from 
src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkResolutionIssue.cs rename to src/SIL.Machine/PunctuationAnalysis/QuotationMarkResolutionIssue.cs index 1f06a56d..233dc45c 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkResolutionIssue.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkResolutionIssue.cs @@ -1,4 +1,4 @@ -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public enum QuotationMarkResolutionIssue { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkStringMatch.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs similarity index 99% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkStringMatch.cs rename to src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs index 97cc99af..448aed45 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkStringMatch.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs @@ -3,7 +3,7 @@ using System.Text.RegularExpressions; using System.Unicode; -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public class QuotationMarkStringMatch { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs similarity index 99% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs rename to src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs index 7fd91b27..e12a2054 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuotationMarkTabulator.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs @@ -3,7 +3,7 @@ using System.Text; using SIL.Extensions; -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public class QuotationMarkCounts { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConvention.cs 
b/src/SIL.Machine/PunctuationAnalysis/QuoteConvention.cs similarity index 99% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConvention.cs rename to src/SIL.Machine/PunctuationAnalysis/QuoteConvention.cs index 9e7c02be..c8e17e85 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConvention.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConvention.cs @@ -1,7 +1,7 @@ using System.Collections.Generic; using System.Linq; -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public class SingleLevelQuoteConvention { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetectionResolutionSettings.cs b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetectionResolutionSettings.cs similarity index 97% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetectionResolutionSettings.cs rename to src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetectionResolutionSettings.cs index 05583408..f030b4cc 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetectionResolutionSettings.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetectionResolutionSettings.cs @@ -1,7 +1,7 @@ using System.Collections.Generic; using System.Text.RegularExpressions; -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public class QuoteConventionDetectionResolutionSettings : IQuotationMarkResolutionSettings { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetector.cs b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs similarity index 98% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetector.cs rename to src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs index ef5830cd..7d971d79 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionDetector.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs @@ -1,7 +1,7 
@@ using System.Collections.Generic; using System.Linq; -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public class QuoteConventionAnalysis { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionSet.cs b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionSet.cs similarity index 99% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionSet.cs rename to src/SIL.Machine/PunctuationAnalysis/QuoteConventionSet.cs index ebec0b34..41edcda9 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/QuoteConventionSet.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionSet.cs @@ -4,7 +4,7 @@ using System.Text.RegularExpressions; using SIL.Extensions; -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public class QuoteConventionSet { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/StandardQuoteConventions.cs b/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs similarity index 99% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/StandardQuoteConventions.cs rename to src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs index c3e3f974..041081c9 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/StandardQuoteConventions.cs +++ b/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs @@ -1,6 +1,6 @@ using System.Collections.Generic; -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public class StandardQuoteConventions { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/TextSegment.cs b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs similarity index 98% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/TextSegment.cs rename to src/SIL.Machine/PunctuationAnalysis/TextSegment.cs index be35bca6..8f886ac5 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/TextSegment.cs +++ b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs 
@@ -1,6 +1,7 @@ using System.Collections.Generic; +using SIL.Machine.Corpora; -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public class TextSegment { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmMarkerType.cs b/src/SIL.Machine/PunctuationAnalysis/UsfmMarkerType.cs similarity index 76% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmMarkerType.cs rename to src/SIL.Machine/PunctuationAnalysis/UsfmMarkerType.cs index f3fe1742..5e61d470 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmMarkerType.cs +++ b/src/SIL.Machine/PunctuationAnalysis/UsfmMarkerType.cs @@ -1,4 +1,4 @@ -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public enum UsfmMarkerType { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmStructureExtractor.cs b/src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs similarity index 98% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmStructureExtractor.cs rename to src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs index 98cc4bcf..ce2d6cd7 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/UsfmStructureExtractor.cs +++ b/src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs @@ -1,6 +1,7 @@ using System.Collections.Generic; +using SIL.Machine.Corpora; -namespace SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public class UsfmStructureExtractor : IUsfmParserHandler { diff --git a/src/SIL.Machine/Corpora/PunctuationAnalysis/Verse.cs b/src/SIL.Machine/PunctuationAnalysis/Verse.cs similarity index 92% rename from src/SIL.Machine/Corpora/PunctuationAnalysis/Verse.cs rename to src/SIL.Machine/PunctuationAnalysis/Verse.cs index 40f63776..47b55766 100644 --- a/src/SIL.Machine/Corpora/PunctuationAnalysis/Verse.cs +++ b/src/SIL.Machine/PunctuationAnalysis/Verse.cs @@ -1,7 +1,7 @@ using System.Collections.Generic; using System.Linq; -namespace 
SIL.Machine.Corpora.PunctuationAnalysis +namespace SIL.Machine.PunctuationAnalysis { public class Verse { diff --git a/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs b/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs index ac0bd815..a323dbc5 100644 --- a/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs @@ -1,5 +1,5 @@ using NUnit.Framework; -using SIL.Machine.Corpora.PunctuationAnalysis; +using SIL.Machine.PunctuationAnalysis; namespace SIL.Machine.Corpora; diff --git a/tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs b/tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs index 09fff927..46a75faa 100644 --- a/tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs @@ -1,6 +1,6 @@ using System.Text; using NUnit.Framework; -using SIL.Machine.Corpora.PunctuationAnalysis; +using SIL.Machine.PunctuationAnalysis; using SIL.Scripture; namespace SIL.Machine.Corpora; diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs index 4f2fd136..73f27655 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs @@ -1,5 +1,5 @@ using NUnit.Framework; -using SIL.Machine.Corpora.PunctuationAnalysis; +using SIL.Machine.PunctuationAnalysis; namespace SIL.Machine.Corpora; diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs index 5bb2d5a2..599e2699 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs +++ 
b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs @@ -1,5 +1,5 @@ using NUnit.Framework; -using SIL.Machine.Corpora.PunctuationAnalysis; +using SIL.Machine.PunctuationAnalysis; namespace SIL.Machine.Corpora; diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs index c38d349a..e7922cec 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs @@ -1,5 +1,5 @@ using NUnit.Framework; -using SIL.Machine.Corpora.PunctuationAnalysis; +using SIL.Machine.PunctuationAnalysis; namespace SIL.Machine.Corpora; diff --git a/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs index 3a2284ad..94f73444 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs @@ -1,5 +1,5 @@ using NUnit.Framework; -using SIL.Machine.Corpora.PunctuationAnalysis; +using SIL.Machine.PunctuationAnalysis; namespace SIL.Machine.Corpora; diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/ChapterTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/ChapterTests.cs similarity index 95% rename from tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/ChapterTests.cs rename to tests/SIL.Machine.Tests/PunctuationAnalysis/ChapterTests.cs index c7650538..d06e55f4 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/ChapterTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/ChapterTests.cs @@ -1,6 +1,6 @@ using NUnit.Framework; -namespace SIL.Machine.Corpora.PunctuationAnalysis; +namespace SIL.Machine.PunctuationAnalysis; [TestFixture] public class ChapterTests diff --git 
a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs similarity index 99% rename from tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs rename to tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs index 7775e9a4..d3689202 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs @@ -1,7 +1,7 @@ using System.Unicode; using NUnit.Framework; -namespace SIL.Machine.Corpora.PunctuationAnalysis; +namespace SIL.Machine.PunctuationAnalysis; [TestFixture] public class DepthBasedQuotationMarkResolverTests diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs similarity index 99% rename from tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs rename to tests/SIL.Machine.Tests/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs index 97e23bb2..29d04e55 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs @@ -1,6 +1,6 @@ using NUnit.Framework; -namespace SIL.Machine.Corpora.PunctuationAnalysis; +namespace SIL.Machine.PunctuationAnalysis; [TestFixture] public class PreliminaryQuotationMarkAnalyzerTests diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationConventionDetectorTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs similarity index 99% rename from tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationConventionDetectorTests.cs rename to 
tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs index 9891b6af..fc75c264 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationConventionDetectorTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs @@ -1,6 +1,7 @@ using NUnit.Framework; +using SIL.Machine.Corpora; -namespace SIL.Machine.Corpora.PunctuationAnalysis; +namespace SIL.Machine.PunctuationAnalysis; [TestFixture] public class QuotationConventionDetectorTests diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkFinderTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs similarity index 99% rename from tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkFinderTests.cs rename to tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs index 4ebd19e8..a1cbdc5c 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkFinderTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs @@ -1,6 +1,6 @@ using NUnit.Framework; -namespace SIL.Machine.Corpora.PunctuationAnalysis; +namespace SIL.Machine.PunctuationAnalysis; [TestFixture] public class QuotationMarkFinderTests diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkMetadataTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkMetadataTests.cs similarity index 98% rename from tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkMetadataTests.cs rename to tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkMetadataTests.cs index 9c195afe..3fca092d 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkMetadataTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkMetadataTests.cs @@ -1,6 +1,6 @@ using NUnit.Framework; -namespace SIL.Machine.Corpora.PunctuationAnalysis; +namespace SIL.Machine.PunctuationAnalysis; [TestFixture] public class 
QuotationMarkMetadataTests diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkResolverTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkResolverTests.cs similarity index 98% rename from tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkResolverTests.cs rename to tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkResolverTests.cs index 371e098f..e75bf30f 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkResolverTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkResolverTests.cs @@ -1,6 +1,6 @@ using NUnit.Framework; -namespace SIL.Machine.Corpora.PunctuationAnalysis; +namespace SIL.Machine.PunctuationAnalysis; [TestFixture] public class QuotationMarkResolverTests diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkStringMatchTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs similarity index 99% rename from tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkStringMatchTests.cs rename to tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs index d4a1fa3e..26fc1c37 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkStringMatchTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs @@ -1,7 +1,7 @@ using System.Text.RegularExpressions; using NUnit.Framework; -namespace SIL.Machine.Corpora.PunctuationAnalysis; +namespace SIL.Machine.PunctuationAnalysis; [TestFixture] public class QuotationMarkStringMatchTests diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkTabulatorTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkTabulatorTests.cs similarity index 99% rename from tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkTabulatorTests.cs rename to tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkTabulatorTests.cs index 
9c52899b..c3daec46 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuotationMarkTabulatorTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkTabulatorTests.cs @@ -1,6 +1,6 @@ using NUnit.Framework; -namespace SIL.Machine.Corpora.PunctuationAnalysis; +namespace SIL.Machine.PunctuationAnalysis; [TestFixture] public class QuotationMarkTabulatorTests diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionSetTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionSetTests.cs similarity index 99% rename from tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionSetTests.cs rename to tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionSetTests.cs index 3206cf3e..ed25d4e2 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionSetTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionSetTests.cs @@ -2,7 +2,7 @@ using System.Text.RegularExpressions; using NUnit.Framework; -namespace SIL.Machine.Corpora.PunctuationAnalysis; +namespace SIL.Machine.PunctuationAnalysis; [TestFixture] public class QuoteConventionSetTests diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionTests.cs similarity index 99% rename from tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionTests.cs rename to tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionTests.cs index 52e54785..e7953468 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/QuoteConventionTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionTests.cs @@ -1,6 +1,6 @@ using NUnit.Framework; -namespace SIL.Machine.Corpora.PunctuationAnalysis; +namespace SIL.Machine.PunctuationAnalysis; [TestFixture] public class QuoteConventionTests diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/TextSegmentTests.cs 
b/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs similarity index 99% rename from tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/TextSegmentTests.cs rename to tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs index d1a6362c..967c0f3b 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/TextSegmentTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs @@ -1,6 +1,7 @@ using NUnit.Framework; +using SIL.Machine.Corpora; -namespace SIL.Machine.Corpora.PunctuationAnalysis; +namespace SIL.Machine.PunctuationAnalysis; [TestFixture] public class TextSegmentTests diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/UsfmStructureExtractorTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs similarity index 99% rename from tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/UsfmStructureExtractorTests.cs rename to tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs index 042c5799..cc7bea18 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/UsfmStructureExtractorTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs @@ -1,7 +1,8 @@ using NUnit.Framework; +using SIL.Machine.Corpora; using SIL.Scripture; -namespace SIL.Machine.Corpora.PunctuationAnalysis; +namespace SIL.Machine.PunctuationAnalysis; [TestFixture] public class UsfmStructureExtractorTests diff --git a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/VerseTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/VerseTests.cs similarity index 97% rename from tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/VerseTests.cs rename to tests/SIL.Machine.Tests/PunctuationAnalysis/VerseTests.cs index d072fae1..4352c814 100644 --- a/tests/SIL.Machine.Tests/Corpora/PunctuationAnalysis/VerseTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/VerseTests.cs @@ -1,6 +1,6 @@ using NUnit.Framework; -namespace 
SIL.Machine.Corpora.PunctuationAnalysis; +namespace SIL.Machine.PunctuationAnalysis; [TestFixture] public class VerseTests From 26a96b691381a07a67696f9627fd524466e5df7b Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 6 Aug 2025 14:07:16 -0400 Subject: [PATCH 22/28] Respond to reviewer comments --- .../Corpora/FallbackQuotationMarkResolver.cs | 2 +- .../Corpora/QuotationMarkUpdateFirstPass.cs | 6 +- ...onventionChangingUsfmUpdateBlockHandler.cs | 60 +++---- src/SIL.Machine/Corpora/UsfmToken.cs | 24 +-- .../PunctuationAnalysis/Chapter.cs | 5 +- .../DepthBasedQuotationMarkResolver.cs | 55 ++++--- .../IQuotationMarkResolver.cs | 4 +- .../PreliminaryQuotationMarkAnalyzer.cs | 46 +++--- .../QuotationMarkFinder.cs | 2 +- .../QuotationMarkMetadata.cs | 9 +- .../QuoteConventionDetector.cs | 7 +- .../PunctuationAnalysis/QuoteConventionSet.cs | 12 +- .../StandardQuoteConventions.cs | 4 +- .../PunctuationAnalysis/TextSegment.cs | 22 ++- src/SIL.Machine/PunctuationAnalysis/Verse.cs | 2 +- .../FallbackQuotationMarkResolverTests.cs | 48 ++---- .../Corpora/QuotationDenormalizationTests.cs | 4 +- ...ormalizationUsfmBlockUpdateHandlerTests.cs | 2 +- .../QuotationMarkUpdateFirstPassTests.cs | 18 +-- ...tionChangingUsfmBlockUpdateHandlerTests.cs | 67 ++++---- .../DepthBasedQuotationMarkResolverTests.cs | 150 +++++------------- .../QuotationMarkFinderTests.cs | 23 +-- .../QuotationMarkMetadataTests.cs | 2 +- .../QuotationMarkResolverTests.cs | 16 +- 24 files changed, 259 insertions(+), 331 deletions(-) diff --git a/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs b/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs index a1a09321..7e4a1af6 100644 --- a/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs +++ b/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs @@ -24,7 +24,7 @@ public void Reset() } public IEnumerable ResolveQuotationMarks( - List quotationMarkMatches + IReadOnlyList quotationMarkMatches ) { foreach (QuotationMarkStringMatch quoteMatch in 
quotationMarkMatches) diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs index c6f6e796..f5106501 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs @@ -8,19 +8,15 @@ namespace SIL.Machine.Corpora // Determines the best strategy to take for each chapter public class QuotationMarkUpdateFirstPass : UsfmStructureExtractor { - private readonly QuoteConvention _sourceQuoteConvention; - private readonly QuoteConvention _targetQuoteConvention; private readonly QuotationMarkFinder _quotationMarkFinder; private readonly DepthBasedQuotationMarkResolver _quotationMarkResolver; - public bool WillFallbackModeWork; + public bool WillFallbackModeWork { get; set; } public QuotationMarkUpdateFirstPass( QuoteConvention sourceQuoteConvention, QuoteConvention targetQuoteConvention ) { - _sourceQuoteConvention = sourceQuoteConvention; - _targetQuoteConvention = targetQuoteConvention; _quotationMarkFinder = new QuotationMarkFinder( new QuoteConventionSet(new List { sourceQuoteConvention, targetQuoteConvention }) ); diff --git a/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs index 81e1945e..0817854d 100644 --- a/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs @@ -9,13 +9,13 @@ public class QuoteConventionChangingUsfmUpdateBlockHandler : IUsfmUpdateBlockHan private readonly QuoteConvention _sourceQuoteConvention; private readonly QuoteConvention _targetQuoteConvention; private readonly QuotationMarkUpdateSettings _settings; - protected QuotationMarkFinder _quotationMarkFinder; - protected TextSegment.Builder _nextScriptureTextSegmentBuilder; - protected IQuotationMarkResolver _verseTextQuotationMarkResolver; + protected 
QuotationMarkFinder QuotationMarkFinder { get; set; } + protected TextSegment.Builder NextScriptureTextSegmentBuilder { get; set; } + protected IQuotationMarkResolver VerseTextQuotationMarkResolver { get; set; } private readonly IQuotationMarkResolver _embedQuotationMarkResolver; private readonly IQuotationMarkResolver _simpleQuotationMarkResolver; - protected QuotationMarkUpdateStrategy _currentStrategy; - protected int _currentChapterNumber; + protected QuotationMarkUpdateStrategy CurrentStrategy { get; set; } + protected int CurrentChapterNumber { get; set; } private int _currentVerseNumber; public QuoteConventionChangingUsfmUpdateBlockHandler( @@ -28,11 +28,11 @@ QuotationMarkUpdateSettings settings _targetQuoteConvention = targetQuoteConvention; _settings = settings; - _quotationMarkFinder = new QuotationMarkFinder( + QuotationMarkFinder = new QuotationMarkFinder( new QuoteConventionSet(new List { _sourceQuoteConvention }) ); - _nextScriptureTextSegmentBuilder = new TextSegment.Builder(); + NextScriptureTextSegmentBuilder = new TextSegment.Builder(); IQuotationMarkResolutionSettings resolutionSettings = new QuotationMarkUpdateResolutionSettings( sourceQuoteConvention @@ -41,12 +41,12 @@ QuotationMarkUpdateSettings settings // Each embed represents a separate context for quotation marks // (i.e. you can't open a quote in one context and close it in another) // so we need to keep track of the verse and embed contexts separately. 
- _verseTextQuotationMarkResolver = new DepthBasedQuotationMarkResolver(resolutionSettings); + VerseTextQuotationMarkResolver = new DepthBasedQuotationMarkResolver(resolutionSettings); _embedQuotationMarkResolver = new DepthBasedQuotationMarkResolver(resolutionSettings); _simpleQuotationMarkResolver = new FallbackQuotationMarkResolver(resolutionSettings); - _currentStrategy = QuotationMarkUpdateStrategy.ApplyFull; - _currentChapterNumber = 0; + CurrentStrategy = QuotationMarkUpdateStrategy.ApplyFull; + CurrentChapterNumber = 0; _currentVerseNumber = 0; } @@ -54,9 +54,9 @@ public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block) { CheckForChapterChange(block); CheckForVerseChange(block); - if (_currentStrategy == QuotationMarkUpdateStrategy.Skip) + if (CurrentStrategy == QuotationMarkUpdateStrategy.Skip) return block; - if (_currentStrategy == QuotationMarkUpdateStrategy.ApplyFallback) + if (CurrentStrategy == QuotationMarkUpdateStrategy.ApplyFallback) { return ApplyFallbackUpdating(block); } @@ -81,7 +81,7 @@ private UsfmUpdateBlock ApplyStandardUpdating(UsfmUpdateBlock block) } else { - ProcessScriptureElement(element, _verseTextQuotationMarkResolver); + ProcessScriptureElement(element, VerseTextQuotationMarkResolver); } } return block; @@ -94,7 +94,7 @@ IQuotationMarkResolver quotationMarkResolver { List textSegments = CreateTextSegments(element); List quotationMarkMatches = - _quotationMarkFinder.FindAllPotentialQuotationMarksInTextSegments(textSegments); + QuotationMarkFinder.FindAllPotentialQuotationMarksInTextSegments(textSegments); List resolvedQuotationMarkMatches = quotationMarkResolver .ResolveQuotationMarks(quotationMarkMatches) .ToList(); @@ -109,16 +109,16 @@ protected List CreateTextSegments(UsfmUpdateBlockElement element) switch (token.Type) { case UsfmTokenType.Verse: - _nextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Verse); + NextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Verse); break; case 
UsfmTokenType.Paragraph: - _nextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Paragraph); + NextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Paragraph); break; case UsfmTokenType.Character: - _nextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Character); + NextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Character); break; case UsfmTokenType.Note: - _nextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed); + NextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed); break; case UsfmTokenType.Text: TextSegment textSegment = CreateTextSegment(token); @@ -167,13 +167,13 @@ int shiftAmount protected TextSegment CreateTextSegment(UsfmToken token) { TextSegment textSegmentToReturn = null; - _nextScriptureTextSegmentBuilder.SetUsfmToken(token); + NextScriptureTextSegmentBuilder.SetUsfmToken(token); if (token.Text != null) { - _nextScriptureTextSegmentBuilder.SetText(token.Text); - textSegmentToReturn = _nextScriptureTextSegmentBuilder.Build(); + NextScriptureTextSegmentBuilder.SetText(token.Text); + textSegmentToReturn = NextScriptureTextSegmentBuilder.Build(); } - _nextScriptureTextSegmentBuilder = new TextSegment.Builder(); + NextScriptureTextSegmentBuilder = new TextSegment.Builder(); return textSegmentToReturn; } @@ -193,7 +193,7 @@ protected void CheckForChapterChange(UsfmUpdateBlock block) { foreach (ScriptureRef scriptureRef in block.Refs) { - if (scriptureRef.ChapterNum != _currentChapterNumber) + if (scriptureRef.ChapterNum != CurrentChapterNumber) { StartNewChapter(scriptureRef.ChapterNum); } @@ -202,18 +202,18 @@ protected void CheckForChapterChange(UsfmUpdateBlock block) protected void StartNewChapter(int newChapterNumber) { - _currentChapterNumber = newChapterNumber; - _currentStrategy = _settings.GetActionForChapter(newChapterNumber); - _verseTextQuotationMarkResolver.Reset(); - _nextScriptureTextSegmentBuilder = new TextSegment.Builder(); - 
_nextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Chapter); + CurrentChapterNumber = newChapterNumber; + CurrentStrategy = _settings.GetActionForChapter(newChapterNumber); + VerseTextQuotationMarkResolver.Reset(); + NextScriptureTextSegmentBuilder = new TextSegment.Builder(); + NextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Chapter); } private void CheckForVerseChange(UsfmUpdateBlock block) { foreach (ScriptureRef scriptureRef in block.Refs) { - if (scriptureRef.ChapterNum == _currentChapterNumber && scriptureRef.VerseNum != _currentVerseNumber) + if (scriptureRef.ChapterNum == CurrentChapterNumber && scriptureRef.VerseNum != _currentVerseNumber) { StartNewVerse(scriptureRef.VerseNum); } @@ -223,7 +223,7 @@ private void CheckForVerseChange(UsfmUpdateBlock block) private void StartNewVerse(int newVerseNumber) { _currentVerseNumber = newVerseNumber; - _nextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Verse); + NextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Verse); } } } diff --git a/src/SIL.Machine/Corpora/UsfmToken.cs b/src/SIL.Machine/Corpora/UsfmToken.cs index 2bc97322..90b934f2 100644 --- a/src/SIL.Machine/Corpora/UsfmToken.cs +++ b/src/SIL.Machine/Corpora/UsfmToken.cs @@ -1,4 +1,5 @@ -using System.Collections.Generic; +using System; +using System.Collections.Generic; using System.Linq; using System.Text; using System.Text.RegularExpressions; @@ -21,7 +22,7 @@ public enum UsfmTokenType Unknown } - public class UsfmToken + public class UsfmToken : IEquatable { private const string FullAttributeStr = @"(?[-\w]+)\s*\=\s*\""(?.+?)\""\s*"; private static readonly Regex AttributeRegex = new Regex( @@ -68,17 +69,22 @@ public override bool Equals(object obj) { if (obj is UsfmToken other) { - return Type == other.Type - && Marker == other.Marker - && Text == other.Text - && EndMarker == other.EndMarker - && Data == other.Data - && LineNumber == other.LineNumber - && ColumnNumber == other.ColumnNumber; 
+ return Equals(other); } return false; } + public bool Equals(UsfmToken other) + { + return Type == other.Type + && Marker == other.Marker + && Text == other.Text + && EndMarker == other.EndMarker + && Data == other.Data + && LineNumber == other.LineNumber + && ColumnNumber == other.ColumnNumber; + } + public override int GetHashCode() { int hashCode = 23; diff --git a/src/SIL.Machine/PunctuationAnalysis/Chapter.cs b/src/SIL.Machine/PunctuationAnalysis/Chapter.cs index 77ccf26b..a5c5bc62 100644 --- a/src/SIL.Machine/PunctuationAnalysis/Chapter.cs +++ b/src/SIL.Machine/PunctuationAnalysis/Chapter.cs @@ -1,12 +1,13 @@ using System.Collections.Generic; +using System.Linq; namespace SIL.Machine.PunctuationAnalysis { public class Chapter { - public Chapter(List verses) + public Chapter(IEnumerable verses) { - Verses = verses; + Verses = verses.ToList(); } public List Verses { get; set; } diff --git a/src/SIL.Machine/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs b/src/SIL.Machine/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs index 22f89991..e48e6a7e 100644 --- a/src/SIL.Machine/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs +++ b/src/SIL.Machine/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Generic; +using System.Collections.Immutable; using System.Linq; using System.Text.RegularExpressions; @@ -7,19 +8,24 @@ namespace SIL.Machine.PunctuationAnalysis { public class QuotationMarkResolverState { - public Stack Quotations { get; private set; } + public ImmutableStack Quotations + { + get => ImmutableStack.CreateRange(_quotations); + } + + private readonly Stack _quotations; public QuotationMarkResolverState() { - Reset(); + _quotations = new Stack(); } public void Reset() { - Quotations = new Stack(); + _quotations.Clear(); } - public int CurrentDepth => Quotations.Count; + public int CurrentDepth => _quotations.Count; public bool HasOpenQuotationMark => CurrentDepth > 0; @@ -31,7 
+37,7 @@ public QuotationMarkMetadata AddOpeningQuotationMark(QuotationMarkStringMatch qu CurrentDepth + 1, QuotationMarkDirection.Opening ); - Quotations.Push(quotationMark); + _quotations.Push(quotationMark); return quotationMark; } @@ -41,7 +47,7 @@ public QuotationMarkMetadata AddClosingQuotationMark(QuotationMarkStringMatch qu CurrentDepth, QuotationMarkDirection.Closing ); - Quotations.Pop(); + _quotations.Pop(); return quotationMark; } @@ -54,7 +60,7 @@ public string GetOpeningQuotationMarkAtDepth(int depth) ); } // Stack is stored in reverse order - return Quotations.ToArray()[CurrentDepth - depth].QuotationMark; + return _quotations.ToArray()[CurrentDepth - depth].QuotationMark; } public string GetDeepestOpeningQuotationMark() @@ -65,7 +71,7 @@ public string GetDeepestOpeningQuotationMark() "The deepest opening quotation mark was requested from an empty quotation stack." ); } - return Quotations.Peek().QuotationMark; + return _quotations.Peek().QuotationMark; } } @@ -78,24 +84,29 @@ public enum QuoteContinuerStyle public class QuoteContinuerState { - public Stack QuoteContinuerMarks { get; private set; } + private readonly Stack _quoteContinuerMarks; + public ImmutableStack QuoteContinuerMarks + { + get => ImmutableStack.CreateRange(_quoteContinuerMarks); + } public QuoteContinuerStyle ContinuerStyle { get; protected set; } - public int CurrentDepth => QuoteContinuerMarks.Count; + public int CurrentDepth => _quoteContinuerMarks.Count; public QuoteContinuerState() { - Reset(); + _quoteContinuerMarks = new Stack(); + ContinuerStyle = QuoteContinuerStyle.Undetermined; } public void Reset() { - QuoteContinuerMarks = new Stack(); + _quoteContinuerMarks.Clear(); ContinuerStyle = QuoteContinuerStyle.Undetermined; } public bool ContinuerHasBeenObserved() { - return QuoteContinuerMarks.Count > 0; + return _quoteContinuerMarks.Count > 0; } public QuotationMarkMetadata AddQuoteContinuer( @@ -105,14 +116,14 @@ QuoteContinuerStyle quoteContinuerStyle ) { 
QuotationMarkMetadata quote = quotationMarkMatch.Resolve( - QuoteContinuerMarks.Count + 1, + _quoteContinuerMarks.Count + 1, QuotationMarkDirection.Opening ); - QuoteContinuerMarks.Push(quote); + _quoteContinuerMarks.Push(quote); ContinuerStyle = quoteContinuerStyle; if (CurrentDepth == quotationMarkResolverState.CurrentDepth) { - QuoteContinuerMarks.Clear(); + _quoteContinuerMarks.Clear(); } return quote; } @@ -383,11 +394,11 @@ public bool IsApostrophe(QuotationMarkStringMatch quotationMarkMatch, QuotationM public class DepthBasedQuotationMarkResolver : IQuotationMarkResolver { - public readonly IQuotationMarkResolutionSettings Settings; - public readonly QuotationMarkResolverState QuotationMarkResolverState; - public readonly QuoteContinuerState QuoteContinuerState; - public readonly QuotationMarkCategorizer QuotationMarkCategorizer; - protected readonly HashSet Issues; + public IQuotationMarkResolutionSettings Settings { get; } + public QuotationMarkResolverState QuotationMarkResolverState { get; } + public QuoteContinuerState QuoteContinuerState { get; } + public QuotationMarkCategorizer QuotationMarkCategorizer { get; } + protected HashSet Issues { get; } public DepthBasedQuotationMarkResolver(IQuotationMarkResolutionSettings settings) { @@ -410,7 +421,7 @@ public virtual void Reset() } public virtual IEnumerable ResolveQuotationMarks( - List quotationMarkMatches + IReadOnlyList quotationMarkMatches ) { foreach ( diff --git a/src/SIL.Machine/PunctuationAnalysis/IQuotationMarkResolver.cs b/src/SIL.Machine/PunctuationAnalysis/IQuotationMarkResolver.cs index 68c1f69c..d34e95ba 100644 --- a/src/SIL.Machine/PunctuationAnalysis/IQuotationMarkResolver.cs +++ b/src/SIL.Machine/PunctuationAnalysis/IQuotationMarkResolver.cs @@ -4,7 +4,9 @@ namespace SIL.Machine.PunctuationAnalysis { public interface IQuotationMarkResolver { - IEnumerable ResolveQuotationMarks(List quotationMarkMatches); + IEnumerable ResolveQuotationMarks( + IReadOnlyList quotationMarkMatches + ); void 
Reset(); HashSet GetIssues(); } diff --git a/src/SIL.Machine/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs b/src/SIL.Machine/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs index ab39764e..867119a0 100644 --- a/src/SIL.Machine/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs +++ b/src/SIL.Machine/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs @@ -44,17 +44,12 @@ public class QuotationMarkWordPositions { private static readonly double MaximumProportionForRarity = 0.1; private static readonly double MaximumProportionDifferenceThreshold = 0.3; - private Dictionary _wordInitialOccurrences; - private Dictionary _midWordOccurrences; - private Dictionary _wordFinalOccurrences; - private Dictionary _totalOccurrences; + private readonly Dictionary _wordInitialOccurrences; + private readonly Dictionary _midWordOccurrences; + private readonly Dictionary _wordFinalOccurrences; + private readonly Dictionary _totalOccurrences; public QuotationMarkWordPositions() - { - Reset(); - } - - public void Reset() { _wordInitialOccurrences = new Dictionary(); _midWordOccurrences = new Dictionary(); @@ -62,6 +57,14 @@ public void Reset() _totalOccurrences = new Dictionary(); } + public void Reset() + { + _wordInitialOccurrences.Clear(); + _midWordOccurrences.Clear(); + _wordFinalOccurrences.Clear(); + _totalOccurrences.Clear(); + } + public void CountWordInitialApostrophe(string quotationMark) { _wordInitialOccurrences.UpdateValue(quotationMark, () => 0, i => i + 1); @@ -141,18 +144,19 @@ public class QuotationMarkSequences private static readonly int MuchMoreCommonMinimumRatio = 10; private static readonly double MaximumProportionDifferenceThreshold = 0.2; - private Dictionary _earlierQuotationMarkCounts; - private Dictionary _laterQuotationMarkCounts; + private readonly Dictionary _earlierQuotationMarkCounts; + private readonly Dictionary _laterQuotationMarkCounts; public QuotationMarkSequences() { - Reset(); + _earlierQuotationMarkCounts = new 
Dictionary(); + _laterQuotationMarkCounts = new Dictionary(); } public void Reset() { - _earlierQuotationMarkCounts = new Dictionary(); - _laterQuotationMarkCounts = new Dictionary(); + _earlierQuotationMarkCounts.Clear(); + _laterQuotationMarkCounts.Clear(); } public void CountEarlierQuotationMark(string quotationMark) @@ -204,7 +208,7 @@ public bool AreEarlyAndLateMarkRatesSimilar(string quotationMark) public class QuotationMarkGrouper { private readonly QuoteConventionSet _quoteConventions; - private Dictionary> _groupedQuotationMarks; + private readonly Dictionary> _groupedQuotationMarks; public QuotationMarkGrouper( List quotationMarks, @@ -212,14 +216,14 @@ QuoteConventionSet quoteConventionSet ) { _quoteConventions = quoteConventionSet; - GroupQuotationMarks(quotationMarks); + _groupedQuotationMarks = GroupQuotationMarks(quotationMarks); } - private void GroupQuotationMarks(List quotationMarks) + private Dictionary> GroupQuotationMarks( + List quotationMarks + ) { - _groupedQuotationMarks = quotationMarks - .GroupBy(qmm => qmm.QuotationMark) - .ToDictionary(g => g.Key, g => g.ToList()); + return quotationMarks.GroupBy(qmm => qmm.QuotationMark).ToDictionary(g => g.Key, g => g.ToList()); } public IEnumerable<(string Mark1, string Mark2)> GetQuotationMarkPairs() @@ -415,7 +419,7 @@ private void AnalyzeQuotationMarksForVerse(Verse verse) _quoteConventions ).FindAllPotentialQuotationMarksInVerse(verse); AnalyzeQuotationMarkSequence(quotationMarks); - _apostropheAnalyzer.ProcessQuotationMarks(verse.TextSegments, quotationMarks); + _apostropheAnalyzer.ProcessQuotationMarks(verse.TextSegments.ToList(), quotationMarks); } private void AnalyzeQuotationMarkSequence(List quotationMarks) diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs index 44db38e7..d51aa820 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs +++ 
b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs @@ -29,7 +29,7 @@ public List FindAllPotentialQuotationMarksInVerse(Vers } public virtual List FindAllPotentialQuotationMarksInTextSegments( - List textSegments + IReadOnlyList textSegments ) { return textSegments.SelectMany(ts => FindAllPotentialQuotationMarksInTextSegment(ts)).ToList(); diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkMetadata.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkMetadata.cs index 7114105f..b6530b9d 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkMetadata.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkMetadata.cs @@ -1,6 +1,8 @@ +using System; + namespace SIL.Machine.PunctuationAnalysis { - public class QuotationMarkMetadata + public class QuotationMarkMetadata : IEquatable { public string QuotationMark { get; } public int Depth { get; } @@ -40,6 +42,11 @@ public override bool Equals(object obj) { return false; } + return Equals(other); + } + + public bool Equals(QuotationMarkMetadata other) + { return QuotationMark.Equals(other.QuotationMark) && Depth.Equals(other.Depth) && Direction.Equals(other.Direction) diff --git a/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs index 7d971d79..bd6c7fea 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs @@ -34,7 +34,7 @@ public QuoteConventionDetector() private void CountQuotationMarksInChapters(List chapters) { QuoteConventionSet possibleQuoteConventions = new PreliminaryQuotationMarkAnalyzer( - StandardQuoteConventions.QuoteConventions + QuoteConventions.Standard ).NarrowDownPossibleQuoteConventions(chapters); foreach (Chapter chapter in chapters) @@ -60,8 +60,9 @@ public QuoteConventionAnalysis DetectQuotationConvention() { CountQuotationMarksInChapters(GetChapters()); - (QuoteConvention bestQuoteConvention, double 
score) = - StandardQuoteConventions.QuoteConventions.FindMostSimilarConvention(_quotationMarkTabulator); + (QuoteConvention bestQuoteConvention, double score) = QuoteConventions.Standard.FindMostSimilarConvention( + _quotationMarkTabulator + ); if (score > 0 && bestQuoteConvention != null) { diff --git a/src/SIL.Machine/PunctuationAnalysis/QuoteConventionSet.cs b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionSet.cs index 41edcda9..f208df92 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuoteConventionSet.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionSet.cs @@ -1,3 +1,4 @@ +using System; using System.Collections.Generic; using System.Collections.Immutable; using System.Linq; @@ -6,7 +7,7 @@ namespace SIL.Machine.PunctuationAnalysis { - public class QuoteConventionSet + public class QuoteConventionSet : IEquatable { public IReadOnlyList Conventions { get; } @@ -26,9 +27,14 @@ public QuoteConventionSet(List conventions) public override bool Equals(object obj) { - if (!(obj is QuoteConventionSet quoteConventionSet)) + if (!(obj is QuoteConventionSet other)) return false; - return Conventions.SequenceEqual(quoteConventionSet.Conventions); + return Equals(other); + } + + public bool Equals(QuoteConventionSet other) + { + return Conventions.SequenceEqual(other.Conventions); } public override int GetHashCode() diff --git a/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs b/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs index 041081c9..6a3633c6 100644 --- a/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs +++ b/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs @@ -2,9 +2,9 @@ namespace SIL.Machine.PunctuationAnalysis { - public class StandardQuoteConventions + public class QuoteConventions { - public static QuoteConventionSet QuoteConventions = new QuoteConventionSet( + public static readonly QuoteConventionSet Standard = new QuoteConventionSet( new List { new QuoteConvention( diff --git 
a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs index 8f886ac5..f2f783fb 100644 --- a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs +++ b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs @@ -1,9 +1,10 @@ +using System; using System.Collections.Generic; using SIL.Machine.Corpora; namespace SIL.Machine.PunctuationAnalysis { - public class TextSegment + public class TextSegment : IEquatable { public string Text { get; private set; } public UsfmMarkerType ImmediatePrecedingMarker { get; private set; } @@ -40,18 +41,23 @@ public TextSegment(string text) public override bool Equals(object obj) { - if (!(obj is TextSegment t)) + if (!(obj is TextSegment other)) { return false; } - return Text.Equals(t.Text) - && IndexInVerse.Equals(t.IndexInVerse) - && NumSegmentsInVerse.Equals(t.NumSegmentsInVerse) + return Equals(other); + } + + public bool Equals(TextSegment other) + { + return Text.Equals(other.Text) + && IndexInVerse.Equals(other.IndexInVerse) + && NumSegmentsInVerse.Equals(other.NumSegmentsInVerse) && ( - (UsfmToken == null && t.UsfmToken == null) - || (UsfmToken != null && t.UsfmToken != null && UsfmToken.Equals(t.UsfmToken)) + (UsfmToken == null && other.UsfmToken == null) + || (UsfmToken != null && other.UsfmToken != null && UsfmToken.Equals(other.UsfmToken)) ) - && ImmediatePrecedingMarker.Equals(t.ImmediatePrecedingMarker); + && ImmediatePrecedingMarker.Equals(other.ImmediatePrecedingMarker); } public override int GetHashCode() diff --git a/src/SIL.Machine/PunctuationAnalysis/Verse.cs b/src/SIL.Machine/PunctuationAnalysis/Verse.cs index 47b55766..2f5364eb 100644 --- a/src/SIL.Machine/PunctuationAnalysis/Verse.cs +++ b/src/SIL.Machine/PunctuationAnalysis/Verse.cs @@ -5,7 +5,7 @@ namespace SIL.Machine.PunctuationAnalysis { public class Verse { - public List TextSegments { get; private set; } + public IReadOnlyList TextSegments { get; private set; } public Verse(List textSegments) { diff --git 
a/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs b/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs index a323dbc5..6707de96 100644 --- a/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs @@ -9,9 +9,7 @@ public class FallbackQuotationMarkResolverTests [Test] public void Reset() { - var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "standard_english" - ); + var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -36,9 +34,7 @@ public void Reset() [Test] public void SimpleQuotationMarkResolutionWithNoPreviousMark() { - var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "standard_english" - ); + var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -68,9 +64,7 @@ [new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test \" text"). 
[Test] public void SimpleQuotationMarkResolutionWithPreviousOpeningMark() { - var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "standard_english" - ); + var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -111,9 +105,7 @@ public void SimpleQuotationMarkResolutionWithPreviousOpeningMark() [Test] public void SimpleQuotationMarkResolutionWithPreviousClosingMark() { - var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "standard_english" - ); + var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -154,9 +146,7 @@ public void SimpleQuotationMarkResolutionWithPreviousClosingMark() [Test] public void IsOpeningQuote() { - var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "standard_english" - ); + var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -199,9 +189,7 @@ public void IsOpeningQuote() [Test] public void IsOpeningQuoteWithUnambiguousQuoteConvention() { - var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "standard_english" - ); + var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -228,9 +216,7 @@ public void IsOpeningQuoteWithUnambiguousQuoteConvention() [Test] public void IsOpeningQuoteStateful() { - var englishQuoteConvention = 
StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "standard_english" - ); + var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -260,9 +246,7 @@ public void IsOpeningQuoteStateful() [Test] public void DoesMostRecentOpeningMarkImmediatelyPrecede() { - var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "standard_english" - ); + var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -330,9 +314,7 @@ public void DoesMostRecentOpeningMarkImmediatelyPrecede() [Test] public void IsClosingQuote() { - var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "standard_english" - ); + var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -379,9 +361,7 @@ public void IsClosingQuote() [Test] public void IsClosingQuoteWithUnambiguousQuoteConvention() { - var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "standard_english" - ); + var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -408,9 +388,7 @@ public void IsClosingQuoteWithUnambiguousQuoteConvention() [Test] public void ResolveOpeningQuote() { - var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "standard_english" - ); + var englishQuoteConvention = 
QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -435,9 +413,7 @@ public void ResolveOpeningQuote() [Test] public void ResolveClosingQuote() { - var englishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "standard_english" - ); + var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs index 73f27655..b8c30b08 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs @@ -31,9 +31,7 @@ of the field which Yahweh God had made. \v 3 but not the fruit of the tree which is in the middle of the garden. God has said, ‘You shall not eat of it. 
You shall not touch it, lest you die.’” "; - var standardEnglishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "standard_english" - ); + var standardEnglishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(standardEnglishQuoteConvention); var quotationMarkDenormalizationFirstPass = new QuotationMarkDenormalizationFirstPass( diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs index 599e2699..69d4f9ce 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs @@ -488,7 +488,7 @@ public void AssertUsfmEqual(string observedUsfm, string expectedUsfm) public QuoteConvention GetQuoteConventionByName(string name) { - QuoteConvention quoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName(name); + QuoteConvention quoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(name); Assert.IsNotNull(quoteConvention); return quoteConvention; } diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs index e7922cec..e4dc004b 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs @@ -680,14 +680,10 @@ public List RunFirstPass( string targetQuoteConventionName ) { - var sourceQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - sourceQuoteConventionName - ); + var sourceQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(sourceQuoteConventionName); Assert.IsNotNull(sourceQuoteConvention); - var targetQuoteConvention = 
StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - targetQuoteConventionName - ); + var targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(targetQuoteConventionName); Assert.IsNotNull(targetQuoteConvention); var firstPassAnalyzer = new QuotationMarkUpdateFirstPass(sourceQuoteConvention, targetQuoteConvention); @@ -702,14 +698,10 @@ public QuotationMarkUpdateStrategy RunFirstPassOnChapter( string targetQuoteConventionName ) { - var sourceQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - sourceQuoteConventionName - ); + var sourceQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(sourceQuoteConventionName); Assert.IsNotNull(sourceQuoteConvention); - var targetQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - targetQuoteConventionName - ); + var targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(targetQuoteConventionName); Assert.IsNotNull(targetQuoteConvention); var firstPassAnalyzer = new QuotationMarkUpdateFirstPass(sourceQuoteConvention, targetQuoteConvention); @@ -723,7 +715,7 @@ string targetQuoteConventionName public QuoteConvention GetQuoteConventionByName(string name) { - var quoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName(name); + var quoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(name); Assert.IsNotNull(quoteConvention); return quoteConvention; } diff --git a/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs index 94f73444..a4a5095c 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs @@ -499,7 +499,7 @@ public void ProcessScriptureElement() 
CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "british_english") ); var quotationMarkFinder = new MockQuotationMarkFinder(); - quoteConventionChanger.QuotationMarkFinder = quotationMarkFinder; + quoteConventionChanger.InternalQuotationMarkFinder = quotationMarkFinder; var updateElement = new UsfmUpdateBlockElement( UsfmUpdateBlockElementType.Text, @@ -772,17 +772,17 @@ public void CheckForChapterChange() CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") ); - Assert.That(quoteConventionChanger.CurrentChapterNumber, Is.EqualTo(0)); + Assert.That(quoteConventionChanger.InternalCurrentChapterNumber, Is.EqualTo(0)); quoteConventionChanger.InternalCheckForChapterChange(new UsfmUpdateBlock([ScriptureRef.Parse("MAT 1:1")], [])); - Assert.That(quoteConventionChanger.CurrentChapterNumber, Is.EqualTo(1)); + Assert.That(quoteConventionChanger.InternalCurrentChapterNumber, Is.EqualTo(1)); quoteConventionChanger.InternalCheckForChapterChange( new UsfmUpdateBlock([ScriptureRef.Parse("ISA 15:22")], []) ); - Assert.That(quoteConventionChanger.CurrentChapterNumber, Is.EqualTo(15)); + Assert.That(quoteConventionChanger.InternalCurrentChapterNumber, Is.EqualTo(15)); } [Test] @@ -803,28 +803,31 @@ public void StartNewChapter() ) ); - quoteConventionChanger.VerseTextQuotationMarkResolver = new MockQuotationMarkResolver(); + quoteConventionChanger.InternalVerseTextQuotationMarkResolver = new MockQuotationMarkResolver(); quoteConventionChanger - .NextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed) + .InternalNextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed) .SetText("this text should be erased"); - quoteConventionChanger.VerseTextQuotationMarkResolver.InternalIssues.Add( + quoteConventionChanger.InternalVerseTextQuotationMarkResolver.InternalIssues.Add( QuotationMarkResolutionIssue.IncompatibleQuotationMark ); quoteConventionChanger.InternalStartNewChapter(1); - var segment = 
quoteConventionChanger.NextScriptureTextSegmentBuilder.Build(); - Assert.That(quoteConventionChanger.CurrentStrategy, Is.EqualTo(QuotationMarkUpdateStrategy.Skip)); + var segment = quoteConventionChanger.InternalNextScriptureTextSegmentBuilder.Build(); + Assert.That(quoteConventionChanger.InternalCurrentStrategy, Is.EqualTo(QuotationMarkUpdateStrategy.Skip)); Assert.That(segment.ImmediatePrecedingMarker, Is.EqualTo(UsfmMarkerType.Chapter)); Assert.That(segment.Text, Is.EqualTo("")); Assert.That(!segment.MarkersInPrecedingContext.Contains(UsfmMarkerType.Embed)); - Assert.That(quoteConventionChanger.VerseTextQuotationMarkResolver.InternalIssues, Has.Count.EqualTo(0)); + Assert.That(quoteConventionChanger.InternalVerseTextQuotationMarkResolver.InternalIssues, Has.Count.EqualTo(0)); quoteConventionChanger.InternalStartNewChapter(2); - Assert.That(quoteConventionChanger.CurrentStrategy, Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFull)); + Assert.That(quoteConventionChanger.InternalCurrentStrategy, Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFull)); quoteConventionChanger.InternalStartNewChapter(3); - Assert.That(quoteConventionChanger.CurrentStrategy, Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFallback)); + Assert.That( + quoteConventionChanger.InternalCurrentStrategy, + Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFallback) + ); } private static string ChangeQuotationMarks( @@ -856,14 +859,10 @@ private static MockQuoteConventionChangingUsfmUpdateBlockHandler CreateQuoteConv ) { quotationMarkUpdateSettings ??= new QuotationMarkUpdateSettings(); - var sourceQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - sourceQuoteConventionName - ); + var sourceQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(sourceQuoteConventionName); Assert.IsNotNull(sourceQuoteConvention); - var targetQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - targetQuoteConventionName - ); + var 
targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(targetQuoteConventionName); Assert.IsNotNull(targetQuoteConvention); return new MockQuoteConventionChangingUsfmUpdateBlockHandler( @@ -885,34 +884,34 @@ private class MockQuoteConventionChangingUsfmUpdateBlockHandler( QuotationMarkUpdateSettings settings ) : QuoteConventionChangingUsfmUpdateBlockHandler(sourceQuoteConvention, targetQuoteConvention, settings) { - public QuotationMarkFinder QuotationMarkFinder + public QuotationMarkFinder InternalQuotationMarkFinder { - set => _quotationMarkFinder = value; + set => QuotationMarkFinder = value; } - public TextSegment.Builder NextScriptureTextSegmentBuilder + public TextSegment.Builder InternalNextScriptureTextSegmentBuilder { - get => _nextScriptureTextSegmentBuilder; + get => NextScriptureTextSegmentBuilder; } - public MockQuotationMarkResolver VerseTextQuotationMarkResolver + public MockQuotationMarkResolver InternalVerseTextQuotationMarkResolver { get => - _verseTextQuotationMarkResolver is MockQuotationMarkResolver mqmr + VerseTextQuotationMarkResolver is MockQuotationMarkResolver mqmr ? 
mqmr : throw new InvalidOperationException( "Unable to use implementations of IQuotationMarkResolver other than MockQuotationMarkResolver" ); - set => _verseTextQuotationMarkResolver = value; + set => VerseTextQuotationMarkResolver = value; } - public int CurrentChapterNumber + public int InternalCurrentChapterNumber { - get => _currentChapterNumber; - set => _currentChapterNumber = value; + get => CurrentChapterNumber; + set => CurrentChapterNumber = value; } - public QuotationMarkUpdateStrategy CurrentStrategy + public QuotationMarkUpdateStrategy InternalCurrentStrategy { - get => _currentStrategy; - set => _currentStrategy = value; + get => CurrentStrategy; + set => CurrentStrategy = value; } public void InternalProcessScriptureElement( @@ -966,7 +965,7 @@ public MockQuotationMarkFinder() } public override List FindAllPotentialQuotationMarksInTextSegments( - List textSegments + IReadOnlyList textSegments ) { NumTimesCalled++; @@ -985,12 +984,12 @@ private class MockQuotationMarkResolver(IQuotationMarkResolutionSettings? 
settin public override void Reset() { - base.Reset(); + Reset(); NumTimesCalled = 0; } public override IEnumerable ResolveQuotationMarks( - List quoteMatches + IReadOnlyList quoteMatches ) { NumTimesCalled++; diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs index d3689202..7eeda9ec 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs @@ -344,7 +344,7 @@ public void AddQuotationContinuer() [Test] public void IsEnglishQuotationContinuer() { - var standardEnglish = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english"); + var standardEnglish = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(standardEnglish); var settings = new QuoteConventionDetectionResolutionSettings(new QuoteConventionSet([standardEnglish])); @@ -645,9 +645,7 @@ public void IsEnglishQuotationContinuer() [Test] public void IsSpanishQuotationContinuer() { - var westernEuropeanQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "western_european" - ); + var westernEuropeanQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("western_european"); Assert.IsNotNull(westernEuropeanQuoteConvention); var settings = new QuoteConventionDetectionResolutionSettings( @@ -950,9 +948,7 @@ public void IsSpanishQuotationContinuer() [Test] public void IsOpeningQuote() { - var centralEuropeanQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("central_european") - ); + var centralEuropeanQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("central_european")); Assert.IsNotNull(centralEuropeanQuoteConvention); var centralEuropeanResolverSettings = new 
QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([centralEuropeanQuoteConvention]) @@ -965,9 +961,7 @@ public void IsOpeningQuote() quotationContinuerState ); - var britishEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("british_english") - ); + var britishEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("british_english")); Assert.IsNotNull(britishEnglishQuoteConvention); var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([britishEnglishQuoteConvention]) @@ -978,9 +972,7 @@ public void IsOpeningQuote() quotationContinuerState ); - var standardSwedishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_swedish") - ); + var standardSwedishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish")); Assert.IsNotNull(standardSwedishQuoteConvention); var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardSwedishQuoteConvention]) @@ -1292,9 +1284,7 @@ public void IsOpeningQuote() [Test] public void IsClosingQuote() { - var centralEuropeanQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("central_european") - ); + var centralEuropeanQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("central_european")); Assert.IsNotNull(centralEuropeanQuoteConvention); var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([centralEuropeanQuoteConvention]) @@ -1307,9 +1297,7 @@ public void IsClosingQuote() quotationContinuerState ); - var britishEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("british_english") - ); + var britishEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("british_english")); 
Assert.IsNotNull(britishEnglishQuoteConvention); var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([britishEnglishQuoteConvention]) @@ -1320,9 +1308,7 @@ public void IsClosingQuote() quotationContinuerState ); - var standardSwedishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_swedish") - ); + var standardSwedishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish")); Assert.IsNotNull(standardSwedishQuoteConvention); var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardSwedishQuoteConvention]) @@ -1333,9 +1319,7 @@ public void IsClosingQuote() quotationContinuerState ); - var standardFrenchQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_french") - ); + var standardFrenchQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_french")); Assert.IsNotNull(standardFrenchQuoteConvention); var standardFrenchResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardFrenchQuoteConvention]) @@ -1564,9 +1548,7 @@ public void IsClosingQuote() [Test] public void IsMalformedOpeningQuote() { - var centralEuropeanQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("central_european") - ); + var centralEuropeanQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("central_european")); Assert.IsNotNull(centralEuropeanQuoteConvention); var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([centralEuropeanQuoteConvention]) @@ -1579,9 +1561,7 @@ public void IsMalformedOpeningQuote() quotationContinuerState ); - var britishEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("british_english") - ); + var 
britishEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("british_english")); Assert.IsNotNull(britishEnglishQuoteConvention); var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([britishEnglishQuoteConvention]) @@ -1592,9 +1572,7 @@ public void IsMalformedOpeningQuote() quotationContinuerState ); - var standardSwedishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_swedish") - ); + var standardSwedishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish")); Assert.IsNotNull(standardSwedishQuoteConvention); var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardSwedishQuoteConvention]) @@ -1869,9 +1847,7 @@ public void IsMalformedOpeningQuote() [Test] public void IsMalformedClosingQuote() { - var centralEuropeanQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("central_european") - ); + var centralEuropeanQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("central_european")); Assert.IsNotNull(centralEuropeanQuoteConvention); var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([centralEuropeanQuoteConvention]) @@ -1884,9 +1860,7 @@ public void IsMalformedClosingQuote() quotationContinuerState ); - var britishEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("british_english") - ); + var britishEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("british_english")); Assert.IsNotNull(britishEnglishQuoteConvention); var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([britishEnglishQuoteConvention]) @@ -1897,9 +1871,7 @@ public void IsMalformedClosingQuote() quotationContinuerState ); - var 
standardSwedishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_swedish") - ); + var standardSwedishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish")); Assert.IsNotNull(standardSwedishQuoteConvention); var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardSwedishQuoteConvention]) @@ -2151,9 +2123,7 @@ public void IsMalformedClosingQuote() [Test] public void IsUnpairedClosingQuote() { - var centralEuropeanQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("central_european") - ); + var centralEuropeanQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("central_european")); Assert.IsNotNull(centralEuropeanQuoteConvention); var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([centralEuropeanQuoteConvention]) @@ -2166,9 +2136,7 @@ public void IsUnpairedClosingQuote() quotationContinuerState ); - var britishEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("british_english") - ); + var britishEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("british_english")); Assert.IsNotNull(britishEnglishQuoteConvention); var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([britishEnglishQuoteConvention]) @@ -2179,9 +2147,7 @@ public void IsUnpairedClosingQuote() quotationContinuerState ); - var standardSwedishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_swedish") - ); + var standardSwedishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish")); Assert.IsNotNull(standardSwedishQuoteConvention); var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( new 
QuoteConventionSet([standardSwedishQuoteConvention]) @@ -2430,9 +2396,7 @@ public void IsUnpairedClosingQuote() [Test] public void IsApostrophe() { - var standardEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") - ); + var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) @@ -2446,7 +2410,7 @@ public void IsApostrophe() ); var typewriterEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("typewriter_english") + QuoteConventions.Standard.GetQuoteConventionByName("typewriter_english") ); Assert.IsNotNull(typewriterEnglishQuoteConvention); var typewriterEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( @@ -2699,9 +2663,7 @@ public void IsApostrophe() [Test] public void DepthBasedQuotationMarkResolverReset() { - var standardEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") - ); + var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) @@ -2745,9 +2707,7 @@ [new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201cThis is a [Test] public void BasicQuotationMarkRecognition() { - var standardEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") - ); + var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); Assert.IsNotNull(standardEnglishQuoteConvention); var 
standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) @@ -2780,9 +2740,7 @@ public void BasicQuotationMarkRecognition() [Test] public void ResolutionOnlyOfPassedMatches() { - var standardEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") - ); + var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) @@ -2820,9 +2778,7 @@ [new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegm [Test] public void ResolutionAcrossSegments() { - var standardEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") - ); + var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) @@ -2856,9 +2812,7 @@ public void ResolutionAcrossSegments() [Test] public void ResolutionWithApostrophes() { - var standardEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") - ); + var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) @@ -2894,7 +2848,7 @@ public void ResolutionWithApostrophes() Assert.That(standardEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); var 
typewriterEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("typewriter_english") + QuoteConventions.Standard.GetQuoteConventionByName("typewriter_english") ); Assert.IsNotNull(typewriterEnglishQuoteConvention); var typewriterEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( @@ -2934,9 +2888,7 @@ public void ResolutionWithApostrophes() [Test] public void EnglishQuoteContinuers() { - var standardEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") - ); + var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) @@ -2979,9 +2931,7 @@ public void EnglishQuoteContinuers() [Test] public void SpanishQuoteContinuers() { - var westernEuropeanQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("western_european") - ); + var westernEuropeanQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("western_european")); Assert.IsNotNull(westernEuropeanQuoteConvention); var westernEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([westernEuropeanQuoteConvention]) @@ -3024,9 +2974,7 @@ public void SpanishQuoteContinuers() [Test] public void MalformedQuotationMarks() { - var standardEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") - ); + var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) @@ -3065,9 
+3013,7 @@ public void MalformedQuotationMarks() [Test] public void UnpairedQuotationMarkIssue() { - var standardEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") - ); + var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) @@ -3116,9 +3062,7 @@ [new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegm [Test] public void TooDeepNestingIssue() { - var standardEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") - ); + var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) @@ -3160,9 +3104,7 @@ public void TooDeepNestingIssue() [Test] public void IncompatibleQuotationMarkIssue() { - var standardEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_english") - ); + var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) @@ -3200,7 +3142,7 @@ public void IncompatibleQuotationMarkIssue() public void AmbiguousQuotationMarkIssue() { var typewriterEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("typewriter_english") + QuoteConventions.Standard.GetQuoteConventionByName("typewriter_english") ); 
Assert.IsNotNull(typewriterEnglishQuoteConvention); var typewriterEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( @@ -3242,7 +3184,7 @@ public void AmbiguousQuotationMarkIssue() public void TypewriterEnglishQuotationMarkRecognition() { var typewriterEnglishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("typewriter_english") + QuoteConventions.Standard.GetQuoteConventionByName("typewriter_english") ); Assert.IsNotNull(typewriterEnglishQuoteConvention); var typewriterEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( @@ -3283,9 +3225,7 @@ public void TypewriterEnglishQuotationMarkRecognition() [Test] public void TypewriterFrenchMarkRecognition() { - var typewriterFrenchQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("typewriter_french") - ); + var typewriterFrenchQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("typewriter_french")); Assert.IsNotNull(typewriterFrenchQuoteConvention); var typewriterFrenchResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([typewriterFrenchQuoteConvention]) @@ -3320,9 +3260,7 @@ public void TypewriterFrenchMarkRecognition() [Test] public void CentralEuropeanQuotationMarkRecognition() { - var centralEuropeanQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("central_european") - ); + var centralEuropeanQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("central_european")); Assert.IsNotNull(centralEuropeanQuoteConvention); var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([centralEuropeanQuoteConvention]) @@ -3360,9 +3298,7 @@ public void CentralEuropeanQuotationMarkRecognition() [Test] public void StandardSwedishQuotationMarkRecognition() { - var standardSwedishQuoteConvention = ( - 
StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_swedish") - ); + var standardSwedishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish")); Assert.IsNotNull(standardSwedishQuoteConvention); var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardSwedishQuoteConvention]) @@ -3400,20 +3336,14 @@ public void StandardSwedishQuotationMarkRecognition() [Test] public void MultipleConventionsQuotationMarkRecognition() { - var typewriterFrenchQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "typewriter_french" - ); + var typewriterFrenchQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("typewriter_french"); Assert.IsNotNull(typewriterFrenchQuoteConvention); - var centralEuropeanQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("central_european") - ); + var centralEuropeanQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("central_european")); Assert.IsNotNull(centralEuropeanQuoteConvention); - var standardSwedishQuoteConvention = ( - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("standard_swedish") - ); + var standardSwedishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish")); Assert.IsNotNull(standardSwedishQuoteConvention); var multipleConventionsResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet( diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs index a1cbdc5c..b1b3f407 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs @@ -8,7 +8,7 @@ public class QuotationMarkFinderTests [Test] public void 
ThatAllPossibleQuotationMarksAreIdentified() { - var quotationMarkFinder = new QuotationMarkFinder(StandardQuoteConventions.QuoteConventions); + var quotationMarkFinder = new QuotationMarkFinder(QuoteConventions.Standard); Assert.That( quotationMarkFinder .FindAllPotentialQuotationMarksInTextSegment( @@ -287,9 +287,7 @@ public void ThatAllPossibleQuotationMarksAreIdentified() [Test] public void ThatItUsesTheQuoteConventionSet() { - var standardEnglishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "standard_english" - ); + var standardEnglishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(standardEnglishQuoteConvention); var englishQuotationMarkFinder = new QuotationMarkFinder( @@ -306,9 +304,7 @@ public void ThatItUsesTheQuoteConventionSet() Has.Count.EqualTo(0) ); - var typewriterEnglishQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "typewriter_english" - ); + var typewriterEnglishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("typewriter_english"); Assert.IsNotNull(typewriterEnglishQuoteConvention); var typewriterEnglishQuotationMarkFinder = new QuotationMarkFinder( @@ -334,9 +330,7 @@ public void ThatItUsesTheQuoteConventionSet() ) ); - var westernEuropeanQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "western_european" - ); + var westernEuropeanQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("western_european"); Assert.IsNotNull(westernEuropeanQuoteConvention); var westernEuropeanQuotationMarkFinder = new QuotationMarkFinder( @@ -362,8 +356,9 @@ public void ThatItUsesTheQuoteConventionSet() ) ); - var typewriterWesternEuropeanQuoteConvention = - StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName("typewriter_western_european"); + var typewriterWesternEuropeanQuoteConvention = 
QuoteConventions.Standard.GetQuoteConventionByName( + "typewriter_western_european" + ); Assert.IsNotNull(typewriterWesternEuropeanQuoteConvention); var typewriterWesternEuropeanQuotationMarkFinder = new QuotationMarkFinder( @@ -396,9 +391,7 @@ public void ThatItUsesTheQuoteConventionSet() ) ); - var centralEuropeanQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( - "central_european" - ); + var centralEuropeanQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("central_european"); Assert.IsNotNull(centralEuropeanQuoteConvention); var centralEuropeanQuotationMarkFinder = new QuotationMarkFinder( diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkMetadataTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkMetadataTests.cs index 3fca092d..c8fec262 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkMetadataTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkMetadataTests.cs @@ -74,7 +74,7 @@ public void UpdateQuotationMarkWithMultiCharacterQuotationMarks() public QuoteConvention GetQuoteConventionByName(string name) { - QuoteConvention quoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName(name); + QuoteConvention quoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(name); Assert.IsNotNull(quoteConvention); return quoteConvention; } diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkResolverTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkResolverTests.cs index e75bf30f..54dafd8c 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkResolverTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkResolverTests.cs @@ -9,18 +9,18 @@ public class QuotationMarkResolverTests public void Reset() { DepthBasedQuotationMarkResolver quotationMarkResolver = new DepthBasedQuotationMarkResolver( - new 
QuoteConventionDetectionResolutionSettings(StandardQuoteConventions.QuoteConventions) + new QuoteConventionDetectionResolutionSettings(QuoteConventions.Standard) ); - Assert.That(quotationMarkResolver.QuotationMarkResolverState.Quotations, Has.Count.EqualTo(0)); - Assert.That(quotationMarkResolver.QuoteContinuerState.QuoteContinuerMarks, Has.Count.EqualTo(0)); + Assert.That(quotationMarkResolver.QuotationMarkResolverState.Quotations.Count(), Is.EqualTo(0)); + Assert.That(quotationMarkResolver.QuoteContinuerState.QuoteContinuerMarks.Count(), Is.EqualTo(0)); Assert.That(quotationMarkResolver.QuotationMarkResolverState.CurrentDepth, Is.EqualTo(0)); Assert.That(quotationMarkResolver.QuoteContinuerState.CurrentDepth, Is.EqualTo(0)); quotationMarkResolver.Reset(); - Assert.That(quotationMarkResolver.QuotationMarkResolverState.Quotations, Has.Count.EqualTo(0)); - Assert.That(quotationMarkResolver.QuoteContinuerState.QuoteContinuerMarks, Has.Count.EqualTo(0)); + Assert.That(quotationMarkResolver.QuotationMarkResolverState.Quotations.Count(), Is.EqualTo(0)); + Assert.That(quotationMarkResolver.QuoteContinuerState.QuoteContinuerMarks.Count(), Is.EqualTo(0)); Assert.That(quotationMarkResolver.QuotationMarkResolverState.CurrentDepth, Is.EqualTo(0)); Assert.That(quotationMarkResolver.QuoteContinuerState.CurrentDepth, Is.EqualTo(0)); @@ -39,13 +39,13 @@ public void Reset() ]; quotationMarkResolver.ResolveQuotationMarks(quotationMarkStringMatches).ToList(); - Assert.That(quotationMarkResolver.QuotationMarkResolverState.Quotations, Has.Count.GreaterThan(0)); + Assert.That(quotationMarkResolver.QuotationMarkResolverState.Quotations.Count(), Is.GreaterThan(0)); Assert.IsTrue(quotationMarkResolver.QuotationMarkResolverState.CurrentDepth > 0); quotationMarkResolver.Reset(); - Assert.That(quotationMarkResolver.QuotationMarkResolverState.Quotations, Has.Count.EqualTo(0)); - Assert.That(quotationMarkResolver.QuoteContinuerState.QuoteContinuerMarks, Has.Count.EqualTo(0)); + 
Assert.That(quotationMarkResolver.QuotationMarkResolverState.Quotations.Count(), Is.EqualTo(0)); + Assert.That(quotationMarkResolver.QuoteContinuerState.QuoteContinuerMarks.Count(), Is.EqualTo(0)); Assert.That(quotationMarkResolver.QuotationMarkResolverState.CurrentDepth, Is.EqualTo(0)); Assert.That(quotationMarkResolver.QuoteContinuerState.CurrentDepth, Is.EqualTo(0)); } From 98d89b79b70ab7c94206e391cd524609d6a0fc97 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 6 Aug 2025 14:22:04 -0400 Subject: [PATCH 23/28] Fix typo --- .../QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs index a4a5095c..1cccbdf8 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs @@ -984,7 +984,7 @@ private class MockQuotationMarkResolver(IQuotationMarkResolutionSettings? 
settin public override void Reset() { - Reset(); + base.Reset(); NumTimesCalled = 0; } From 6984ad935748e729be1aecb1e8582a8e6a553d12 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 6 Aug 2025 15:52:35 -0400 Subject: [PATCH 24/28] Remove debugging line --- .../PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs index 7eeda9ec..4b9562f2 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs @@ -1,4 +1,3 @@ -using System.Unicode; using NUnit.Framework; namespace SIL.Machine.PunctuationAnalysis; @@ -2512,7 +2511,6 @@ public void IsApostrophe() null ) ); - var charInfo = UnicodeInfo.GetCharInfo('ℵ'); Assert.IsFalse( standardEnglishQuotationMarkCategorizer.IsApostrophe( From 5788e3ad59be98165019c202e0041daae60df94a Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 6 Aug 2025 15:58:43 -0400 Subject: [PATCH 25/28] Make standard qcs static --- src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs b/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs index 6a3633c6..5720198e 100644 --- a/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs +++ b/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs @@ -2,7 +2,7 @@ namespace SIL.Machine.PunctuationAnalysis { - public class QuoteConventions + public static class QuoteConventions { public static readonly QuoteConventionSet Standard = new QuoteConventionSet( new List From a92dae85ab46ea7250a0febb38c460eae8ad933a Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 7 Aug 2025 14:20:01 -0400 Subject: 
[PATCH 26/28] Use PCRE.NET to mirror python regexes --- .../QuotationMarkFinder.cs | 35 ++-------- .../QuotationMarkStringMatch.cs | 70 +++++-------------- src/SIL.Machine/SIL.Machine.csproj | 2 +- 3 files changed, 24 insertions(+), 83 deletions(-) diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs index d51aa820..d5602ed3 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs @@ -1,13 +1,12 @@ using System.Collections.Generic; -using System.Globalization; using System.Linq; -using System.Text.RegularExpressions; +using PCRE; namespace SIL.Machine.PunctuationAnalysis { public class QuotationMarkFinder { - private static readonly Regex TypewriterGuillemetsPattern = new Regex(@"(<<|>>|<|>)", RegexOptions.Compiled); + private static readonly PcreRegex QuotationMarkPattern = new PcreRegex(@"(\p{Quotation_Mark}|<<|>>|<|>)"); private readonly QuoteConventionSet _quoteConventions; public QuotationMarkFinder(QuoteConventionSet quoteConventions) @@ -37,30 +36,9 @@ IReadOnlyList textSegments public List FindAllPotentialQuotationMarksInTextSegment(TextSegment textSegment) { - TextElementEnumerator charactersEnumerator = StringInfo.GetTextElementEnumerator(textSegment.Text); - int index = 0; - List quotationMarkStringMatches = new List(); - while (charactersEnumerator.MoveNext()) - { - string currentCharacterString = charactersEnumerator.Current.ToString(); - if ( - ( - QuotationMarkStringMatch.HasUnicodeProperty(currentCharacterString, "QUOTATION MARK") - || QuotationMarkStringMatch.HasUnicodeProperty(currentCharacterString, "APOSTROPHE") - ) - && ( - _quoteConventions.IsValidOpeningQuotationMark(currentCharacterString) - || _quoteConventions.IsValidClosingQuotationMark(currentCharacterString) - ) - ) - { - quotationMarkStringMatches.Add(new QuotationMarkStringMatch(textSegment, index, index + 1)); - } - index++; - 
} - List typewriterGuillemetMatches = TypewriterGuillemetsPattern + return QuotationMarkPattern .Matches(textSegment.Text) - .Cast() + .Cast() .Where(match => _quoteConventions.IsValidOpeningQuotationMark(match.Groups[0].Value) || _quoteConventions.IsValidClosingQuotationMark(match.Groups[0].Value) @@ -71,11 +49,6 @@ public List FindAllPotentialQuotationMarksInTextSegmen m.Groups[0].Index + m.Groups[0].Length )) .ToList(); - - return quotationMarkStringMatches - .Concat(typewriterGuillemetMatches) - .OrderBy(match => match.StartIndex) - .ToList(); } } } diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs index 448aed45..1dffa148 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs @@ -1,13 +1,14 @@ using System; using System.Globalization; using System.Text.RegularExpressions; -using System.Unicode; +using PCRE; namespace SIL.Machine.PunctuationAnalysis { public class QuotationMarkStringMatch { - // No LatinLetterPattern or LetterPattern because C# does not support it in the same way as Python. 
Using UnicodeInfo to mirror machine.py + private static readonly PcreRegex LetterPattern = new PcreRegex(@"[\p{L}\N{U+0001E200}-\N{U+0001E28F}]"); + private static readonly PcreRegex LatinLetterPattern = new PcreRegex(@"^\p{Script_Extensions=Latin}$"); private static readonly Regex WhitespacePattern = new Regex(@"[\s~]", RegexOptions.Compiled); private static readonly Regex PunctuationPattern = new Regex(@"[\.,;\?!\)\]\-—۔،؛]", RegexOptions.Compiled); private static readonly Regex QuoteIntroducerPattern = new Regex(@"[:,]\s*$", RegexOptions.Compiled); @@ -55,9 +56,15 @@ public bool IsValidClosingQuotationMark(QuoteConventionSet quoteConventions) => public bool NextCharacterMatches(Regex regexPattern) => NextCharacter != null && regexPattern.IsMatch(NextCharacter); + public bool NextCharacterMatches(PcreRegex regexPattern) => + NextCharacter != null && regexPattern.IsMatch(NextCharacter); + public bool PreviousCharacterMatches(Regex regexPattern) => PreviousCharacter != null && regexPattern.IsMatch(PreviousCharacter); + public bool PreviousCharacterMatches(PcreRegex regexPattern) => + PreviousCharacter != null && regexPattern.IsMatch(PreviousCharacter); + public string PreviousCharacter { get @@ -98,9 +105,15 @@ public string NextCharacter public bool LeadingSubstringMatches(Regex regexPattern) => regexPattern.IsMatch(TextSegment.SubstringBefore(StartIndex)); + public bool LeadingSubstringMatches(PcreRegex regexPattern) => + regexPattern.IsMatch(TextSegment.SubstringBefore(StartIndex)); + public bool TrailingSubstringMatches(Regex regexPattern) => regexPattern.IsMatch(TextSegment.SubstringAfter(EndIndex)); + public bool TrailingSubstringMatches(PcreRegex regexPattern) => + regexPattern.IsMatch(TextSegment.SubstringAfter(EndIndex)); + // This assumes that the two matches occur in the same verse public bool Precedes(QuotationMarkStringMatch other) { @@ -151,72 +164,27 @@ public bool HasTrailingPunctuation() public bool HasLetterInLeadingSubstring() { - string 
leadingSubstring = TextSegment.SubstringBefore(StartIndex); - if (leadingSubstring.Length == 0) - return false; - - TextElementEnumerator charactersEnumerator = StringInfo.GetTextElementEnumerator(leadingSubstring); - while (charactersEnumerator.MoveNext()) - { - if (!IsLetter(charactersEnumerator.Current.ToString())) - return false; - } - return true; + return LeadingSubstringMatches(LetterPattern); } public bool HasLetterInTrailingSubstring() { - string trailingSubstring = TextSegment.SubstringAfter(EndIndex); - if (trailingSubstring.Length == 0) - return false; - TextElementEnumerator charactersEnumerator = StringInfo.GetTextElementEnumerator(trailingSubstring); - while (charactersEnumerator.MoveNext()) - { - if (!IsLetter(charactersEnumerator.Current.ToString())) - return false; - } - return true; + return TrailingSubstringMatches(LetterPattern); } public bool HasLeadingLatinLetter() { - return PreviousCharacter != null && IsLatinScript(PreviousCharacter); + return PreviousCharacterMatches(LatinLetterPattern); } public bool HasTrailingLatinLetter() { - return NextCharacter != null && IsLatinScript(NextCharacter); + return NextCharacterMatches(LatinLetterPattern); } public bool HasQuoteIntroducerInLeadingSubstring() { return LeadingSubstringMatches(QuoteIntroducerPattern); } - - public static bool HasUnicodeProperty(string characterString, string attribute) - { - if (characterString.Length == 1) - { - return UnicodeInfo.GetName(characterString[0]).Contains(attribute); - } - else if (char.IsSurrogatePair(characterString[0], characterString[1])) - { - //Get true unicode value - int combinedCharacterValue = - (((int)characterString[0] - 0xD800) * 0x400) + ((int)characterString[1] - 0xDC00) + 0x10000; - return UnicodeInfo.GetName(combinedCharacterValue).Contains(attribute); - } - return false; - } - - private bool IsLatinScript(string characterString) - { - return HasUnicodeProperty(characterString, "LATIN"); - } - - private bool IsLetter(string characterString) - { 
- return HasUnicodeProperty(characterString, "LETTER"); - } } } diff --git a/src/SIL.Machine/SIL.Machine.csproj b/src/SIL.Machine/SIL.Machine.csproj index 91a88c5d..0307d374 100644 --- a/src/SIL.Machine/SIL.Machine.csproj +++ b/src/SIL.Machine/SIL.Machine.csproj @@ -38,12 +38,12 @@ + - From 61428af008dcd81aa92d52bab38e50f2ae9a3ccc Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 7 Aug 2025 14:34:44 -0400 Subject: [PATCH 27/28] Use explicit type rather than var --- .../FallbackQuotationMarkResolverTests.cs | 28 +-- .../Corpora/QuotationDenormalizationTests.cs | 13 +- ...ormalizationUsfmBlockUpdateHandlerTests.cs | 134 ++++++------ .../QuotationMarkUpdateFirstPassTests.cs | 138 ++++++++---- ...tionChangingUsfmBlockUpdateHandlerTests.cs | 142 +++++++------ .../Corpora/UpdateUsfmParserHandlerTests.cs | 88 ++++---- .../DepthBasedQuotationMarkResolverTests.cs | 196 ++++++++++++------ .../QuotationConventionDetectorTests.cs | 88 ++++---- .../QuotationMarkFinderTests.cs | 18 +- .../QuotationMarkStringMatchTests.cs | 2 +- .../QuoteConventionSetTests.cs | 2 +- .../QuoteConventionTests.cs | 47 +++-- .../PunctuationAnalysis/TextSegmentTests.cs | 73 +++---- .../UsfmStructureExtractorTests.cs | 26 +-- 14 files changed, 579 insertions(+), 416 deletions(-) diff --git a/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs b/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs index 6707de96..09462018 100644 --- a/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs @@ -9,7 +9,7 @@ public class FallbackQuotationMarkResolverTests [Test] public void Reset() { - var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new 
FallbackQuotationMarkResolver( @@ -34,7 +34,7 @@ public void Reset() [Test] public void SimpleQuotationMarkResolutionWithNoPreviousMark() { - var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -64,7 +64,7 @@ [new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test \" text"). [Test] public void SimpleQuotationMarkResolutionWithPreviousOpeningMark() { - var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -105,7 +105,7 @@ public void SimpleQuotationMarkResolutionWithPreviousOpeningMark() [Test] public void SimpleQuotationMarkResolutionWithPreviousClosingMark() { - var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -146,7 +146,7 @@ public void SimpleQuotationMarkResolutionWithPreviousClosingMark() [Test] public void IsOpeningQuote() { - var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -189,7 +189,7 @@ public void IsOpeningQuote() [Test] public void 
IsOpeningQuoteWithUnambiguousQuoteConvention() { - var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -216,7 +216,7 @@ public void IsOpeningQuoteWithUnambiguousQuoteConvention() [Test] public void IsOpeningQuoteStateful() { - var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -246,7 +246,7 @@ public void IsOpeningQuoteStateful() [Test] public void DoesMostRecentOpeningMarkImmediatelyPrecede() { - var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -314,7 +314,7 @@ public void DoesMostRecentOpeningMarkImmediatelyPrecede() [Test] public void IsClosingQuote() { - var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -361,7 +361,7 @@ public void IsClosingQuote() [Test] public void IsClosingQuoteWithUnambiguousQuoteConvention() { - var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + QuoteConvention englishQuoteConvention = 
QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -388,7 +388,7 @@ public void IsClosingQuoteWithUnambiguousQuoteConvention() [Test] public void ResolveOpeningQuote() { - var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -403,7 +403,7 @@ public void ResolveOpeningQuote() 0, 1 ); - var actualResolvedQuotationMark = basicQuotationMarkResolver.ResolveOpeningMark( + QuotationMarkMetadata actualResolvedQuotationMark = basicQuotationMarkResolver.ResolveOpeningMark( new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"test text\"").Build(), 0, 1) ); Assert.That(actualResolvedQuotationMark, Is.EqualTo(expectedResolvedQuotationMark)); @@ -413,7 +413,7 @@ public void ResolveOpeningQuote() [Test] public void ResolveClosingQuote() { - var englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(englishQuoteConvention); var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( @@ -428,7 +428,7 @@ public void ResolveClosingQuote() 10, 11 ); - var actualResolvedQuotationMark = basicQuotationMarkResolver.ResolveClosingMark( + QuotationMarkMetadata actualResolvedQuotationMark = basicQuotationMarkResolver.ResolveClosingMark( new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"test text\"").Build(), 10, 11) ); Assert.That(actualResolvedQuotationMark, Is.EqualTo(expectedResolvedQuotationMark)); diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs 
b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs index b8c30b08..6b9fcfdc 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs @@ -9,7 +9,7 @@ public class QuotationDenormalizationTests [Test] public void FullQuotationDenormalizationPipeline() { - var normalizedUsfm = + string normalizedUsfm = @" \id GEN \c 1 @@ -23,7 +23,7 @@ of the field which Yahweh God had made. God has said, 'You shall not eat of it. You shall not touch it, lest you die.'"" "; - var expectedDenormalizedUsfm = + string expectedDenormalizedUsfm = @"\id GEN \c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?” @@ -31,7 +31,9 @@ of the field which Yahweh God had made. \v 3 but not the fruit of the tree which is in the middle of the garden. God has said, ‘You shall not eat of it. You shall not touch it, lest you die.’” "; - var standardEnglishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + QuoteConvention standardEnglishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + "standard_english" + ); Assert.IsNotNull(standardEnglishQuoteConvention); var quotationMarkDenormalizationFirstPass = new QuotationMarkDenormalizationFirstPass( @@ -40,7 +42,8 @@ of the field which Yahweh God had made. ); UsfmParser.Parse(normalizedUsfm, quotationMarkDenormalizationFirstPass); - var bestChapterStrategies = quotationMarkDenormalizationFirstPass.FindBestChapterStrategies(); + List bestChapterStrategies = + quotationMarkDenormalizationFirstPass.FindBestChapterStrategies(); var quotationMarkDenormalizer = new QuotationMarkDenormalizationUsfmUpdateBlockHandler( standardEnglishQuoteConvention, @@ -51,7 +54,7 @@ of the field which Yahweh God had made. 
var updater = new UpdateUsfmParserHandler(updateBlockHandlers: [quotationMarkDenormalizer]); UsfmParser.Parse(normalizedUsfm, updater); - var actualDenormalizedUsfm = updater.GetUsfm(); + string actualDenormalizedUsfm = updater.GetUsfm(); Assert.That(actualDenormalizedUsfm, Is.EqualTo(expectedDenormalizedUsfm).IgnoreLineEndings()); } diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs index 69d4f9ce..c265d36f 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs @@ -17,34 +17,34 @@ of the field which Yahweh God had made. [Test] public void SimpleEnglishQuoteDenormalization() { - var normalizedUsfm = SimpleNormalizedUsfm; - var expectedUsfm = ( + string normalizedUsfm = SimpleNormalizedUsfm; + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" ); - var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "standard_english"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "standard_english"); AssertUsfmEqual(observedUsfm, expectedUsfm); } [Test] public void SimpleBritishEnglishQuoteDenormalization() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, 'Has God really said, ""You shall not eat of any tree of the garden""?' "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + "the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?’" ); - var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "british_english", "british_english"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "british_english", "british_english"); AssertUsfmEqual(observedUsfm, expectedUsfm); // no denormalization should be needed for this example @@ -53,14 +53,14 @@ of the field which Yahweh God had made. [Test] public void SimpleTypewriterEnglishQuoteDenormalization() { - var normalizedUsfm = SimpleNormalizedUsfm; - var expectedUsfm = ( + string normalizedUsfm = SimpleNormalizedUsfm; + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, \"Has God really said, 'You shall not eat of any tree of the garden'?\"" ); - var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "typewriter_english"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "typewriter_english"); AssertUsfmEqual(observedUsfm, expectedUsfm); // some of the quotes shouldn't need to be denormalized @@ -69,14 +69,18 @@ public void SimpleTypewriterEnglishQuoteDenormalization() [Test] public void SimpleHybridTypewriterEnglishQuoteDenormalization() { - var normalizedUsfm = SimpleNormalizedUsfm; - var expectedUsfm = ( + string normalizedUsfm = SimpleNormalizedUsfm; + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + "the woman, “Has God really said, 'You shall not eat of any tree of the garden'?”" ); - var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "hybrid_typewriter_english"); + string observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "standard_english", + "hybrid_typewriter_english" + ); AssertUsfmEqual(observedUsfm, expectedUsfm); // the single guillemets shouldn't need to be denormalized @@ -86,20 +90,20 @@ public void SimpleHybridTypewriterEnglishQuoteDenormalization() [Test] public void SimpleFrenchQuoteDenormalization() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, ""Has God really said, ‹You shall not eat of any tree of the garden›?"" "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, «Has God really said, ‹You shall not eat of any tree of the garden›?»" ); - var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_french", "standard_french"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_french", "standard_french"); AssertUsfmEqual(observedUsfm, expectedUsfm); // the unusual quotation marks shouldn't need to be denormalized @@ -108,20 +112,20 @@ of the field which Yahweh God had made. [Test] public void SimpleTypewriterFrenchQuoteDenormalization() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, <?>> "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + "the woman, <?>>" ); - var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "typewriter_french", "typewriter_french"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "typewriter_french", "typewriter_french"); AssertUsfmEqual(observedUsfm, expectedUsfm); // the 1st- and 2nd-level quotes are denormalized to identical marks @@ -130,40 +134,40 @@ of the field which Yahweh God had made. [Test] public void SimpleWesternEuropeanQuoteDenormalization() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, ""Has God really said, ""You shall not eat of any tree of the garden""?"" "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, «Has God really said, “You shall not eat of any tree of the garden”?»" ); - var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "western_european", "western_european"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "western_european", "western_european"); AssertUsfmEqual(observedUsfm, expectedUsfm); } [Test] public void SimpleTypewriterWesternEuropeanQuoteDenormalization() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, <> "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, <>" ); - var observedUsfm = DenormalizeQuotationMarks( + string observedUsfm = DenormalizeQuotationMarks( normalizedUsfm, "typewriter_western_european", "typewriter_western_european" @@ -174,20 +178,20 @@ of the field which Yahweh God had made. 
[Test] public void SimpleTypewriterWesternEuropeanVariantQuoteDenormalization() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, ""Has God really said, ?"" "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, \"Has God really said, ?\"" ); - var observedUsfm = DenormalizeQuotationMarks( + string observedUsfm = DenormalizeQuotationMarks( normalizedUsfm, "typewriter_western_european_variant", "typewriter_western_european_variant" @@ -198,20 +202,20 @@ of the field which Yahweh God had made. [Test] public void SimpleHybridTypewriterWesternEuropeanQuoteDenormalization() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, ""Has God really said, ""You shall not eat of any tree of the garden""?"" "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, «Has God really said, \"You shall not eat of any tree of the garden\"?»" ); - var observedUsfm = DenormalizeQuotationMarks( + string observedUsfm = DenormalizeQuotationMarks( normalizedUsfm, "hybrid_typewriter_western_european", "hybrid_typewriter_western_european" @@ -222,40 +226,40 @@ of the field which Yahweh God had made. [Test] public void SimpleCentralEuropeanQuoteDenormalization() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to the woman, ""Has God really said, ""You shall not eat of any tree of the garden""?"" "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, „Has God really said, ‚You shall not eat of any tree of the garden‘?“" ); - var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "central_european", "central_european"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "central_european", "central_european"); AssertUsfmEqual(observedUsfm, expectedUsfm); } [Test] public void SimpleCentralEuropeanGuillemetsQuoteDenormalization() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, ""Has God really said, ›You shall not eat of any tree of the garden‹?"" "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, »Has God really said, ›You shall not eat of any tree of the garden‹?«" ); - var observedUsfm = DenormalizeQuotationMarks( + string observedUsfm = DenormalizeQuotationMarks( normalizedUsfm, "central_european_guillemets", "central_european_guillemets" @@ -266,96 +270,96 @@ of the field which Yahweh God had made. [Test] public void SimpleSwedishQuoteDenormalization() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, ""Has God really said, 'You shall not eat of any tree of the garden'?"" "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + "the woman, ”Has God really said, ’You shall not eat of any tree of the garden’?”" ); - var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_swedish", "standard_swedish"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_swedish", "standard_swedish"); AssertUsfmEqual(observedUsfm, expectedUsfm); } [Test] public void SimpleFinnishQuoteDenormalization() { - var normalizedUsfm = SimpleNormalizedUsfm; - var expectedUsfm = ( + string normalizedUsfm = SimpleNormalizedUsfm; + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, »Has God really said, ’You shall not eat of any tree of the garden’?»" ); - var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "standard_finnish"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "standard_finnish"); AssertUsfmEqual(observedUsfm, expectedUsfm); } [Test] public void SimpleEasternEuropeanQuoteDenormalization() { - var normalizedUsfm = SimpleNormalizedUsfm; - var expectedUsfm = ( + string normalizedUsfm = SimpleNormalizedUsfm; + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, „Has God really said, ‚You shall not eat of any tree of the garden’?”" ); - var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "eastern_european"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "eastern_european"); AssertUsfmEqual(observedUsfm, expectedUsfm); } [Test] public void SimpleRussianQuoteDenormalization() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to the woman, ""Has God really said, ""You shall not eat of any tree of the garden""?"" "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, «Has God really said, „You shall not eat of any tree of the garden“?»" ); - var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_russian", "standard_russian"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_russian", "standard_russian"); AssertUsfmEqual(observedUsfm, expectedUsfm); } [Test] public void SimpleArabicQuoteDenormalization() { - var normalizedUsfm = SimpleNormalizedUsfm; - var expectedUsfm = ( + string normalizedUsfm = SimpleNormalizedUsfm; + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, ”Has God really said, ’You shall not eat of any tree of the garden‘?“" ); - var observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "standard_arabic"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "standard_arabic"); AssertUsfmEqual(observedUsfm, expectedUsfm); } [Test] public void FallbackQuotationDenormalizationSameAsFull() { - var normalizedUsfm = SimpleNormalizedUsfm; - var expectedUsfm = ( + string normalizedUsfm = SimpleNormalizedUsfm; + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" ); - var observedUsfm = DenormalizeQuotationMarks( + string observedUsfm = DenormalizeQuotationMarks( normalizedUsfm, "standard_english", "standard_english", @@ -367,20 +371,20 @@ public void FallbackQuotationDenormalizationSameAsFull() [Test] public void FallbackQuotationDenormalizationIncorrectlyNested() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, ""Has God really said, ""You shall not eat of any tree of the garden""?"" "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, “Has God really said, “You shall not eat of any tree of the garden”?”" ); - var observedUsfm = DenormalizeQuotationMarks( + string observedUsfm = DenormalizeQuotationMarks( normalizedUsfm, "standard_english", "standard_english", @@ -392,20 +396,20 @@ of the field which Yahweh God had made. [Test] public void FallbackQuotationDenormalizationIncorrectlyNestedSecondCase() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, 'Has God really said, ""You shall not eat of any tree of the garden""?' "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?’" ); - var observedUsfm = DenormalizeQuotationMarks( + string observedUsfm = DenormalizeQuotationMarks( normalizedUsfm, "standard_english", "standard_english", @@ -417,20 +421,20 @@ of the field which Yahweh God had made. 
[Test] public void FallbackQuotationDenormalizationUnclosedQuote() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, ""Has God really said, You shall not eat of any tree of the garden'?"" "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" ); - var observedUsfm = DenormalizeQuotationMarks( + string observedUsfm = DenormalizeQuotationMarks( normalizedUsfm, "standard_english", "standard_english", @@ -468,8 +472,8 @@ public QuotationMarkDenormalizationUsfmUpdateBlockHandler CreateQuotationDenorma ) { quotationDenormalizationSettings ??= new QuotationMarkUpdateSettings(); - var sourceQuoteConvention = GetQuoteConventionByName(sourceQuoteConventionName); - var targetQuoteConvention = GetQuoteConventionByName(targetQuoteConventionName); + QuoteConvention sourceQuoteConvention = GetQuoteConventionByName(sourceQuoteConventionName); + QuoteConvention targetQuoteConvention = GetQuoteConventionByName(targetQuoteConventionName); return new QuotationMarkDenormalizationUsfmUpdateBlockHandler( sourceQuoteConvention, diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs index e4dc004b..2f4ba189 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs @@ -204,7 +204,7 @@ public void CheckWhetherFallbackModeWillWorkWithNormalizedConventions() public void ChooseBestActionForChapter() { // Verse text with no issues - var actualAction = RunFirstPassOnChapter( + QuotationMarkUpdateStrategy actualAction = RunFirstPassOnChapter( [ "Now the serpent was more subtle than any 
animal " + "of the field which Yahweh God had made. " @@ -214,7 +214,7 @@ public void ChooseBestActionForChapter() "standard_english", "standard_english" ); - var expectedAction = QuotationMarkUpdateStrategy.ApplyFull; + QuotationMarkUpdateStrategy expectedAction = QuotationMarkUpdateStrategy.ApplyFull; Assert.That(actualAction, Is.EqualTo(expectedAction)); // Verse text with unpaired opening quotation mark @@ -312,7 +312,7 @@ public void ChooseBestActionBasedOnObservedIssues() firstPassAnalyzer.WillFallbackModeWork = false; // Test with no issue - var bestAction = firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues([]); + QuotationMarkUpdateStrategy bestAction = firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues([]); Assert.That(bestAction, Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFull)); // Test with one issue @@ -367,7 +367,7 @@ public void ChooseBestActionBasedOnObservedIssuesWithBasicFallback() firstPassAnalyzer.WillFallbackModeWork = true; // Test with no issues - var bestAction = firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues([]); + QuotationMarkUpdateStrategy bestAction = firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues([]); Assert.That(bestAction, Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFull)); // Test with one issue @@ -417,7 +417,7 @@ public void ChooseBestActionBasedOnObservedIssuesWithBasicFallback() [Test] public void NoIssuesInUsfm() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. @@ -425,7 +425,11 @@ of the field which Yahweh God had made. 
‘You shall not eat of any tree of the garden’?” "; List expectedActions = [QuotationMarkUpdateStrategy.ApplyFull]; - var observedActions = RunFirstPass(normalizedUsfm, "standard_english", "standard_english"); + List observedActions = RunFirstPass( + normalizedUsfm, + "standard_english", + "standard_english" + ); Assert.That(expectedActions.SequenceEqual(observedActions)); } @@ -433,7 +437,7 @@ of the field which Yahweh God had made. [Test] public void UnpairedOpeningMark() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. @@ -441,7 +445,11 @@ of the field which Yahweh God had made. ‘You shall not eat of any tree of the garden’? "; List expectedActions = [QuotationMarkUpdateStrategy.ApplyFallback]; - var observedActions = RunFirstPass(normalizedUsfm, "standard_english", "standard_english"); + List observedActions = RunFirstPass( + normalizedUsfm, + "standard_english", + "standard_english" + ); Assert.That(expectedActions.SequenceEqual(observedActions)); } @@ -449,7 +457,7 @@ of the field which Yahweh God had made. [Test] public void UnpairedClosingMark() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. @@ -457,7 +465,11 @@ of the field which Yahweh God had made. 
You shall not eat of any tree of the garden?” "; List expectedActions = [QuotationMarkUpdateStrategy.ApplyFallback]; - var observedActions = RunFirstPass(normalizedUsfm, "standard_english", "standard_english"); + List observedActions = RunFirstPass( + normalizedUsfm, + "standard_english", + "standard_english" + ); Assert.That(expectedActions.SequenceEqual(observedActions)); } @@ -465,7 +477,7 @@ You shall not eat of any tree of the garden?” [Test] public void TooDeepNesting() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 “Now the serpent was more “subtle than any animal of the “field which “Yahweh God had made. @@ -473,7 +485,11 @@ of the “field which “Yahweh God had made. “You shall not eat of any tree of the garden? "; List expectedActions = [QuotationMarkUpdateStrategy.ApplyFallback]; - var observedActions = RunFirstPass(normalizedUsfm, "standard_english", "standard_english"); + List observedActions = RunFirstPass( + normalizedUsfm, + "standard_english", + "standard_english" + ); Assert.That(expectedActions.SequenceEqual(observedActions)); } @@ -481,7 +497,7 @@ of the “field which “Yahweh God had made. [Test] public void AmbiguousQuotationMark() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. @@ -489,7 +505,11 @@ of the field which Yahweh God had made. You shall not eat of any tree of the garden? "; List expectedActions = [QuotationMarkUpdateStrategy.Skip]; - var observedActions = RunFirstPass(normalizedUsfm, "typewriter_english", "standard_english"); + List observedActions = RunFirstPass( + normalizedUsfm, + "typewriter_english", + "standard_english" + ); Assert.That(expectedActions.SequenceEqual(observedActions)); } @@ -497,7 +517,7 @@ You shall not eat of any tree of the garden? 
[Test] public void NoIssuesInMultipleChapters() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. @@ -509,7 +529,11 @@ of the field which Yahweh God had made. QuotationMarkUpdateStrategy.ApplyFull, QuotationMarkUpdateStrategy.ApplyFull ]; - var observedActions = RunFirstPass(normalizedUsfm, "standard_english", "standard_english"); + List observedActions = RunFirstPass( + normalizedUsfm, + "standard_english", + "standard_english" + ); Assert.That(expectedActions.SequenceEqual(observedActions)); } @@ -517,7 +541,7 @@ of the field which Yahweh God had made. [Test] public void UnpairedQuotationMarkInSecondChapter() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. @@ -529,7 +553,11 @@ You shall not eat of any tree of the garden?” QuotationMarkUpdateStrategy.ApplyFull, QuotationMarkUpdateStrategy.ApplyFallback ]; - var observedActions = RunFirstPass(normalizedUsfm, "standard_english", "standard_english"); + List observedActions = RunFirstPass( + normalizedUsfm, + "standard_english", + "standard_english" + ); Assert.That(expectedActions.SequenceEqual(observedActions)); } @@ -537,7 +565,7 @@ You shall not eat of any tree of the garden?” [Test] public void UnpairedQuotationMarkInFirstChapter() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had” made. @@ -549,7 +577,11 @@ of the field which Yahweh God had” made. 
QuotationMarkUpdateStrategy.ApplyFallback, QuotationMarkUpdateStrategy.ApplyFull ]; - var observedActions = RunFirstPass(normalizedUsfm, "standard_english", "standard_english"); + List observedActions = RunFirstPass( + normalizedUsfm, + "standard_english", + "standard_english" + ); Assert.That(expectedActions.SequenceEqual(observedActions)); } @@ -557,7 +589,7 @@ of the field which Yahweh God had” made. [Test] public void AmbiguousQuotationMarkInSecondChapter() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. @@ -569,7 +601,11 @@ You shall not""eat of any tree of the garden?"" QuotationMarkUpdateStrategy.ApplyFull, QuotationMarkUpdateStrategy.Skip ]; - var observedActions = RunFirstPass(normalizedUsfm, "typewriter_english", "standard_english"); + List observedActions = RunFirstPass( + normalizedUsfm, + "typewriter_english", + "standard_english" + ); Assert.That(expectedActions.SequenceEqual(observedActions)); } @@ -577,7 +613,7 @@ You shall not""eat of any tree of the garden?"" [Test] public void AmbiguousQuotationMarkInFirstChapter() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field""which Yahweh God had made. @@ -589,7 +625,11 @@ of the field""which Yahweh God had made. QuotationMarkUpdateStrategy.Skip, QuotationMarkUpdateStrategy.ApplyFull ]; - var observedActions = RunFirstPass(normalizedUsfm, "typewriter_english", "standard_english"); + List observedActions = RunFirstPass( + normalizedUsfm, + "typewriter_english", + "standard_english" + ); Assert.That(expectedActions.SequenceEqual(observedActions)); } @@ -597,7 +637,7 @@ of the field""which Yahweh God had made. [Test] public void UnpairedQuotationMarkInBothChapters() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had” made. 
@@ -609,7 +649,11 @@ You shall not eat of any tree of the garden?” QuotationMarkUpdateStrategy.ApplyFallback, QuotationMarkUpdateStrategy.ApplyFallback ]; - var observedActions = RunFirstPass(normalizedUsfm, "standard_english", "standard_english"); + List observedActions = RunFirstPass( + normalizedUsfm, + "standard_english", + "standard_english" + ); Assert.That(expectedActions.SequenceEqual(observedActions)); } @@ -617,7 +661,7 @@ You shall not eat of any tree of the garden?” [Test] public void AmbiguousQuotationMarkInBothChapters() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had""made. @@ -629,7 +673,11 @@ You shall not eat of any""tree of the garden? QuotationMarkUpdateStrategy.Skip, QuotationMarkUpdateStrategy.Skip ]; - var observedActions = RunFirstPass(normalizedUsfm, "typewriter_english", "standard_english"); + List observedActions = RunFirstPass( + normalizedUsfm, + "typewriter_english", + "standard_english" + ); Assert.That(expectedActions.SequenceEqual(observedActions)); } @@ -637,7 +685,7 @@ You shall not eat of any""tree of the garden? [Test] public void UnpairedInFirstAmbiguousInSecond() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made."" @@ -649,7 +697,11 @@ You shall not eat of any""tree of the garden? QuotationMarkUpdateStrategy.ApplyFallback, QuotationMarkUpdateStrategy.Skip ]; - var observedActions = RunFirstPass(normalizedUsfm, "typewriter_english", "standard_english"); + List observedActions = RunFirstPass( + normalizedUsfm, + "typewriter_english", + "standard_english" + ); Assert.That(expectedActions.SequenceEqual(observedActions)); } @@ -657,7 +709,7 @@ You shall not eat of any""tree of the garden? 
[Test] public void AmbiguousInFirstUnpairedInSecond() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God""had made. @@ -669,7 +721,11 @@ You shall not eat of any tree of the garden ? "" QuotationMarkUpdateStrategy.Skip, QuotationMarkUpdateStrategy.ApplyFallback ]; - var observedActions = RunFirstPass(normalizedUsfm, "typewriter_english", "standard_english"); + List observedActions = RunFirstPass( + normalizedUsfm, + "typewriter_english", + "standard_english" + ); Assert.That(expectedActions.SequenceEqual(observedActions)); } @@ -680,10 +736,14 @@ public List RunFirstPass( string targetQuoteConventionName ) { - var sourceQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(sourceQuoteConventionName); + QuoteConvention sourceQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + sourceQuoteConventionName + ); Assert.IsNotNull(sourceQuoteConvention); - var targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(targetQuoteConventionName); + QuoteConvention targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + targetQuoteConventionName + ); Assert.IsNotNull(targetQuoteConvention); var firstPassAnalyzer = new QuotationMarkUpdateFirstPass(sourceQuoteConvention, targetQuoteConvention); @@ -698,10 +758,14 @@ public QuotationMarkUpdateStrategy RunFirstPassOnChapter( string targetQuoteConventionName ) { - var sourceQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(sourceQuoteConventionName); + QuoteConvention sourceQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + sourceQuoteConventionName + ); Assert.IsNotNull(sourceQuoteConvention); - var targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(targetQuoteConventionName); + QuoteConvention targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + targetQuoteConventionName + ); 
Assert.IsNotNull(targetQuoteConvention); var firstPassAnalyzer = new QuotationMarkUpdateFirstPass(sourceQuoteConvention, targetQuoteConvention); @@ -715,7 +779,7 @@ string targetQuoteConventionName public QuoteConvention GetQuoteConventionByName(string name) { - var quoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(name); + QuoteConvention quoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(name); Assert.IsNotNull(quoteConvention); return quoteConvention; } diff --git a/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs index 1cccbdf8..af5a264e 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs @@ -9,7 +9,7 @@ public class QuoteConventionChangingUsfmUpdateBlockHandlerTests [Test] public void QuotesSpanningVerses() { - var inputUsfm = + string inputUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. @@ -17,62 +17,62 @@ of the field which Yahweh God had made. \v 2 “You shall not eat of any tree of the garden”?» "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, “Has God really said, \n" + "\\v 2 ‘You shall not eat of any tree of the garden’?”" ); - var observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); + string observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); AssertUsfmEqual(observedUsfm, expectedUsfm); } [Test] public void SingleEmbed() { - var inputUsfm = + string inputUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal \f + \ft «This is a “footnote”» \f* of the field which Yahweh God had made. 
"; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal " + "\\f + \\ft “This is a ‘footnote’” \\f* of the field which Yahweh God had made." ); - var observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); + string observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); AssertUsfmEqual(observedUsfm, expectedUsfm); } [Test] public void MultipleEmbeds() { - var inputUsfm = + string inputUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal \f + \ft «This is a “footnote”» \f* of the field \f + \ft Second «footnote» here \f* which Yahweh God had made. "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal " + "\\f + \\ft “This is a ‘footnote’” \\f* of the field \\f + \\ft Second " + "“footnote” here \\f* which Yahweh God had made." ); - var observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); + string observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); AssertUsfmEqual(observedUsfm, expectedUsfm); } [Test] public void QuotesInTextAndEmbed() { - var inputUsfm = + string inputUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. @@ -81,21 +81,21 @@ of the field which Yahweh God had made. “You shall not eat of any tree of the garden”?» "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + "the woman, “Has God really \\f + \\ft a “footnote” in the “midst of ‘text’” \\f* " + "said, ‘You shall not eat of any tree of the garden’?”" ); - var observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); + string observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); AssertUsfmEqual(observedUsfm, expectedUsfm); } [Test] public void QuotesInMultipleVersesAndEmbed() { - var inputUsfm = + string inputUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. @@ -105,7 +105,7 @@ of the field which Yahweh God had made. “You shall not eat of any tree of the garden”?» "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, “Has God\n" @@ -113,7 +113,7 @@ of the field which Yahweh God had made. + "said, ‘You shall not eat of any tree of the garden’?”" ); - var observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); + string observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); AssertUsfmEqual(observedUsfm, expectedUsfm); // Fallback mode does not consider the nesting of quotation marks, @@ -123,20 +123,20 @@ of the field which Yahweh God had made. [Test] public void FallbackStrategySameAsFull() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?’ "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" ); - var observedUsfm = ChangeQuotationMarks( + string observedUsfm = ChangeQuotationMarks( normalizedUsfm, "british_english", "standard_english", @@ -148,20 +148,20 @@ of the field which Yahweh God had made. [Test] public void FallbackStrategyIncorrectlyNested() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, ‘Has God really said, ‘You shall not eat of any tree of the garden’?’ "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, “Has God really said, “You shall not eat of any tree of the garden”?”" ); - var observedUsfm = ChangeQuotationMarks( + string observedUsfm = ChangeQuotationMarks( normalizedUsfm, "british_english", "standard_english", @@ -173,20 +173,20 @@ of the field which Yahweh God had made. [Test] public void FallbackStrategyIncorrectlyNestedSecondCase() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?’ "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?”" ); - var observedUsfm = ChangeQuotationMarks( + string observedUsfm = ChangeQuotationMarks( normalizedUsfm, "british_english", "standard_english", @@ -198,20 +198,20 @@ of the field which Yahweh God had made. 
[Test] public void FallbackStrategyUnclosedQuote() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, ‘Has God really said, You shall not eat of any tree of the garden”?’ "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" ); - var observedUsfm = ChangeQuotationMarks( + string observedUsfm = ChangeQuotationMarks( normalizedUsfm, "british_english", "standard_english", @@ -223,32 +223,32 @@ You shall not eat of any tree of the garden”?’ [Test] public void DefaultQuotationMarkUpdateStrategy() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, ""Has God really said, You shall not eat of any tree of the garden'?"" "; - var expectedFullUsfm = ( + string expectedFullUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, “Has God really said, You shall not eat of any tree of the garden'?”" ); - var expectedBasicUsfm = ( + string expectedBasicUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" ); - var expectedSkippedUsfm = ( + string expectedSkippedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + "the woman, \"Has God really said, You shall not eat of any tree of the garden\'?\"" ); - var observedUsfm = ChangeQuotationMarks(normalizedUsfm, "typewriter_english", "standard_english"); + string observedUsfm = ChangeQuotationMarks(normalizedUsfm, "typewriter_english", "standard_english"); AssertUsfmEqual(observedUsfm, expectedFullUsfm); observedUsfm = ChangeQuotationMarks( @@ -279,32 +279,32 @@ You shall not eat of any tree of the garden'?"" [Test] public void SingleChapterQuotationMarkUpdateStrategy() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, ""Has God really said, You shall not eat of any tree of the garden'?"" "; - var expectedFullUsfm = ( + string expectedFullUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, “Has God really said, You shall not eat of any tree of the garden'?”" ); - var expectedBasicUsfm = ( + string expectedBasicUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" ); - var expectedSkippedUsfm = ( + string expectedSkippedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, \"Has God really said, You shall not eat of any tree of the garden\'?\"" ); - var observedUsfm = ChangeQuotationMarks( + string observedUsfm = ChangeQuotationMarks( normalizedUsfm, "typewriter_english", "standard_english", @@ -332,7 +332,7 @@ You shall not eat of any tree of the garden'?"" [Test] public void MultipleChapterSameStrategy() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle"" than any animal of the field which Yahweh God had made. 
@@ -340,21 +340,21 @@ of the field which Yahweh God had made. \v 1 He said to the woman, ""Has God really said, You shall not eat of any tree of the garden'?"" "; - var expectedFullUsfm = ( + string expectedFullUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle\" than any animal of the field which Yahweh God had made.\n" + "\\c 2\n" + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden'?”" ); - var expectedFallbackUsfm = ( + string expectedFallbackUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + "\\c 2\n" + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden’?”" ); - var observedUsfm = ChangeQuotationMarks( + string observedUsfm = ChangeQuotationMarks( normalizedUsfm, "typewriter_english", "standard_english", @@ -382,7 +382,7 @@ You shall not eat of any tree of the garden'?"" [Test] public void MultipleChapterMultipleStrategies() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle"" than any animal of the field which Yahweh God had made. @@ -390,28 +390,28 @@ of the field which Yahweh God had made. 
\v 1 He said to the woman, ""Has God really said, You shall not eat of any tree of the garden'?"" "; - var expectedFullThenFallbackUsfm = ( + string expectedFullThenFallbackUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle\" than any animal of the field which Yahweh God had made.\n" + "\\c 2\n" + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden’?”" ); - var expectedFallbackThenFullUsfm = ( + string expectedFallbackThenFullUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + "\\c 2\n" + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden'?”" ); - var expectedFallbackThenSkipUsfm = ( + string expectedFallbackThenSkipUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + "\\c 2\n" + "\\v 1 He said to the woman, \"Has God really said, You shall not eat of any tree of the garden\'?\"" ); - var observedUsfm = ChangeQuotationMarks( + string observedUsfm = ChangeQuotationMarks( normalizedUsfm, "typewriter_english", "standard_english", @@ -445,20 +445,20 @@ You shall not eat of any tree of the garden'?"" [Test] public void MultiCharacterQuotationMarksInSourceQuoteConvention() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, <?>> "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" ); - var observedUsfm = ChangeQuotationMarks( + string observedUsfm = ChangeQuotationMarks( normalizedUsfm, "typewriter_french", "standard_english", @@ -470,20 +470,20 @@ of the field which Yahweh God had made. 
[Test] public void MultiCharacterQuotationMarksInTargetQuoteConvention() { - var normalizedUsfm = + string normalizedUsfm = @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?” "; - var expectedUsfm = ( + string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, <?>>" ); - var observedUsfm = ChangeQuotationMarks( + string observedUsfm = ChangeQuotationMarks( normalizedUsfm, "standard_english", "typewriter_french", @@ -495,7 +495,7 @@ of the field which Yahweh God had made. [Test] public void ProcessScriptureElement() { - var quoteConventionChanger = ( + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "british_english") ); var quotationMarkFinder = new MockQuotationMarkFinder(); @@ -517,7 +517,7 @@ public void ProcessScriptureElement() [Test] public void CreateTextSegmentsBasic() { - var quoteConventionChanger = ( + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") ); @@ -525,7 +525,7 @@ public void CreateTextSegmentsBasic() UsfmUpdateBlockElementType.Text, tokens: [new UsfmToken("test segment")] ); - var textSegments = quoteConventionChanger.InternalCreateTextSegments(updateElement); + List textSegments = quoteConventionChanger.InternalCreateTextSegments(updateElement); Assert.That(textSegments, Has.Count.EqualTo(1)); Assert.That(textSegments[0].Text, Is.EqualTo("test segment")); @@ -538,7 +538,7 @@ public void CreateTextSegmentsBasic() [Test] public void CreateTextSegmentsWithPrecedingMarkers() { - var quoteConventionChanger = ( + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( 
CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") ); @@ -551,7 +551,7 @@ public void CreateTextSegmentsWithPrecedingMarkers() new UsfmToken("test segment"), ] ); - var textSegments = quoteConventionChanger.InternalCreateTextSegments(updateElement); + List textSegments = quoteConventionChanger.InternalCreateTextSegments(updateElement); Assert.That(textSegments, Has.Count.EqualTo(1)); Assert.That(textSegments[0].Text, Is.EqualTo("test segment")); @@ -566,7 +566,7 @@ public void CreateTextSegmentsWithPrecedingMarkers() [Test] public void CreateTextSegmentsWithMultipleTextTokens() { - var quoteConventionChanger = ( + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") ); @@ -583,7 +583,7 @@ public void CreateTextSegmentsWithMultipleTextTokens() new UsfmToken(UsfmTokenType.Paragraph, null, null, null), ] ); - var textSegments = quoteConventionChanger.InternalCreateTextSegments(updateElement); + List textSegments = quoteConventionChanger.InternalCreateTextSegments(updateElement); Assert.That(textSegments, Has.Count.EqualTo(2)); Assert.That(textSegments[0].Text, Is.EqualTo("test segment1")); @@ -605,12 +605,12 @@ public void CreateTextSegmentsWithMultipleTextTokens() [Test] public void CreateTextSegment() { - var quoteConventionChanger = ( + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") ); var usfmToken = new UsfmToken("test segment"); - var segment = quoteConventionChanger.InternalCreateTextSegment(usfmToken); + TextSegment segment = quoteConventionChanger.InternalCreateTextSegment(usfmToken); Assert.IsNotNull(segment); Assert.That(segment.Text, Is.EqualTo("test segment")); @@ -622,7 +622,7 @@ public void CreateTextSegment() [Test] public void SetPreviousAndNextForSegments() { - var 
quoteConventionChanger = ( + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") ); @@ -768,7 +768,7 @@ public void UpdateQuotationMarks() [Test] public void CheckForChapterChange() { - var quoteConventionChanger = ( + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") ); @@ -788,7 +788,7 @@ public void CheckForChapterChange() [Test] public void StartNewChapter() { - var quoteConventionChanger = ( + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( CreateQuoteConventionChangingUsfmUpdateBlockHandler( "standard_english", "standard_english", @@ -813,7 +813,7 @@ public void StartNewChapter() ); quoteConventionChanger.InternalStartNewChapter(1); - var segment = quoteConventionChanger.InternalNextScriptureTextSegmentBuilder.Build(); + TextSegment segment = quoteConventionChanger.InternalNextScriptureTextSegmentBuilder.Build(); Assert.That(quoteConventionChanger.InternalCurrentStrategy, Is.EqualTo(QuotationMarkUpdateStrategy.Skip)); Assert.That(segment.ImmediatePrecedingMarker, Is.EqualTo(UsfmMarkerType.Chapter)); Assert.That(segment.Text, Is.EqualTo("")); @@ -838,7 +838,7 @@ private static string ChangeQuotationMarks( ) { quotationMarkUpdateSettings ??= new QuotationMarkUpdateSettings(); - var quoteConventionChanger = ( + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( CreateQuoteConventionChangingUsfmUpdateBlockHandler( sourceQuoteConventionName, targetQuoteConventionName, @@ -859,10 +859,14 @@ private static MockQuoteConventionChangingUsfmUpdateBlockHandler CreateQuoteConv ) { quotationMarkUpdateSettings ??= new QuotationMarkUpdateSettings(); - var sourceQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(sourceQuoteConventionName); + QuoteConvention 
sourceQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + sourceQuoteConventionName + ); Assert.IsNotNull(sourceQuoteConvention); - var targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(targetQuoteConventionName); + QuoteConvention targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + targetQuoteConventionName + ); Assert.IsNotNull(targetQuoteConvention); return new MockQuoteConventionChangingUsfmUpdateBlockHandler( @@ -994,7 +998,7 @@ IReadOnlyList quoteMatches { NumTimesCalled++; int currentDepth = 1; - var currentDirection = QuotationMarkDirection.Opening; + QuotationMarkDirection currentDirection = QuotationMarkDirection.Opening; foreach (QuotationMarkStringMatch quoteMatch in quoteMatches) { yield return quoteMatch.Resolve(currentDepth, currentDirection); diff --git a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index 4a9407f5..cb1092a6 100644 --- a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -39,7 +39,7 @@ public void GetUsfm_StripAllText() new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), new UpdateUsfmRow(ScrRef("MAT 1:3"), "Update 3") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \r keep this reference @@ -60,7 +60,7 @@ public void GetUsfm_StripAllText() styleBehavior: UpdateUsfmMarkerBehavior.Preserve ); - var result = + string result = @"\id MAT \c 1 \r keep this reference @@ -135,7 +135,7 @@ public void GetUsfm_StripParagraphs_PreserveParagraphStyles() AssertUsfmEquals(target, result); - var targetDiffParagraph = UpdateUsfm( + string targetDiffParagraph = UpdateUsfm( rows, usfm, textBehavior: UpdateUsfmTextBehavior.StripExisting, @@ -183,7 +183,7 @@ public void GetUsfm_PreserveParagraphs() AssertUsfmEquals(target, result); - var targetDiffParagraph = UpdateUsfm( + string targetDiffParagraph = 
UpdateUsfm( rows, usfm, textBehavior: UpdateUsfmTextBehavior.StripExisting, @@ -253,7 +253,7 @@ public void GetUsfm_PreferExisting() new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), new UpdateUsfmRow(ScrRef("MAT 1:2"), "Update 2"), }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1 Some text @@ -261,7 +261,7 @@ public void GetUsfm_PreferExisting() \v 3 Other text "; string target = UpdateUsfm(rows, usfm, textBehavior: UpdateUsfmTextBehavior.PreferExisting); - var result = + string result = @"\id MAT - Test \c 1 \v 1 Some text @@ -301,13 +301,13 @@ public void GetUsfm_Verse_StripNote() public void GetUsfm_Verse_ReplaceWithNote() { var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "updated text") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1 Chapter \add one\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one. "; - var target = UpdateUsfm(rows, usfm); - var result = + string target = UpdateUsfm(rows, usfm); + string result = @"\id MAT - Test \c 1 \v 1 updated text \f + \fr 2:1: \ft This is a footnote.\f* @@ -670,7 +670,7 @@ public void GetUsfm_StripParagraphs() new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update Verse 1") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \p This is a paragraph before any verses @@ -683,7 +683,7 @@ public void GetUsfm_StripParagraphs() "; string target = UpdateUsfm(rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve); - var resultP = + string resultP = @"\id MAT - Test \c 1 \p This is a paragraph before any verses @@ -697,7 +697,7 @@ public void GetUsfm_StripParagraphs() AssertUsfmEquals(target, resultP); target = UpdateUsfm(rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Strip); - var resultS = + string resultS = @"\id MAT - Test \c 1 \p This is a paragraph before any verses @@ -717,14 +717,14 @@ public void GetUsfm_PreservationRawStrings() new UpdateUsfmRow(ScrRef("MAT 1:1"), @"Update all in one row \f \fr 1.1 \ft Some note \f*") }; - var usfm = + string usfm = @"\id MAT - Test \c 
1 \v 1 \f \fr 1.1 \ft Some note \f*Hello World "; string target = UpdateUsfm(rows, usfm, embedBehavior: UpdateUsfmMarkerBehavior.Strip); - var result = + string result = @"\id MAT - Test \c 1 \v 1 Update all in one row \f \fr 1.1 \ft Some note \f* @@ -737,14 +737,14 @@ public void GetUsfm_BeginningOfVerseEmbed() { var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), @"Updated text") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1 \f \fr 1.1 \ft Some note \f* Text after note "; string target = UpdateUsfm(rows, usfm, embedBehavior: UpdateUsfmMarkerBehavior.Strip); - var result = + string result = @"\id MAT - Test \c 1 \v 1 Updated text @@ -756,13 +756,13 @@ public void GetUsfm_BeginningOfVerseEmbed() public void CrossReferenceDontUpdate() { var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1/1:x"), "Update the cross reference"), }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1 Cross reference verse \x - \xo 2:3-4 \xt Cool Book 3:24 \xta The annotation \x* and more content. "; - var target = UpdateUsfm(rows, usfm); - var result = + string target = UpdateUsfm(rows, usfm); + string result = @"\id MAT - Test \c 1 \v 1 Cross reference verse \x - \xo 2:3-4 \xt Cool Book 3:24 \xta The annotation \x* and more content. @@ -774,13 +774,13 @@ public void CrossReferenceDontUpdate() public void PreserveFig() { var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update"), }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1 initial text \fig stuff\fig* more text and more. 
"; - var target = UpdateUsfm(rows, usfm); - var result = + string target = UpdateUsfm(rows, usfm); + string result = @"\id MAT - Test \c 1 \v 1 Update \fig stuff\fig* @@ -796,13 +796,13 @@ public void NoteExplicitEndMarkers() new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update text"), new UpdateUsfmRow(ScrRef("MAT 1:1/1:f"), "Update note"), }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1 initial text \f + \fr 2.4\fr* \fk The \+nd Lord\+nd*:\fk* \ft See \+nd Lord\+nd* in Word List.\ft*\f* and the end. "; - var target = UpdateUsfm(rows, usfm); - var result = + string target = UpdateUsfm(rows, usfm); + string result = @"\id MAT - Test \c 1 \v 1 Update text \f + \fr 2.4\fr* \fk The \+nd Lord\+nd*:\fk* \ft See \+nd Lord\+nd* in Word List.\ft*\f* @@ -810,7 +810,7 @@ public void NoteExplicitEndMarkers() AssertUsfmEquals(target, result); target = UpdateUsfm(rows, usfm, embedBehavior: UpdateUsfmMarkerBehavior.Strip); - var result2 = + string result2 = @"\id MAT - Test \c 1 \v 1 Update text @@ -822,7 +822,7 @@ public void NoteExplicitEndMarkers() public void UpdateBlock_Verse_PreserveParas() { var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1 verse 1 \p inner verse paragraph @@ -852,7 +852,7 @@ public void UpdateBlock_Verse_PreserveParas() public void UpdateBlock_Verse_StripParas() { var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1 verse 1 \p inner verse paragraph @@ -882,7 +882,7 @@ public void UpdateBlock_Verse_StripParas() public void UpdateBlock_Verse_Range() { var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1-3 verse 1 through 3 @@ -910,7 +910,7 @@ public void UpdateBlock_Verse_Range() public void UpdateBlock_Footnote_PreserveEmbeds() { var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + 
string usfm = @"\id MAT - Test \c 1 \v 1 verse\f \fr 1.1 \ft Some note \f* 1 @@ -940,7 +940,7 @@ public void UpdateBlock_Footnote_PreserveEmbeds() public void UpdateBlock_Footnote_StripEmbeds() { var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1 verse\f \fr 1.1 \ft Some note \f* 1 @@ -970,7 +970,7 @@ public void UpdateBlock_Footnote_StripEmbeds() public void UpdateBlock_NonVerse() { var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:0/1:s"), "Updated section Header") }; - var usfm = + string usfm = @"\id MAT - Test \s Section header \c 1 @@ -994,7 +994,7 @@ public void UpdateBlock_NonVerse() public void UpdateBlock_Verse_PreserveStyles() { var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1 verse \bd 1\bd* @@ -1026,7 +1026,7 @@ public void UpdateBlock_Verse_PreserveStyles() public void UpdateBlock_Verse_StripStyles() { var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1 verse \bd 1\bd* @@ -1058,7 +1058,7 @@ public void UpdateBlock_Verse_StripStyles() public void UpdateBlock_Verse_SectionHeader() { var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \p @@ -1096,7 +1096,7 @@ public void UpdateBlock_Verse_SectionHeader() public void UpdateBlock_Verse_SectionHeaderInVerse() { var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \p @@ -1129,7 +1129,7 @@ public void UpdateBlock_Verse_SectionHeaderInVerse() public void UpdateBlock_NonVerse_ParagraphEndOfVerse() { var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \p @@ -1166,7 +1166,7 @@ public void GetUsfm_HeaderReferenceParagraphs() new UpdateUsfmRow(ScrRef("MAT 2:2"), 
"new verse 2") }; - var usfm = + string usfm = @"\id MAT \c 1 \s1 beginning-of-chapter header @@ -1187,7 +1187,7 @@ public void GetUsfm_HeaderReferenceParagraphs() "; string target = UpdateUsfm(rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Strip); - var resultP = + string resultP = @"\id MAT \c 1 \s1 beginning-of-chapter header @@ -1217,7 +1217,7 @@ public void GetUsfm_PreferExisting_AddRemark() new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), new UpdateUsfmRow(ScrRef("MAT 1:2"), "Update 2"), }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1 Some text @@ -1230,7 +1230,7 @@ public void GetUsfm_PreferExisting_AddRemark() textBehavior: UpdateUsfmTextBehavior.PreferExisting, remarks: ["New remark"] ); - var result = + string result = @"\id MAT - Test \rem New remark \c 1 @@ -1298,8 +1298,8 @@ private static string UpdateUsfm( private static void AssertUsfmEquals(string target, string truth) { Assert.That(target, Is.Not.Null); - var target_lines = target.Split(["\n"], StringSplitOptions.None); - var truth_lines = truth.Split(["\n"], StringSplitOptions.None); + string[] target_lines = target.Split(["\n"], StringSplitOptions.None); + string[] truth_lines = truth.Split(["\n"], StringSplitOptions.None); for (int i = 0; i < truth_lines.Length; i++) { Assert.That(target_lines[i].Trim(), Is.EqualTo(truth_lines[i].Trim()), message: $"Line {i}"); @@ -1312,7 +1312,7 @@ private static void AssertUpdateBlockEquals( params (UsfmUpdateBlockElementType, string, bool)[] expectedElements ) { - var parsedExtractedRefs = expectedRefs.Select(r => ScriptureRef.Parse(r)); + IEnumerable parsedExtractedRefs = expectedRefs.Select(r => ScriptureRef.Parse(r)); Assert.That(block.Refs.SequenceEqual(parsedExtractedRefs)); Assert.That(block.Elements.Count, Is.EqualTo(expectedElements.Length)); foreach ( diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs 
b/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs index 4b9562f2..3576ed95 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs @@ -281,7 +281,7 @@ public void AddQuotationContinuer() var continuerState = new TestQuoteContinuerState(); - var result1 = continuerState.AddQuoteContinuer( + QuotationMarkMetadata result1 = continuerState.AddQuoteContinuer( new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), resolverState, QuoteContinuerStyle.English @@ -300,7 +300,7 @@ public void AddQuotationContinuer() ) ); - var result2 = continuerState.AddQuoteContinuer( + QuotationMarkMetadata result2 = continuerState.AddQuoteContinuer( new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1), resolverState, QuoteContinuerStyle.Spanish @@ -320,7 +320,7 @@ public void AddQuotationContinuer() ); Assert.That(continuerState.InternalContinuerStyle, Is.EqualTo(QuoteContinuerStyle.Spanish)); - var result3 = continuerState.AddQuoteContinuer( + QuotationMarkMetadata result3 = continuerState.AddQuoteContinuer( new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), resolverState, QuoteContinuerStyle.English @@ -343,7 +343,7 @@ public void AddQuotationContinuer() [Test] public void IsEnglishQuotationContinuer() { - var standardEnglish = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + QuoteConvention standardEnglish = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); Assert.IsNotNull(standardEnglish); var settings = new QuoteConventionDetectionResolutionSettings(new QuoteConventionSet([standardEnglish])); @@ -644,7 +644,9 @@ public void IsEnglishQuotationContinuer() [Test] public void IsSpanishQuotationContinuer() { - var westernEuropeanQuoteConvention = 
QuoteConventions.Standard.GetQuoteConventionByName("western_european"); + QuoteConvention westernEuropeanQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + "western_european" + ); Assert.IsNotNull(westernEuropeanQuoteConvention); var settings = new QuoteConventionDetectionResolutionSettings( @@ -947,7 +949,9 @@ public void IsSpanishQuotationContinuer() [Test] public void IsOpeningQuote() { - var centralEuropeanQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("central_european")); + QuoteConvention centralEuropeanQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("central_european") + ); Assert.IsNotNull(centralEuropeanQuoteConvention); var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([centralEuropeanQuoteConvention]) @@ -960,7 +964,9 @@ public void IsOpeningQuote() quotationContinuerState ); - var britishEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("british_english")); + QuoteConvention britishEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("british_english") + ); Assert.IsNotNull(britishEnglishQuoteConvention); var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([britishEnglishQuoteConvention]) @@ -971,7 +977,9 @@ public void IsOpeningQuote() quotationContinuerState ); - var standardSwedishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish")); + QuoteConvention standardSwedishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish") + ); Assert.IsNotNull(standardSwedishQuoteConvention); var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardSwedishQuoteConvention]) @@ -1283,7 +1291,9 @@ public void IsOpeningQuote() [Test] public void IsClosingQuote() { - var centralEuropeanQuoteConvention = 
(QuoteConventions.Standard.GetQuoteConventionByName("central_european")); + QuoteConvention centralEuropeanQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("central_european") + ); Assert.IsNotNull(centralEuropeanQuoteConvention); var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([centralEuropeanQuoteConvention]) @@ -1296,7 +1306,9 @@ public void IsClosingQuote() quotationContinuerState ); - var britishEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("british_english")); + QuoteConvention britishEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("british_english") + ); Assert.IsNotNull(britishEnglishQuoteConvention); var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([britishEnglishQuoteConvention]) @@ -1307,7 +1319,9 @@ public void IsClosingQuote() quotationContinuerState ); - var standardSwedishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish")); + QuoteConvention standardSwedishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish") + ); Assert.IsNotNull(standardSwedishQuoteConvention); var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardSwedishQuoteConvention]) @@ -1318,7 +1332,9 @@ public void IsClosingQuote() quotationContinuerState ); - var standardFrenchQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_french")); + QuoteConvention standardFrenchQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_french") + ); Assert.IsNotNull(standardFrenchQuoteConvention); var standardFrenchResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardFrenchQuoteConvention]) @@ -1547,7 +1563,9 @@ public void IsClosingQuote() [Test] public void 
IsMalformedOpeningQuote() { - var centralEuropeanQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("central_european")); + QuoteConvention centralEuropeanQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("central_european") + ); Assert.IsNotNull(centralEuropeanQuoteConvention); var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([centralEuropeanQuoteConvention]) @@ -1560,7 +1578,9 @@ public void IsMalformedOpeningQuote() quotationContinuerState ); - var britishEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("british_english")); + QuoteConvention britishEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("british_english") + ); Assert.IsNotNull(britishEnglishQuoteConvention); var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([britishEnglishQuoteConvention]) @@ -1571,7 +1591,9 @@ public void IsMalformedOpeningQuote() quotationContinuerState ); - var standardSwedishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish")); + QuoteConvention standardSwedishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish") + ); Assert.IsNotNull(standardSwedishQuoteConvention); var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardSwedishQuoteConvention]) @@ -1846,7 +1868,9 @@ public void IsMalformedOpeningQuote() [Test] public void IsMalformedClosingQuote() { - var centralEuropeanQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("central_european")); + QuoteConvention centralEuropeanQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("central_european") + ); Assert.IsNotNull(centralEuropeanQuoteConvention); var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( new 
QuoteConventionSet([centralEuropeanQuoteConvention]) @@ -1859,7 +1883,9 @@ public void IsMalformedClosingQuote() quotationContinuerState ); - var britishEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("british_english")); + QuoteConvention britishEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("british_english") + ); Assert.IsNotNull(britishEnglishQuoteConvention); var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([britishEnglishQuoteConvention]) @@ -1870,7 +1896,9 @@ public void IsMalformedClosingQuote() quotationContinuerState ); - var standardSwedishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish")); + QuoteConvention standardSwedishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish") + ); Assert.IsNotNull(standardSwedishQuoteConvention); var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardSwedishQuoteConvention]) @@ -2122,7 +2150,9 @@ public void IsMalformedClosingQuote() [Test] public void IsUnpairedClosingQuote() { - var centralEuropeanQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("central_european")); + QuoteConvention centralEuropeanQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("central_european") + ); Assert.IsNotNull(centralEuropeanQuoteConvention); var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([centralEuropeanQuoteConvention]) @@ -2135,7 +2165,9 @@ public void IsUnpairedClosingQuote() quotationContinuerState ); - var britishEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("british_english")); + QuoteConvention britishEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("british_english") + ); 
Assert.IsNotNull(britishEnglishQuoteConvention); var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([britishEnglishQuoteConvention]) @@ -2146,7 +2178,9 @@ public void IsUnpairedClosingQuote() quotationContinuerState ); - var standardSwedishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish")); + QuoteConvention standardSwedishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish") + ); Assert.IsNotNull(standardSwedishQuoteConvention); var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardSwedishQuoteConvention]) @@ -2395,7 +2429,9 @@ public void IsUnpairedClosingQuote() [Test] public void IsApostrophe() { - var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) @@ -2408,7 +2444,7 @@ public void IsApostrophe() quotationContinuerState ); - var typewriterEnglishQuoteConvention = ( + QuoteConvention typewriterEnglishQuoteConvention = ( QuoteConventions.Standard.GetQuoteConventionByName("typewriter_english") ); Assert.IsNotNull(typewriterEnglishQuoteConvention); @@ -2661,7 +2697,9 @@ public void IsApostrophe() [Test] public void DepthBasedQuotationMarkResolverReset() { - var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new 
QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) @@ -2705,14 +2743,16 @@ [new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201cThis is a [Test] public void BasicQuotationMarkRecognition() { - var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) ); var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); - var textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote\u2019\u201d").Build(); + TextSegment textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote\u2019\u201d").Build(); Assert.That( standardEnglishQuotationMarkResolver .ResolveQuotationMarks( @@ -2738,14 +2778,16 @@ public void BasicQuotationMarkRecognition() [Test] public void ResolutionOnlyOfPassedMatches() { - var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) ); var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); - var textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote\u2019\u201d").Build(); + TextSegment textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote\u2019\u201d").Build(); 
Assert.That( standardEnglishQuotationMarkResolver .ResolveQuotationMarks([new QuotationMarkStringMatch(textSegment, 0, 1),]) @@ -2776,15 +2818,17 @@ [new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegm [Test] public void ResolutionAcrossSegments() { - var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) ); var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); - var textSegment1 = new TextSegment.Builder().SetText("\u201cThis is a ").Build(); - var textSegment2 = new TextSegment.Builder().SetText("\u2018quote\u2019\u201d").Build(); + TextSegment textSegment1 = new TextSegment.Builder().SetText("\u201cThis is a ").Build(); + TextSegment textSegment2 = new TextSegment.Builder().SetText("\u2018quote\u2019\u201d").Build(); Assert.That( standardEnglishQuotationMarkResolver .ResolveQuotationMarks( @@ -2810,14 +2854,16 @@ public void ResolutionAcrossSegments() [Test] public void ResolutionWithApostrophes() { - var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) ); var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); - var textSegment = ( + TextSegment textSegment = ( new 
TextSegment.Builder() .SetText("\u201cThis\u2019 is a \u2018quote\u2019\u201d") .AddPrecedingMarker(UsfmMarkerType.Paragraph) @@ -2845,7 +2891,7 @@ public void ResolutionWithApostrophes() ); Assert.That(standardEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); - var typewriterEnglishQuoteConvention = ( + QuoteConvention typewriterEnglishQuoteConvention = ( QuoteConventions.Standard.GetQuoteConventionByName("typewriter_english") ); Assert.IsNotNull(typewriterEnglishQuoteConvention); @@ -2886,15 +2932,17 @@ public void ResolutionWithApostrophes() [Test] public void EnglishQuoteContinuers() { - var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) ); var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); - var textSegment1 = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote").Build(); - var textSegment2 = ( + TextSegment textSegment1 = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote").Build(); + TextSegment textSegment2 = ( new TextSegment.Builder() .SetText("\u201c\u2018This is the rest\u2019 of it\u201d") .AddPrecedingMarker(UsfmMarkerType.Paragraph) @@ -2929,15 +2977,17 @@ public void EnglishQuoteContinuers() [Test] public void SpanishQuoteContinuers() { - var westernEuropeanQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("western_european")); + QuoteConvention westernEuropeanQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("western_european") + ); Assert.IsNotNull(westernEuropeanQuoteConvention); var westernEuropeanResolverSettings = new 
QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([westernEuropeanQuoteConvention]) ); var westernEuropeanQuotationMarkResolver = new DepthBasedQuotationMarkResolver(westernEuropeanResolverSettings); - var textSegment1 = new TextSegment.Builder().SetText("\u00abThis is a \u201cquote").Build(); - var textSegment2 = ( + TextSegment textSegment1 = new TextSegment.Builder().SetText("\u00abThis is a \u201cquote").Build(); + TextSegment textSegment2 = ( new TextSegment.Builder() .SetText("\u00bb\u201dThis is the rest\u201d of it\u00bb") .AddPrecedingMarker(UsfmMarkerType.Paragraph) @@ -2972,15 +3022,17 @@ public void SpanishQuoteContinuers() [Test] public void MalformedQuotationMarks() { - var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) ); var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); - var textSegment1 = new TextSegment.Builder().SetText("\u201c This is a,\u2018 quote").Build(); - var textSegment2 = ( + TextSegment textSegment1 = new TextSegment.Builder().SetText("\u201c This is a,\u2018 quote").Build(); + TextSegment textSegment2 = ( new TextSegment.Builder() .SetText("This is the rest \u2019 of it \u201d") .AddPrecedingMarker(UsfmMarkerType.Paragraph) @@ -3011,14 +3063,16 @@ public void MalformedQuotationMarks() [Test] public void UnpairedQuotationMarkIssue() { - var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); 
Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) ); var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); - var textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote\u2019").Build(); + TextSegment textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote\u2019").Build(); Assert.That( standardEnglishQuotationMarkResolver .ResolveQuotationMarks( @@ -3060,14 +3114,16 @@ [new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegm [Test] public void TooDeepNestingIssue() { - var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardEnglishQuoteConvention]) ); var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); - var textSegment = new TextSegment.Builder() + TextSegment textSegment = new TextSegment.Builder() .SetText("\u201cThis \u2018is \u201ca \u2018quote \u201cnested too deeply") .Build(); Assert.That( @@ -3102,14 +3158,16 @@ public void TooDeepNestingIssue() [Test] public void IncompatibleQuotationMarkIssue() { - var standardEnglishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_english")); + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); Assert.IsNotNull(standardEnglishQuoteConvention); var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( new 
QuoteConventionSet([standardEnglishQuoteConvention]) ); var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); - var textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u201cquote\u201d\u201d").Build(); + TextSegment textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u201cquote\u201d\u201d").Build(); Assert.That( standardEnglishQuotationMarkResolver .ResolveQuotationMarks( @@ -3139,7 +3197,7 @@ public void IncompatibleQuotationMarkIssue() [Test] public void AmbiguousQuotationMarkIssue() { - var typewriterEnglishQuoteConvention = ( + QuoteConvention typewriterEnglishQuoteConvention = ( QuoteConventions.Standard.GetQuoteConventionByName("typewriter_english") ); Assert.IsNotNull(typewriterEnglishQuoteConvention); @@ -3150,7 +3208,7 @@ public void AmbiguousQuotationMarkIssue() typewriterEnglishResolverSettings ); - var textSegment = new TextSegment.Builder().SetText("This\"is an ambiguous quotation mark").Build(); + TextSegment textSegment = new TextSegment.Builder().SetText("This\"is an ambiguous quotation mark").Build(); Assert.That( typewriterEnglishQuotationMarkResolver .ResolveQuotationMarks([new QuotationMarkStringMatch(textSegment, 4, 5),]) @@ -3181,7 +3239,7 @@ public void AmbiguousQuotationMarkIssue() [Test] public void TypewriterEnglishQuotationMarkRecognition() { - var typewriterEnglishQuoteConvention = ( + QuoteConvention typewriterEnglishQuoteConvention = ( QuoteConventions.Standard.GetQuoteConventionByName("typewriter_english") ); Assert.IsNotNull(typewriterEnglishQuoteConvention); @@ -3192,7 +3250,7 @@ public void TypewriterEnglishQuotationMarkRecognition() typewriterEnglishResolverSettings ); - var textSegment = ( + TextSegment textSegment = ( new TextSegment.Builder() .SetText("\"This is a 'quote'\"") .AddPrecedingMarker(UsfmMarkerType.Paragraph) @@ -3223,7 +3281,9 @@ public void TypewriterEnglishQuotationMarkRecognition() [Test] public void 
TypewriterFrenchMarkRecognition() { - var typewriterFrenchQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("typewriter_french")); + QuoteConvention typewriterFrenchQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("typewriter_french") + ); Assert.IsNotNull(typewriterFrenchQuoteConvention); var typewriterFrenchResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([typewriterFrenchQuoteConvention]) @@ -3232,7 +3292,7 @@ public void TypewriterFrenchMarkRecognition() typewriterFrenchResolverSettings ); - var textSegment = new TextSegment.Builder().SetText("<>>").Build(); + TextSegment textSegment = new TextSegment.Builder().SetText("<>>").Build(); Assert.That( typewriterFrenchQuotationMarkResolver .ResolveQuotationMarks( @@ -3258,14 +3318,16 @@ public void TypewriterFrenchMarkRecognition() [Test] public void CentralEuropeanQuotationMarkRecognition() { - var centralEuropeanQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("central_european")); + QuoteConvention centralEuropeanQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("central_european") + ); Assert.IsNotNull(centralEuropeanQuoteConvention); var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([centralEuropeanQuoteConvention]) ); var centralEuropeanQuotationMarkResolver = new DepthBasedQuotationMarkResolver(centralEuropeanResolverSettings); - var textSegment = ( + TextSegment textSegment = ( new TextSegment.Builder() .SetText("\u201eThis is a \u201aquote\u2018\u201c") .AddPrecedingMarker(UsfmMarkerType.Paragraph) @@ -3296,14 +3358,16 @@ public void CentralEuropeanQuotationMarkRecognition() [Test] public void StandardSwedishQuotationMarkRecognition() { - var standardSwedishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish")); + QuoteConvention standardSwedishQuoteConvention = ( + 
QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish") + ); Assert.IsNotNull(standardSwedishQuoteConvention); var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet([standardSwedishQuoteConvention]) ); var standardSwedishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardSwedishResolverSettings); - var textSegment = ( + TextSegment textSegment = ( new TextSegment.Builder() .SetText("\u201dThis is a \u2019quote\u2019\u201d") .AddPrecedingMarker(UsfmMarkerType.Paragraph) @@ -3334,14 +3398,20 @@ public void StandardSwedishQuotationMarkRecognition() [Test] public void MultipleConventionsQuotationMarkRecognition() { - var typewriterFrenchQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("typewriter_french"); + QuoteConvention typewriterFrenchQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + "typewriter_french" + ); Assert.IsNotNull(typewriterFrenchQuoteConvention); - var centralEuropeanQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("central_european")); + QuoteConvention centralEuropeanQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("central_european") + ); Assert.IsNotNull(centralEuropeanQuoteConvention); - var standardSwedishQuoteConvention = (QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish")); + QuoteConvention standardSwedishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish") + ); Assert.IsNotNull(standardSwedishQuoteConvention); var multipleConventionsResolverSettings = new QuoteConventionDetectionResolutionSettings( new QuoteConventionSet( @@ -3352,7 +3422,7 @@ public void MultipleConventionsQuotationMarkRecognition() multipleConventionsResolverSettings ); - var textSegment = ( + TextSegment textSegment = ( new TextSegment.Builder() .SetText("\u201eThis is a \u2019quote>\u201c") .AddPrecedingMarker(UsfmMarkerType.Paragraph) diff 
--git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs index fc75c264..8b34a377 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs @@ -10,7 +10,7 @@ public class QuotationConventionDetectorTests [Test] public void StandardEnglish() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -18,7 +18,7 @@ of the field which Yahweh God had made. He said to the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?” "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_english")); } @@ -26,7 +26,7 @@ of the field which Yahweh God had made. [Test] public void TypewriterEnglish() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -34,7 +34,7 @@ of the field which Yahweh God had made. He said to the woman, ""Has God really said, 'You shall not eat of any tree of the garden'?\"" "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("typewriter_english")); } @@ -42,7 +42,7 @@ of the field which Yahweh God had made. [Test] public void BritishEnglish() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -50,7 +50,7 @@ of the field which Yahweh God had made. 
He said to the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?’ "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("british_english")); } @@ -58,7 +58,7 @@ of the field which Yahweh God had made. [Test] public void BritishTypewriterEnglish() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -66,7 +66,7 @@ of the field which Yahweh God had made. He said to the woman, 'Has God really said, ""You shall not eat of any tree of the garden""?' "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("british_typewriter_english")); } @@ -74,7 +74,7 @@ of the field which Yahweh God had made. [Test] public void HybridTypewriterEnglish() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -82,7 +82,7 @@ of the field which Yahweh God had made. He said to the woman, “Has God really said, 'You shall not eat of any tree of the garden'?” "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("hybrid_typewriter_english")); } @@ -90,7 +90,7 @@ of the field which Yahweh God had made. [Test] public void StandardFrench() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -98,7 +98,7 @@ of the field which Yahweh God had made. 
He said to the woman, «Has God really said, ‹You shall not eat of any tree of the garden›?» "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_french")); } @@ -106,7 +106,7 @@ of the field which Yahweh God had made. [Test] public void TypewriterFrench() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -114,7 +114,7 @@ of the field which Yahweh God had made. He said to the woman, <<Has God really said, <You shall not eat of any tree of the garden>?>> "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("typewriter_french")); } @@ -123,7 +123,7 @@ of the field which Yahweh God had made. [Test] public void WesternEuropean() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -131,7 +131,7 @@ of the field which Yahweh God had made. He said to the woman, «Has God really said, “You shall not eat of any tree of the garden”?» "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("western_european")); } @@ -139,7 +139,7 @@ of the field which Yahweh God had made. [Test] public void BritishInspiredWesternEuropean() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -147,7 +147,7 @@ of the field which Yahweh God had made. 
He said to the woman, «Has God really said, ‘You shall not eat of any tree of the garden’?» "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("british_inspired_western_european")); } @@ -155,7 +155,7 @@ of the field which Yahweh God had made. [Test] public void TypewriterWesternEuropean() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -163,7 +163,7 @@ of the field which Yahweh God had made. He said to the woman, <<Has God really said, ""You shall not eat of any tree of the garden""?>> "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("typewriter_western_european")); } @@ -171,7 +171,7 @@ of the field which Yahweh God had made. [Test] public void TypewriterWesternEuropeanVariant() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -179,7 +179,7 @@ of the field which Yahweh God had made. He said to the woman, ""Has God really said, <You shall not eat of any tree of the garden>?"" "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("typewriter_western_european_variant")); } @@ -187,7 +187,7 @@ of the field which Yahweh God had made. [Test] public void HybridTypewriterWesternEuropean() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -195,7 +195,7 @@ of the field which Yahweh God had made. 
He said to the woman, «Has God really said, ""You shall not eat of any tree of the garden""?» "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("hybrid_typewriter_western_european")); } @@ -203,7 +203,7 @@ of the field which Yahweh God had made. [Test] public void HybridBritishTypewriterWesternEuropean() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -211,7 +211,7 @@ of the field which Yahweh God had made. He said to the woman, «Has God really said, 'You shall not eat of any tree of the garden'?» "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("hybrid_british_typewriter_western_european")); } @@ -219,7 +219,7 @@ of the field which Yahweh God had made. [Test] public void CentralEuropean() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -227,7 +227,7 @@ of the field which Yahweh God had made. He said to the woman, „Has God really said, ‚You shall not eat of any tree of the garden‘?“ "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("central_european")); } @@ -235,7 +235,7 @@ of the field which Yahweh God had made. [Test] public void CentralEuropeanGuillemets() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -243,7 +243,7 @@ of the field which Yahweh God had made. 
He said to the woman, »Has God really said, ›You shall not eat of any tree of the garden‹?« "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("central_european_guillemets")); } @@ -251,7 +251,7 @@ of the field which Yahweh God had made. [Test] public void StandardSwedish() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -259,7 +259,7 @@ of the field which Yahweh God had made. He said to the woman, ”Has God really said, ’You shall not eat of any tree of the garden’?” "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_swedish")); } @@ -267,7 +267,7 @@ of the field which Yahweh God had made. [Test] public void StandardFinnish() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -275,7 +275,7 @@ of the field which Yahweh God had made. He said to the woman, »Has God really said, ’You shall not eat of any tree of the garden’?» "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_finnish")); } @@ -283,7 +283,7 @@ of the field which Yahweh God had made. [Test] public void EasternEuropean() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -291,7 +291,7 @@ of the field which Yahweh God had made. 
He said to the woman, „Has God really said, ‚You shall not eat of any tree of the garden’?” "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("eastern_european")); } @@ -299,7 +299,7 @@ of the field which Yahweh God had made. [Test] public void StandardRussian() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -307,7 +307,7 @@ of the field which Yahweh God had made. He said to the woman, «Has God really said, „You shall not eat of any tree of the garden“?» "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_russian")); } @@ -315,7 +315,7 @@ of the field which Yahweh God had made. [Test] public void StandardArabic() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -323,7 +323,7 @@ of the field which Yahweh God had made. He said to the woman, ”Has God really said, ’You shall not eat of any tree of the garden‘?“ "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_arabic")); } @@ -331,7 +331,7 @@ of the field which Yahweh God had made. [Test] public void NonStandardArabic() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -339,7 +339,7 @@ of the field which Yahweh God had made. 
He said to the woman, «Has God really said, ’You shall not eat of any tree of the garden‘?» "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("non-standard_arabic")); } @@ -347,7 +347,7 @@ of the field which Yahweh God had made. [Test] public void MismatchedQuotationMarks() { - var usfm = + string usfm = @" \c 1 \v 1 Now the serpent was more subtle than any animal @@ -359,7 +359,7 @@ of the field which Yahweh God had made. \\v 3 but not the fruit of the tree which is in the middle of the garden. God has said, ‘You shall not eat of it. You shall not touch it, lest you die.’ "; - var analysis = DetectQuotationConvention(usfm); + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); Assert.IsNotNull(analysis); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_english")); } diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs index b1b3f407..0c70b746 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs @@ -287,7 +287,9 @@ public void ThatAllPossibleQuotationMarksAreIdentified() [Test] public void ThatItUsesTheQuoteConventionSet() { - var standardEnglishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + QuoteConvention standardEnglishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + "standard_english" + ); Assert.IsNotNull(standardEnglishQuoteConvention); var englishQuotationMarkFinder = new QuotationMarkFinder( @@ -304,7 +306,9 @@ public void ThatItUsesTheQuoteConventionSet() Has.Count.EqualTo(0) ); - var typewriterEnglishQuoteConvention = 
QuoteConventions.Standard.GetQuoteConventionByName("typewriter_english"); + QuoteConvention typewriterEnglishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + "typewriter_english" + ); Assert.IsNotNull(typewriterEnglishQuoteConvention); var typewriterEnglishQuotationMarkFinder = new QuotationMarkFinder( @@ -330,7 +334,9 @@ public void ThatItUsesTheQuoteConventionSet() ) ); - var westernEuropeanQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("western_european"); + QuoteConvention westernEuropeanQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + "western_european" + ); Assert.IsNotNull(westernEuropeanQuoteConvention); var westernEuropeanQuotationMarkFinder = new QuotationMarkFinder( @@ -356,7 +362,7 @@ public void ThatItUsesTheQuoteConventionSet() ) ); - var typewriterWesternEuropeanQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + QuoteConvention typewriterWesternEuropeanQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( "typewriter_western_european" ); Assert.IsNotNull(typewriterWesternEuropeanQuoteConvention); @@ -391,7 +397,9 @@ public void ThatItUsesTheQuoteConventionSet() ) ); - var centralEuropeanQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("central_european"); + QuoteConvention centralEuropeanQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + "central_european" + ); Assert.IsNotNull(centralEuropeanQuoteConvention); var centralEuropeanQuotationMarkFinder = new QuotationMarkFinder( diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs index 26fc1c37..cac692e7 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs @@ -313,7 +313,7 @@ public void GetContext() [Test] public void Resolve() 
{ - var textSegment = new TextSegment.Builder().SetText("'").Build(); + TextSegment textSegment = new TextSegment.Builder().SetText("'").Build(); var quotationMarkStringMatch = new QuotationMarkStringMatch(textSegment, 0, 1); Assert.That( quotationMarkStringMatch.Resolve(2, QuotationMarkDirection.Opening), diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionSetTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionSetTests.cs index ed25d4e2..305282c6 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionSetTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionSetTests.cs @@ -1711,7 +1711,7 @@ public void FindMostSimilarConvention() ), ] ); - var (convention, similarity) = allThreeQuoteConventionSet.FindMostSimilarConvention( + (QuoteConvention convention, double similarity) = allThreeQuoteConventionSet.FindMostSimilarConvention( noisyMultipleEnglishQuotesTabulator ); Assert.That(convention, Is.EqualTo(standardEnglishQuoteConvention)); diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionTests.cs index e7953468..0275abfa 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionTests.cs @@ -9,82 +9,89 @@ public class QuoteConventionTests public void SingleLevelQuoteConventionNormalize() { var englishLevel1QuoteConvention = new SingleLevelQuoteConvention("\u201c", "\u201d"); - var normalizedEnglishLevel1QuoteConvention = englishLevel1QuoteConvention.Normalize(); + SingleLevelQuoteConvention normalizedEnglishLevel1QuoteConvention = englishLevel1QuoteConvention.Normalize(); Assert.That(normalizedEnglishLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); Assert.That(normalizedEnglishLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); var englishLevel2QuoteConvention = new 
SingleLevelQuoteConvention("\u2018", "\u2019"); - var normalizedEnglishLevel2QuoteConvention = englishLevel2QuoteConvention.Normalize(); + SingleLevelQuoteConvention normalizedEnglishLevel2QuoteConvention = englishLevel2QuoteConvention.Normalize(); Assert.That(normalizedEnglishLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("'")); Assert.That(normalizedEnglishLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("'")); var alreadyNormalizedEnglishLevel1QuoteConvention = new SingleLevelQuoteConvention("\"", "\""); - var doublyNormalizedEnglishLevel1QuoteConvention = alreadyNormalizedEnglishLevel1QuoteConvention.Normalize(); + SingleLevelQuoteConvention doublyNormalizedEnglishLevel1QuoteConvention = + alreadyNormalizedEnglishLevel1QuoteConvention.Normalize(); Assert.That(doublyNormalizedEnglishLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); Assert.That(doublyNormalizedEnglishLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); var alreadyNormalizedEnglishLevel2QuoteConvention = new SingleLevelQuoteConvention("'", "'"); - var doublyNormalizedEnglishLevel2QuoteConvention = alreadyNormalizedEnglishLevel2QuoteConvention.Normalize(); + SingleLevelQuoteConvention doublyNormalizedEnglishLevel2QuoteConvention = + alreadyNormalizedEnglishLevel2QuoteConvention.Normalize(); Assert.That(doublyNormalizedEnglishLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("'")); Assert.That(doublyNormalizedEnglishLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("'")); var frenchLevel1QuoteConvention = new SingleLevelQuoteConvention("\u00ab", "\u00bb"); - var normalizedFrenchLevel1QuoteConvention = frenchLevel1QuoteConvention.Normalize(); + SingleLevelQuoteConvention normalizedFrenchLevel1QuoteConvention = frenchLevel1QuoteConvention.Normalize(); Assert.That(normalizedFrenchLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); Assert.That(normalizedFrenchLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); var 
frenchLevel2QuoteConvention = new SingleLevelQuoteConvention("\u2039", "\u203a"); - var normalizedFrenchLevel2QuoteConvention = frenchLevel2QuoteConvention.Normalize(); + SingleLevelQuoteConvention normalizedFrenchLevel2QuoteConvention = frenchLevel2QuoteConvention.Normalize(); Assert.That(normalizedFrenchLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("\u2039")); Assert.That(normalizedFrenchLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("\u203a")); var typewriterFrenchLevel1QuoteConvention = new SingleLevelQuoteConvention("<<", ">>"); - var normalizedTypewriterFrenchLevel1QuoteConvention = typewriterFrenchLevel1QuoteConvention.Normalize(); + SingleLevelQuoteConvention normalizedTypewriterFrenchLevel1QuoteConvention = + typewriterFrenchLevel1QuoteConvention.Normalize(); Assert.That(normalizedTypewriterFrenchLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("<<")); Assert.That(normalizedTypewriterFrenchLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo(">>")); var typewriterFrenchLevel2QuoteConvention = new SingleLevelQuoteConvention("<", ">"); - var normalizedTypewriterFrenchLevel2QuoteConvention = typewriterFrenchLevel2QuoteConvention.Normalize(); + SingleLevelQuoteConvention normalizedTypewriterFrenchLevel2QuoteConvention = + typewriterFrenchLevel2QuoteConvention.Normalize(); Assert.That(normalizedTypewriterFrenchLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("<")); Assert.That(normalizedTypewriterFrenchLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo(">")); var centralEuropeanLevel1QuoteConvention = new SingleLevelQuoteConvention("\u201e", "\u201c"); - var normalizedCentralEuropeanLevel1QuoteConvention = centralEuropeanLevel1QuoteConvention.Normalize(); + SingleLevelQuoteConvention normalizedCentralEuropeanLevel1QuoteConvention = + centralEuropeanLevel1QuoteConvention.Normalize(); Assert.That(normalizedCentralEuropeanLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); 
Assert.That(normalizedCentralEuropeanLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); var centralEuropeanLevel2QuoteConvention = new SingleLevelQuoteConvention("\u201a", "\u2018"); - var normalizedCentralEuropeanLevel2QuoteConvention = centralEuropeanLevel2QuoteConvention.Normalize(); + SingleLevelQuoteConvention normalizedCentralEuropeanLevel2QuoteConvention = + centralEuropeanLevel2QuoteConvention.Normalize(); Assert.That(normalizedCentralEuropeanLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("'")); Assert.That(normalizedCentralEuropeanLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("'")); var centralEuropeanGuillemetsQuoteConvention = new SingleLevelQuoteConvention("\u00bb", "\u00ab"); - var normalizedCentralEuropeanGuillemetsQuoteConvention = centralEuropeanGuillemetsQuoteConvention.Normalize(); + SingleLevelQuoteConvention normalizedCentralEuropeanGuillemetsQuoteConvention = + centralEuropeanGuillemetsQuoteConvention.Normalize(); Assert.That(normalizedCentralEuropeanGuillemetsQuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); Assert.That(normalizedCentralEuropeanGuillemetsQuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); var swedishLevel1QuoteConvention = new SingleLevelQuoteConvention("\u201d", "\u201d"); - var normalizedSwedishLevel1QuoteConvention = swedishLevel1QuoteConvention.Normalize(); + SingleLevelQuoteConvention normalizedSwedishLevel1QuoteConvention = swedishLevel1QuoteConvention.Normalize(); Assert.That(normalizedSwedishLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); Assert.That(normalizedSwedishLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); var swedishLevel2QuoteConvention = new SingleLevelQuoteConvention("\u2019", "\u2019"); - var normalizedSwedishLevel2QuoteConvention = swedishLevel2QuoteConvention.Normalize(); + SingleLevelQuoteConvention normalizedSwedishLevel2QuoteConvention = swedishLevel2QuoteConvention.Normalize(); 
Assert.That(normalizedSwedishLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("'")); Assert.That(normalizedSwedishLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("'")); var finnishLevel1QuoteConvention = new SingleLevelQuoteConvention("\u00bb", "\u00bb"); - var normalizedFinnishLevel1QuoteConvention = finnishLevel1QuoteConvention.Normalize(); + SingleLevelQuoteConvention normalizedFinnishLevel1QuoteConvention = finnishLevel1QuoteConvention.Normalize(); Assert.That(normalizedFinnishLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); Assert.That(normalizedFinnishLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); var arabicLevel1QuoteConvention = new SingleLevelQuoteConvention("\u201d", "\u201c"); - var normalizedArabicLevel1QuoteConvention = arabicLevel1QuoteConvention.Normalize(); + SingleLevelQuoteConvention normalizedArabicLevel1QuoteConvention = arabicLevel1QuoteConvention.Normalize(); Assert.That(normalizedArabicLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); Assert.That(normalizedArabicLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); var arabicLevel2QuoteConvention = new SingleLevelQuoteConvention("\u2019", "\u2018"); - var normalizedArabicLevel2QuoteConvention = arabicLevel2QuoteConvention.Normalize(); + SingleLevelQuoteConvention normalizedArabicLevel2QuoteConvention = arabicLevel2QuoteConvention.Normalize(); Assert.That(normalizedArabicLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("'")); Assert.That(normalizedArabicLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("'")); } @@ -331,7 +338,7 @@ public void IsCompatibleWithObservedQuotationMarks() public void Normalize() { var emptyQuoteConvention = new QuoteConvention("empty_quote_convention", []); - var normalizedEmptyQuoteConvention = emptyQuoteConvention.Normalize(); + QuoteConvention normalizedEmptyQuoteConvention = emptyQuoteConvention.Normalize(); Assert.That(normalizedEmptyQuoteConvention.Name, 
Is.EqualTo("empty_quote_convention_normalized")); Assert.That(normalizedEmptyQuoteConvention.NumLevels, Is.EqualTo(0)); @@ -344,7 +351,7 @@ public void Normalize() new SingleLevelQuoteConvention("\u2018", "\u2019"), ] ); - var normalizedStandardEnglishQuoteConvention = standardEnglishQuoteConvention.Normalize(); + QuoteConvention normalizedStandardEnglishQuoteConvention = standardEnglishQuoteConvention.Normalize(); Assert.That( normalizedStandardEnglishQuoteConvention.Name, Is.EqualTo("standard_english_quote_convention_normalized") @@ -367,7 +374,7 @@ public void Normalize() new SingleLevelQuoteConvention("\u2018", "\u2019"), ] ); - var normalizedWesternEuropeanQuoteConvention = westernEuropeanQuoteConvention.Normalize(); + QuoteConvention normalizedWesternEuropeanQuoteConvention = westernEuropeanQuoteConvention.Normalize(); Assert.That(normalizedWesternEuropeanQuoteConvention.Name, Is.EqualTo("test_quote_convention_normalized")); Assert.That(normalizedWesternEuropeanQuoteConvention.NumLevels, Is.EqualTo(3)); Assert.That(normalizedWesternEuropeanQuoteConvention.GetOpeningQuotationMarkAtDepth(1), Is.EqualTo("\"")); @@ -386,7 +393,7 @@ public void Normalize() ] ); - var normalizedHybridBritishTypewriterEnglishQuoteConvention = ( + QuoteConvention normalizedHybridBritishTypewriterEnglishQuoteConvention = ( hybridBritishTypewriterEnglishQuoteConvention.Normalize() ); Assert.IsTrue( diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs index 967c0f3b..2870d81e 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs @@ -10,7 +10,7 @@ public class TextSegmentTests public void BuilderInitialization() { var builder = new TextSegment.Builder(); - var textSegment = builder.Build(); + TextSegment textSegment = builder.Build(); Assert.That(textSegment.Text, Is.EqualTo("")); 
Assert.IsNull(textSegment.PreviousSegment); @@ -26,7 +26,7 @@ public void BuilderInitialization() public void BuilderSetText() { var builder = new TextSegment.Builder(); - var text = "Example text"; + string text = "Example text"; builder.SetText(text); Assert.That(builder.Build().Text, Is.EqualTo(text)); @@ -36,9 +36,9 @@ public void BuilderSetText() public void BuilderSetPreviousSegment() { var builder = new TextSegment.Builder(); - var previousSegment = new TextSegment.Builder().SetText("previous segment text").Build(); + TextSegment previousSegment = new TextSegment.Builder().SetText("previous segment text").Build(); builder.SetPreviousSegment(previousSegment); - var textSegment = builder.Build(); + TextSegment textSegment = builder.Build(); Assert.That(textSegment.PreviousSegment, Is.EqualTo(previousSegment)); Assert.IsNull(textSegment.NextSegment); @@ -53,7 +53,7 @@ public void BuilderAddPrecedingMarker() { var builder = new TextSegment.Builder(); builder.AddPrecedingMarker(UsfmMarkerType.Chapter); - var textSegment = builder.Build(); + TextSegment textSegment = builder.Build(); Assert.IsTrue(textSegment.ImmediatePrecedingMarker is UsfmMarkerType.Chapter); Assert.That(textSegment.MarkersInPrecedingContext.SequenceEqual([UsfmMarkerType.Chapter])); @@ -76,7 +76,7 @@ public void BuilderSetUsfmToken() { var builder = new TextSegment.Builder(); builder.SetUsfmToken(new UsfmToken("USFM token text")); - var textSegment = builder.Build(); + TextSegment textSegment = builder.Build(); Assert.IsNotNull(textSegment.UsfmToken); Assert.That(textSegment.UsfmToken.Type, Is.EqualTo(UsfmTokenType.Text)); @@ -89,9 +89,9 @@ public void BuilderSetUsfmToken() [Test] public void Equals() { - var basicSegment = new TextSegment.Builder().SetText("text1").Build(); - var sameTextSegment = new TextSegment.Builder().SetText("text1").Build(); - var differentTextSegment = new TextSegment.Builder().SetText("different text").Build(); + TextSegment basicSegment = new 
TextSegment.Builder().SetText("text1").Build(); + TextSegment sameTextSegment = new TextSegment.Builder().SetText("text1").Build(); + TextSegment differentTextSegment = new TextSegment.Builder().SetText("different text").Build(); #pragma warning disable NUnit2009 // The same value has been provided as both the actual and the expected argument Assert.That(basicSegment, Is.EqualTo(basicSegment)); #pragma warning restore NUnit2009 // The same value has been provided as both the actual and the expected argument @@ -101,27 +101,27 @@ public void Equals() Assert.That(basicSegment, Is.EqualTo(sameTextSegment)); Assert.That(basicSegment, Is.Not.EqualTo(differentTextSegment)); - var segmentWithIndex = new TextSegment.Builder().SetText("text1").Build(); + TextSegment segmentWithIndex = new TextSegment.Builder().SetText("text1").Build(); segmentWithIndex.IndexInVerse = 1; - var segmentWithSameIndex = new TextSegment.Builder().SetText("text1").Build(); + TextSegment segmentWithSameIndex = new TextSegment.Builder().SetText("text1").Build(); segmentWithSameIndex.IndexInVerse = 1; - var segmentWithDifferentIndex = new TextSegment.Builder().SetText("text1").Build(); + TextSegment segmentWithDifferentIndex = new TextSegment.Builder().SetText("text1").Build(); segmentWithDifferentIndex.IndexInVerse = 2; Assert.That(segmentWithIndex, Is.EqualTo(segmentWithSameIndex)); Assert.That(segmentWithIndex, Is.Not.EqualTo(segmentWithDifferentIndex)); Assert.That(segmentWithIndex, Is.Not.EqualTo(basicSegment)); - var segmentWithPrecedingMarker = ( + TextSegment segmentWithPrecedingMarker = ( new TextSegment.Builder().SetText("text1").AddPrecedingMarker(UsfmMarkerType.Verse).Build() ); - var segmentWithSamePrecedingMarker = ( + TextSegment segmentWithSamePrecedingMarker = ( new TextSegment.Builder().SetText("text1").AddPrecedingMarker(UsfmMarkerType.Verse).Build() ); - var segmentWithDifferentPrecedingMarker = ( + TextSegment segmentWithDifferentPrecedingMarker = ( new 
TextSegment.Builder().SetText("text1").AddPrecedingMarker(UsfmMarkerType.Chapter).Build() ); - var segmentWithMultiplePrecedingMarkers = ( + TextSegment segmentWithMultiplePrecedingMarkers = ( new TextSegment.Builder() .SetText("text1") .AddPrecedingMarker(UsfmMarkerType.Chapter) @@ -130,9 +130,12 @@ public void Equals() ); var usfmToken = new UsfmToken("USFM token text"); - var segmentWithUsfmToken = new TextSegment.Builder().SetText("text1").SetUsfmToken(usfmToken).Build(); - var segmentWithSameUsfmToken = new TextSegment.Builder().SetText("text1").SetUsfmToken(usfmToken).Build(); - var segmentWithDifferentUsfmToken = ( + TextSegment segmentWithUsfmToken = new TextSegment.Builder().SetText("text1").SetUsfmToken(usfmToken).Build(); + TextSegment segmentWithSameUsfmToken = new TextSegment.Builder() + .SetText("text1") + .SetUsfmToken(usfmToken) + .Build(); + TextSegment segmentWithDifferentUsfmToken = ( new TextSegment.Builder().SetText("text1").SetUsfmToken(new UsfmToken("Different USFM token text")).Build() ); @@ -141,11 +144,11 @@ public void Equals() Assert.IsTrue(basicSegment != segmentWithUsfmToken); // attributes that are not used in equality checks - var segmentWithNumVerses = new TextSegment.Builder().SetText("text1").Build(); + TextSegment segmentWithNumVerses = new TextSegment.Builder().SetText("text1").Build(); segmentWithNumVerses.NumSegmentsInVerse = 3; - var segmentWithSameNumVerses = new TextSegment.Builder().SetText("text1").Build(); + TextSegment segmentWithSameNumVerses = new TextSegment.Builder().SetText("text1").Build(); segmentWithSameNumVerses.NumSegmentsInVerse = 3; - var segmentWithDifferentNumVerses = new TextSegment.Builder().SetText("text1").Build(); + TextSegment segmentWithDifferentNumVerses = new TextSegment.Builder().SetText("text1").Build(); segmentWithDifferentNumVerses.NumSegmentsInVerse = 4; Assert.That(segmentWithNumVerses, Is.EqualTo(segmentWithSameNumVerses)); @@ -157,10 +160,10 @@ public void Equals() 
Assert.That(segmentWithPrecedingMarker, Is.EqualTo(segmentWithMultiplePrecedingMarkers)); Assert.That(segmentWithPrecedingMarker, Is.Not.EqualTo(basicSegment)); - var segmentWithPreviousSegment = new TextSegment.Builder().SetText("text1").Build(); + TextSegment segmentWithPreviousSegment = new TextSegment.Builder().SetText("text1").Build(); segmentWithPreviousSegment.PreviousSegment = segmentWithNumVerses; - var segmentWithNextSegment = new TextSegment.Builder().SetText("text1").Build(); + TextSegment segmentWithNextSegment = new TextSegment.Builder().SetText("text1").Build(); segmentWithNextSegment.NextSegment = segmentWithNumVerses; Assert.That(basicSegment, Is.EqualTo(segmentWithPreviousSegment)); @@ -170,7 +173,7 @@ public void Equals() [Test] public void GetText() { - var textSegment = new TextSegment.Builder().SetText("example text").Build(); + TextSegment textSegment = new TextSegment.Builder().SetText("example text").Build(); Assert.That(textSegment.Text, Is.EqualTo("example text")); textSegment = new TextSegment.Builder().SetText("new example text").Build(); @@ -180,7 +183,7 @@ public void GetText() [Test] public void Length() { - var textSegment = new TextSegment.Builder().SetText("example text").Build(); + TextSegment textSegment = new TextSegment.Builder().SetText("example text").Build(); Assert.That(textSegment.Length, Is.EqualTo("example text".Length)); textSegment = new TextSegment.Builder().SetText("new example text").Build(); @@ -190,7 +193,7 @@ public void Length() [Test] public void SubstringBefore() { - var textSegment = new TextSegment.Builder().SetText("example text").Build(); + TextSegment textSegment = new TextSegment.Builder().SetText("example text").Build(); Assert.That(textSegment.SubstringBefore(7), Is.EqualTo("example")); Assert.That(textSegment.SubstringBefore(8), Is.EqualTo("example ")); Assert.That(textSegment.SubstringBefore(0), Is.EqualTo("")); @@ -200,7 +203,7 @@ public void SubstringBefore() [Test] public void SubstringAfter() { 
- var textSegment = new TextSegment.Builder().SetText("example text").Build(); + TextSegment textSegment = new TextSegment.Builder().SetText("example text").Build(); Assert.That(textSegment.SubstringAfter(7), Is.EqualTo(" text")); Assert.That(textSegment.SubstringAfter(8), Is.EqualTo("text")); Assert.That(textSegment.SubstringAfter(0), Is.EqualTo("example text")); @@ -211,12 +214,12 @@ public void SubstringAfter() [Test] public void IsMarkerInPrecedingContext() { - var noPrecedingMarkerSegment = new TextSegment.Builder().SetText("example text").Build(); + TextSegment noPrecedingMarkerSegment = new TextSegment.Builder().SetText("example text").Build(); Assert.IsFalse(noPrecedingMarkerSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Chapter)); Assert.IsFalse(noPrecedingMarkerSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse)); Assert.IsFalse(noPrecedingMarkerSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Character)); - var onePrecedingMarkerTextSegment = ( + TextSegment onePrecedingMarkerTextSegment = ( new TextSegment.Builder().SetText("example text").AddPrecedingMarker(UsfmMarkerType.Character).Build() ); @@ -224,7 +227,7 @@ public void IsMarkerInPrecedingContext() Assert.IsFalse(onePrecedingMarkerTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse)); Assert.IsFalse(onePrecedingMarkerTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Chapter)); - var twoPrecedingMarkersTextSegment = ( + TextSegment twoPrecedingMarkersTextSegment = ( new TextSegment.Builder() .SetText("example text") .AddPrecedingMarker(UsfmMarkerType.Chapter) @@ -235,7 +238,7 @@ public void IsMarkerInPrecedingContext() Assert.IsTrue(twoPrecedingMarkersTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse)); Assert.IsFalse(twoPrecedingMarkersTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Character)); - var threePrecedingMarkersTextSegment = ( + TextSegment threePrecedingMarkersTextSegment = ( new TextSegment.Builder() .SetText("example text") 
.AddPrecedingMarker(UsfmMarkerType.Chapter) @@ -251,7 +254,7 @@ public void IsMarkerInPrecedingContext() [Test] public void IsFirstSegmentInVerse() { - var textSegment = new TextSegment.Builder().SetText("example text").Build(); + TextSegment textSegment = new TextSegment.Builder().SetText("example text").Build(); textSegment.IndexInVerse = 0; Assert.IsTrue(textSegment.IsFirstSegmentInVerse()); @@ -262,7 +265,7 @@ public void IsFirstSegmentInVerse() [Test] public void IsLastSegmentInVerse() { - var textSegment = new TextSegment.Builder().SetText("example text").Build(); + TextSegment textSegment = new TextSegment.Builder().SetText("example text").Build(); textSegment.IndexInVerse = 0; textSegment.NumSegmentsInVerse = 1; Assert.IsTrue(textSegment.IsLastSegmentInVerse()); @@ -278,7 +281,7 @@ public void IsLastSegmentInVerse() [Test] public void ReplaceSubstring() { - var textSegment = new TextSegment.Builder().SetText("example text").Build(); + TextSegment textSegment = new TextSegment.Builder().SetText("example text").Build(); textSegment.ReplaceSubstring(0, 7, "sample"); Assert.That(textSegment.Text, Is.EqualTo("sample text")); diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs index cc7bea18..6615ec92 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs @@ -41,7 +41,7 @@ public void ChapterAndVerseMarkers() ) ]; - var actualChapters = usfmStructureExtractor.GetChapters(); + List actualChapters = usfmStructureExtractor.GetChapters(); AssertChapterEqual(expectedChapters, actualChapters); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); @@ -74,7 +74,7 @@ public void StartParagraphMarker() ) ]; - var actualChapters = 
usfmStructureExtractor.GetChapters(); + List actualChapters = usfmStructureExtractor.GetChapters(); AssertChapterEqual(expectedChapters, actualChapters); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); @@ -107,7 +107,7 @@ public void StartCharacterMarker() ) ]; - var actualChapters = usfmStructureExtractor.GetChapters(); + List actualChapters = usfmStructureExtractor.GetChapters(); AssertChapterEqual(expectedChapters, actualChapters); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); @@ -140,7 +140,7 @@ public void EndCharacterMarker() ) ]; - var actualChapters = usfmStructureExtractor.GetChapters(); + List actualChapters = usfmStructureExtractor.GetChapters(); AssertChapterEqual(expectedChapters, actualChapters); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); @@ -173,7 +173,7 @@ public void EndNoteMarker() ) ]; - var actualChapters = usfmStructureExtractor.GetChapters(); + List actualChapters = usfmStructureExtractor.GetChapters(); AssertChapterEqual(expectedChapters, actualChapters); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); @@ -206,7 +206,7 @@ public void EndTableMarker() ) ]; - var actualChapters = usfmStructureExtractor.GetChapters(); + List actualChapters = usfmStructureExtractor.GetChapters(); AssertChapterEqual(expectedChapters, actualChapters); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); @@ -239,7 +239,7 @@ public void RefMarker() ) ]; - var actualChapters = usfmStructureExtractor.GetChapters(); + List actualChapters = usfmStructureExtractor.GetChapters(); 
AssertChapterEqual(expectedChapters, actualChapters); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); @@ -272,7 +272,7 @@ public void SidebarMarker() ) ]; - var actualChapters = usfmStructureExtractor.GetChapters(); + List actualChapters = usfmStructureExtractor.GetChapters(); AssertChapterEqual(expectedChapters, actualChapters); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); @@ -314,7 +314,7 @@ public void MultipleVerses() ) ]; - var actualChapters = usfmStructureExtractor.GetChapters(); + List actualChapters = usfmStructureExtractor.GetChapters(); AssertChapterEqual(expectedChapters, actualChapters); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); @@ -363,7 +363,7 @@ public void MultipleChapters() ), ]; - var actualChapters = usfmStructureExtractor.GetChapters(); + List actualChapters = usfmStructureExtractor.GetChapters(); AssertChapterEqual(expectedChapters, actualChapters); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment); Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment); @@ -404,7 +404,7 @@ public void CharacterMarkerInText() ) ]; - var actualChapters = usfmStructureExtractor.GetChapters(); + List actualChapters = usfmStructureExtractor.GetChapters(); AssertChapterEqual(expectedChapters, actualChapters); Assert.That( actualChapters[0].Verses[0].TextSegments[1].PreviousSegment, @@ -451,7 +451,7 @@ public void EmptyText() ) ]; - var actualChapters = usfmStructureExtractor.GetChapters(); + List actualChapters = usfmStructureExtractor.GetChapters(); AssertChapterEqual(expectedChapters, actualChapters); Assert.That( actualChapters[0].Verses[0].TextSegments[1].PreviousSegment, @@ -489,7 +489,7 @@ private class 
MockUsfmParserState(UsfmStylesheet stylesheet, ScrVers versificati { public void SetVerseNum(int verseNum) { - var vref = VerseRef; + VerseRef vref = VerseRef; vref.VerseNum = verseNum; VerseRef = vref; } From cc00f4ed2f9bc45597efd24a4ced60e2a4219fbe Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 11 Aug 2025 09:14:06 -0400 Subject: [PATCH 28/28] Fix mis-merged file --- .../Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs index 4cfbeedb..485cb1fc 100644 --- a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs @@ -53,11 +53,6 @@ public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block) || alignmentInfo.Alignment.RowCount == 0 || alignmentInfo.Alignment.ColumnCount == 0 || !elements.Any(e => -<<<<<<< HEAD - e.Type.IsOneOf(UsfmUpdateBlockElementType.Paragraph, UsfmUpdateBlockElementType.Style) - && !e.MarkedForRemoval - && e.Tokens.Count == 1 -======= ( e.Type == UsfmUpdateBlockElementType.Paragraph && alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Preserve @@ -67,7 +62,6 @@ public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block) e.Type == UsfmUpdateBlockElementType.Style && alignmentInfo.StyleBehavior == UpdateUsfmMarkerBehavior.Preserve ) ->>>>>>> 55d91e41 (Port add metadata to update block and marker behavior metadata) ) ) {