Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
f12c877
One complete chunk of porting
Enkidu93 Jul 7, 2025
4cd7a59
Another complete block of porting
Enkidu93 Jul 8, 2025
4388da4
All of analysis folder
Enkidu93 Jul 9, 2025
d715caa
Another batch
Enkidu93 Jul 11, 2025
8465042
Fix constructor
Enkidu93 Jul 14, 2025
11dbf25
Port some reviewer changes; port some tests
Enkidu93 Jul 21, 2025
1d49973
Passing tests + complete porting
Enkidu93 Jul 28, 2025
e73cb16
Undo small change - passing tests
Enkidu93 Jul 28, 2025
29cd82e
Port more changes
Enkidu93 Jul 28, 2025
d209551
Port better guessing for ambiguous quotation marks
Enkidu93 Jul 28, 2025
347e14c
Remove TODOs
Enkidu93 Jul 28, 2025
6e4798b
Passing tests
Enkidu93 Jul 28, 2025
72d82a9
Add regions
Enkidu93 Jul 28, 2025
e7bd322
Port Ben's most recent test-related changes
Enkidu93 Jul 31, 2025
832cd6a
Fix typo
Enkidu93 Jul 31, 2025
837488d
Add paratext zip quotation convention detector
Enkidu93 Aug 2, 2025
fb2ce79
Make convention detector operable on multiple zips
Enkidu93 Aug 4, 2025
b0f6d92
Rename function
Enkidu93 Aug 5, 2025
27d8573
Fix remark adding when textBehavior is PreferExisting
Enkidu93 Aug 5, 2025
0b8c14b
Port add metadata to update block and marker behavior metadata
Enkidu93 Aug 6, 2025
480619e
Move PunctuationAnalysis out of Corpora
Enkidu93 Aug 6, 2025
26a96b6
Respond to reviewer comments
Enkidu93 Aug 6, 2025
98d89b7
Fix typo
Enkidu93 Aug 6, 2025
6984ad9
Remove debugging line
Enkidu93 Aug 6, 2025
5788e3a
Make standard qcs static
Enkidu93 Aug 6, 2025
a92dae8
Use PCRE.NET to mirror python regexes
Enkidu93 Aug 7, 2025
61428af
Use explicit type rather than var
Enkidu93 Aug 7, 2025
cc00f4e
Fix mis-merged file
Enkidu93 Aug 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 167 additions & 0 deletions src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
using System.Collections.Generic;
using System.Linq;
using SIL.Machine.PunctuationAnalysis;

namespace SIL.Machine.Corpora
{
/// <summary>
/// Quotation mark resolver that classifies each match on its own, using the resolution settings, the
/// predicates exposed by the match (leading/trailing whitespace, trailing punctuation, position in the
/// segment) and the single most recently resolved mark.
/// </summary>
public class FallbackQuotationMarkResolver : IQuotationMarkResolver
{
    private readonly IQuotationMarkResolutionSettings _settings;

    /// <summary>
    /// The mark most recently produced by <see cref="ResolveOpeningMark"/> or
    /// <see cref="ResolveClosingMark"/>; <c>null</c> after construction or <see cref="Reset"/>.
    /// </summary>
    public QuotationMarkMetadata LastQuotationMark { get; set; }

    /// <summary>
    /// Issues recorded while resolving. This is the same set instance that <see cref="GetIssues"/> returns,
    /// and <see cref="Reset"/> clears it in place.
    /// </summary>
    public HashSet<QuotationMarkResolutionIssue> Issues { get; }

    /// <summary>Creates a resolver that consults <paramref name="settings"/> for validity and depth lookups.</summary>
    /// <param name="settings">Settings used to validate marks and to obtain their possible depths.</param>
    public FallbackQuotationMarkResolver(IQuotationMarkResolutionSettings settings)
    {
        _settings = settings;
        LastQuotationMark = null;
        Issues = new HashSet<QuotationMarkResolutionIssue>();
    }

    /// <summary>Forgets the last resolved mark and clears all recorded issues.</summary>
    public void Reset()
    {
        LastQuotationMark = null;
        Issues.Clear();
    }

    /// <summary>
    /// Lazily resolves every match in order. Nothing is resolved, and no issue is recorded, until the
    /// returned sequence is enumerated.
    /// </summary>
    /// <param name="quotationMarkMatches">The matches to resolve, in text order.</param>
    /// <returns>The marks that could be resolved; matches with no possible depth are omitted.</returns>
    public IEnumerable<QuotationMarkMetadata> ResolveQuotationMarks(
        IReadOnlyList<QuotationMarkStringMatch> quotationMarkMatches
    )
    {
        foreach (QuotationMarkStringMatch quoteMatch in quotationMarkMatches)
        {
            foreach (QuotationMarkMetadata quotationMarkMetadata in ResolveQuotationMark(quoteMatch))
            {
                yield return quotationMarkMetadata;
            }
        }
    }

    /// <summary>
    /// Resolves a single match, yielding zero or one resolved mark.
    /// </summary>
    /// <remarks>
    /// Records <see cref="QuotationMarkResolutionIssue.UnexpectedQuotationMark"/> when a mark classified as
    /// opening or closing has no possible depth, and
    /// <see cref="QuotationMarkResolutionIssue.AmbiguousQuotationMark"/> whenever the direction had to be
    /// guessed.
    /// </remarks>
    /// <param name="quotationMarkMatch">The match to resolve.</param>
    /// <returns>A lazy sequence containing the resolved mark, or nothing if it could not be resolved.</returns>
    public IEnumerable<QuotationMarkMetadata> ResolveQuotationMark(QuotationMarkStringMatch quotationMarkMatch)
    {
        // All bookkeeping happens before the single yield below: this is a lazy iterator, so any statement
        // placed after a yield is skipped when the caller stops enumerating after the first item.
        QuotationMarkMetadata quotationMark;
        if (IsOpeningQuotationMark(quotationMarkMatch))
        {
            quotationMark = ResolveOpeningMark(quotationMarkMatch);
            if (quotationMark == null)
                Issues.Add(QuotationMarkResolutionIssue.UnexpectedQuotationMark);
        }
        else if (IsClosingQuotationMark(quotationMarkMatch))
        {
            quotationMark = ResolveClosingMark(quotationMarkMatch);
            if (quotationMark == null)
                Issues.Add(QuotationMarkResolutionIssue.UnexpectedQuotationMark);
        }
        else
        {
            // Make a reasonable guess about the direction of the quotation mark: open a quote if nothing
            // is known yet or the previous mark closed one, otherwise close the one that is open.
            bool guessOpening =
                LastQuotationMark == null || LastQuotationMark.Direction == QuotationMarkDirection.Closing;
            quotationMark = guessOpening
                ? ResolveOpeningMark(quotationMarkMatch)
                : ResolveClosingMark(quotationMarkMatch);
            Issues.Add(QuotationMarkResolutionIssue.AmbiguousQuotationMark);
        }

        if (quotationMark != null)
            yield return quotationMark;
    }

    /// <summary>
    /// Decides whether the match should be treated as an opening mark.
    /// </summary>
    /// <param name="match">The match to classify.</param>
    /// <returns>
    /// <c>false</c> if the settings do not accept it as an opening mark; <c>true</c> if they accept it only
    /// as an opening mark; otherwise the result of the context check below.
    /// </returns>
    public bool IsOpeningQuotationMark(QuotationMarkStringMatch match)
    {
        if (!_settings.IsValidOpeningQuotationMark(match))
            return false;
        if (!_settings.IsValidClosingQuotationMark(match))
            return true;

        // The mark is valid in both directions, so the surrounding context decides.
        bool hasOpeningContext =
            match.IsAtStartOfSegment
            || match.HasLeadingWhitespace()
            || DoesMostRecentOpeningMarkImmediatelyPrecede(match)
            || match.HasQuoteIntroducerInLeadingSubstring();
        return hasOpeningContext && !(match.HasTrailingWhitespace() || match.HasTrailingPunctuation());
    }

    /// <summary>
    /// Returns <c>true</c> when the last resolved mark was an opening mark in the same text segment that
    /// ends exactly where <paramref name="match"/> starts.
    /// </summary>
    /// <param name="match">The match being classified.</param>
    public bool DoesMostRecentOpeningMarkImmediatelyPrecede(QuotationMarkStringMatch match)
    {
        if (LastQuotationMark == null || LastQuotationMark.Direction != QuotationMarkDirection.Opening)
        {
            return false;
        }
        return LastQuotationMark.TextSegment.Equals(match.TextSegment)
            && LastQuotationMark.EndIndex == match.StartIndex;
    }

    /// <summary>
    /// Decides whether the match should be treated as a closing mark.
    /// </summary>
    /// <param name="match">The match to classify.</param>
    /// <returns>
    /// <c>false</c> if the settings do not accept it as a closing mark; <c>true</c> if they accept it only
    /// as a closing mark; otherwise the result of the context check below.
    /// </returns>
    public bool IsClosingQuotationMark(QuotationMarkStringMatch match)
    {
        if (!_settings.IsValidClosingQuotationMark(match))
            return false;
        if (!_settings.IsValidOpeningQuotationMark(match))
            return true;

        // The mark is valid in both directions, so the surrounding context decides.
        bool hasClosingContext =
            match.HasTrailingWhitespace() || match.HasTrailingPunctuation() || match.IsAtEndOfSegment;
        return hasClosingContext && !match.HasLeadingWhitespace();
    }

    /// <summary>
    /// Resolves the match as an opening mark at the smallest depth the settings allow and remembers it as
    /// <see cref="LastQuotationMark"/>.
    /// </summary>
    /// <param name="quotationMarkMatch">The match to resolve.</param>
    /// <returns>The resolved mark, or <c>null</c> if the settings report no possible opening depth.</returns>
    public QuotationMarkMetadata ResolveOpeningMark(QuotationMarkStringMatch quotationMarkMatch)
    {
        return ResolveMark(quotationMarkMatch, QuotationMarkDirection.Opening);
    }

    /// <summary>
    /// Resolves the match as a closing mark at the smallest depth the settings allow and remembers it as
    /// <see cref="LastQuotationMark"/>.
    /// </summary>
    /// <param name="quotationMarkMatch">The match to resolve.</param>
    /// <returns>The resolved mark, or <c>null</c> if the settings report no possible closing depth.</returns>
    public QuotationMarkMetadata ResolveClosingMark(QuotationMarkStringMatch quotationMarkMatch)
    {
        return ResolveMark(quotationMarkMatch, QuotationMarkDirection.Closing);
    }

    /// <summary>Returns the live set of issues recorded since construction or the last reset.</summary>
    public HashSet<QuotationMarkResolutionIssue> GetIssues()
    {
        return Issues;
    }

    // Shared implementation of ResolveOpeningMark/ResolveClosingMark; LastQuotationMark is left untouched
    // when the mark has no possible depth in the requested direction.
    private QuotationMarkMetadata ResolveMark(
        QuotationMarkStringMatch quotationMarkMatch,
        QuotationMarkDirection direction
    )
    {
        HashSet<int> possibleDepths = _settings.GetPossibleDepths(quotationMarkMatch.QuotationMark, direction);
        if (possibleDepths.Count == 0)
            return null;

        QuotationMarkMetadata quotationMark = quotationMarkMatch.Resolve(possibleDepths.Min(), direction);
        LastQuotationMark = quotationMark;
        return quotationMark;
    }
}
}
56 changes: 56 additions & 0 deletions src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
using System;
using System.IO;
using System.Text;
using SIL.Machine.PunctuationAnalysis;

namespace SIL.Machine.Corpora
{
/// <summary>
/// Base class that feeds the USFM of a Paratext project's scripture book files through a
/// <see cref="QuoteConventionDetector"/>. Subclasses supply the file access via <see cref="Exists"/> and
/// <see cref="Open"/>.
/// </summary>
public abstract class ParatextProjectQuoteConventionDetector
{
    private readonly ParatextProjectSettings _settings;

    /// <summary>Creates a detector for a project whose settings are already available.</summary>
    /// <param name="settings">The project settings (file naming, stylesheet, versification).</param>
    protected ParatextProjectQuoteConventionDetector(ParatextProjectSettings settings)
    {
        _settings = settings;
    }

    /// <summary>Creates a detector using the settings produced by <paramref name="settingsParser"/>.</summary>
    /// <param name="settingsParser">Parser whose <c>Parse()</c> result is used as the project settings.</param>
    protected ParatextProjectQuoteConventionDetector(ParatextProjectSettingsParserBase settingsParser)
    {
        _settings = settingsParser.Parse();
    }

    /// <summary>
    /// Parses every scripture book file that exists in the project with <paramref name="handler"/> and
    /// returns the handler's detected quote convention.
    /// </summary>
    /// <param name="handler">
    /// Detector to accumulate into; a new <see cref="QuoteConventionDetector"/> is created when <c>null</c>.
    /// </param>
    /// <returns>The result of <c>DetectQuotationConvention()</c> on the handler after all books are parsed.</returns>
    /// <exception cref="InvalidOperationException">
    /// Parsing a book failed; the message names the file (and the project, when it has a name) and the
    /// original exception is attached as the inner exception.
    /// </exception>
    public QuoteConventionAnalysis GetQuoteConventionAnalysis(QuoteConventionDetector handler = null)
    {
        handler = handler ?? new QuoteConventionDetector();
        foreach (string fileName in _settings.GetAllScriptureBookFileNames())
        {
            // Books that are not present in the project are simply skipped.
            if (!Exists(fileName))
                continue;

            string usfm;
            using (var reader = new StreamReader(Open(fileName)))
            {
                usfm = reader.ReadToEnd();
            }

            try
            {
                UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification);
            }
            catch (Exception ex)
            {
                var sb = new StringBuilder();
                sb.Append($"An error occurred while parsing the usfm for '{fileName}'");
                if (!string.IsNullOrEmpty(_settings.Name))
                    sb.Append($" in project '{_settings.Name}'");
                sb.Append($". Error: '{ex.Message}'");
                throw new InvalidOperationException(sb.ToString(), ex);
            }
        }
        return handler.DetectQuotationConvention();
    }

    /// <summary>Returns whether the project contains a file named <paramref name="fileName"/>.</summary>
    protected abstract bool Exists(string fileName);

    /// <summary>
    /// Opens the project file named <paramref name="fileName"/> for reading. The returned stream is wrapped
    /// in a <see cref="StreamReader"/> that is disposed once the file has been read.
    /// </summary>
    protected abstract Stream Open(string fileName);
}
}
13 changes: 12 additions & 1 deletion src/SIL.Machine/Corpora/ParatextProjectSettings.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.Globalization;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using SIL.Scripture;

Expand Down Expand Up @@ -103,6 +104,16 @@ public string GetBookFileName(string bookId)
return FileNamePrefix + bookPart + FileNameSuffix;
}

/// <summary>
/// Lazily yields the result of <see cref="GetBookFileName"/> for every book id that is selected in
/// <c>Canon.ScriptureBooks</c> after <c>SelectAll()</c> has been called on it.
/// </summary>
public IEnumerable<string> GetAllScriptureBookFileNames()
{
    BookSet allBooks = Canon.ScriptureBooks;
    // NOTE(review): SelectAll() mutates the BookSet handed back by Canon.ScriptureBooks - confirm the
    // property returns a fresh instance rather than a shared one.
    allBooks.SelectAll();
    foreach (string selectedId in allBooks.SelectedBookIds)
        yield return GetBookFileName(selectedId);
}

private static string GetBookFileNameDigits(string bookId)
{
int bookNum = Canon.BookIdToNumber(bookId);
Expand Down
2 changes: 1 addition & 1 deletion src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ protected ParatextProjectTextUpdaterBase(ParatextProjectSettingsParserBase setti

public string UpdateUsfm(
string bookId,
IReadOnlyList<(IReadOnlyList<ScriptureRef>, string)> rows,
IReadOnlyList<UpdateUsfmRow> rows,
string fullName = null,
UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting,
UpdateUsfmMarkerBehavior paragraphBehavior = UpdateUsfmMarkerBehavior.Preserve,
Expand Down
49 changes: 33 additions & 16 deletions src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,49 +8,60 @@ namespace SIL.Machine.Corpora
{
/// <summary>
/// Bundle of tokens, word alignment and marker behaviors that
/// <c>PlaceMarkersUsfmUpdateBlockHandler.ProcessBlock</c> reads from an update block's metadata.
/// </summary>
public class PlaceMarkersAlignmentInfo
{
public IReadOnlyList<string> Refs { get; }
// Key under which ProcessBlock looks up an instance of this class in the block's Metadata; the block is
// returned unchanged when the key is absent or the stored value is of a different type.
public const string MetadataKey = "alignment_info";

public IReadOnlyList<string> SourceTokens { get; }
public IReadOnlyList<string> TranslationTokens { get; }
// ProcessBlock returns the block unchanged when this matrix has zero rows or zero columns.
public WordAlignmentMatrix Alignment { get; }
// ProcessBlock compares this against UpdateUsfmMarkerBehavior.Preserve / Strip for Paragraph elements.
public UpdateUsfmMarkerBehavior ParagraphBehavior { get; }
// ProcessBlock compares this against UpdateUsfmMarkerBehavior.Preserve for Style elements.
public UpdateUsfmMarkerBehavior StyleBehavior { get; }

// Stores each argument in the property of the same name; no validation or copying is performed.
public PlaceMarkersAlignmentInfo(
IReadOnlyList<string> refs,
IReadOnlyList<string> sourceTokens,
IReadOnlyList<string> translationTokens,
WordAlignmentMatrix alignment
WordAlignmentMatrix alignment,
UpdateUsfmMarkerBehavior paragraphBehavior,
UpdateUsfmMarkerBehavior styleBehavior
)
{
Refs = refs;
SourceTokens = sourceTokens;
TranslationTokens = translationTokens;
Alignment = alignment;
ParagraphBehavior = paragraphBehavior;
StyleBehavior = styleBehavior;
}
}

public class PlaceMarkersUsfmUpdateBlockHandler : IUsfmUpdateBlockHandler
{
private readonly IDictionary<string, PlaceMarkersAlignmentInfo> _alignmentInfo;

public PlaceMarkersUsfmUpdateBlockHandler(IEnumerable<PlaceMarkersAlignmentInfo> alignmentInfo)
{
_alignmentInfo = alignmentInfo.ToDictionary(info => info.Refs.First(), info => info);
}

public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block)
{
string reference = block.Refs.FirstOrDefault().ToString();
var elements = block.Elements.ToList();

// Nothing to do if there are no markers to place or no alignment to use
if (!block.Metadata.TryGetValue(PlaceMarkersAlignmentInfo.MetadataKey, out object alignmentObject))
{
return block;
}
if (!(alignmentObject is PlaceMarkersAlignmentInfo alignmentInfo))
{
return block;
}
if (
elements.Count == 0
|| !_alignmentInfo.TryGetValue(reference, out PlaceMarkersAlignmentInfo alignmentInfo)
|| alignmentInfo.Alignment.RowCount == 0
|| alignmentInfo.Alignment.ColumnCount == 0
|| !elements.Any(e =>
e.Type.IsOneOf(UsfmUpdateBlockElementType.Paragraph, UsfmUpdateBlockElementType.Style)
&& !e.MarkedForRemoval
&& e.Tokens.Count == 1
(
e.Type == UsfmUpdateBlockElementType.Paragraph
&& alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Preserve
&& e.Tokens.Count == 1
)
|| (
e.Type == UsfmUpdateBlockElementType.Style
&& alignmentInfo.StyleBehavior == UpdateUsfmMarkerBehavior.Preserve
)
)
)
{
Expand Down Expand Up @@ -112,7 +123,13 @@ public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block)
{
if (element.Type == UsfmUpdateBlockElementType.Text)
{
if (element.MarkedForRemoval)
if (
element.MarkedForRemoval
|| (
element.Type == UsfmUpdateBlockElementType.Paragraph
&& alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Strip
)
)
{
string text = element.Tokens[0].ToUsfm();
sourceSentence += text;
Expand Down
14 changes: 14 additions & 0 deletions src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
using SIL.Machine.PunctuationAnalysis;

namespace SIL.Machine.Corpora
{
/// <summary>
/// Convenience subclass of <see cref="QuotationMarkUpdateFirstPass"/>: the source quote convention is
/// normalized here before being passed to the base class, so callers don't have to know to do it.
/// </summary>
public class QuotationMarkDenormalizationFirstPass : QuotationMarkUpdateFirstPass
{
    /// <param name="sourceQuoteConvention">Source convention; its <c>Normalize()</c> result is what the base receives.</param>
    /// <param name="targetQuoteConvention">Target convention, passed to the base unchanged.</param>
    public QuotationMarkDenormalizationFirstPass(
        QuoteConvention sourceQuoteConvention,
        QuoteConvention targetQuoteConvention
    )
        : base(sourceQuoteConvention.Normalize(), targetQuoteConvention)
    {
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
using SIL.Machine.PunctuationAnalysis;

namespace SIL.Machine.Corpora
{
/// <summary>
/// Convenience subclass of <see cref="QuoteConventionChangingUsfmUpdateBlockHandler"/>: the source quote
/// convention is normalized here before being passed to the base class, so callers don't have to know to
/// do it.
/// </summary>
public class QuotationMarkDenormalizationUsfmUpdateBlockHandler : QuoteConventionChangingUsfmUpdateBlockHandler
{
    /// <param name="sourceQuoteConvention">Source convention; its <c>Normalize()</c> result is what the base receives.</param>
    /// <param name="targetQuoteConvention">Target convention, passed to the base unchanged.</param>
    /// <param name="settings">Update settings; a default-constructed instance is used when <c>null</c>.</param>
    public QuotationMarkDenormalizationUsfmUpdateBlockHandler(
        QuoteConvention sourceQuoteConvention,
        QuoteConvention targetQuoteConvention,
        QuotationMarkUpdateSettings settings = null
    )
        : base(sourceQuoteConvention.Normalize(), targetQuoteConvention, settings ?? new QuotationMarkUpdateSettings())
    {
    }
}
}
Loading
Loading