diff --git a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs
new file mode 100644
index 00000000..193af5b2
--- /dev/null
+++ b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs
@@ -0,0 +1,422 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using SIL.Extensions;
+using SIL.Machine.Translation;
+
+namespace SIL.Machine.Corpora
+{
+    /// <summary>
+    /// Alignment data for a single update block: the tokenized source text, the tokenized
+    /// translation, and the word alignment between the two.
+    /// </summary>
+    public class PlaceMarkersAlignmentInfo
+    {
+        /// <summary>References of the block; the handler keys its lookup on the first one.</summary>
+        public IReadOnlyList<string> Refs { get; }
+
+        /// <summary>Source tokens; row <c>i</c> of <see cref="Alignment"/> refers to token <c>i</c>.</summary>
+        public IReadOnlyList<string> SourceTokens { get; }
+
+        /// <summary>Translation tokens; column <c>j</c> of <see cref="Alignment"/> refers to token <c>j</c>.</summary>
+        public IReadOnlyList<string> TranslationTokens { get; }
+
+        /// <summary>Word alignment between <see cref="SourceTokens"/> and <see cref="TranslationTokens"/>.</summary>
+        public WordAlignmentMatrix Alignment { get; }
+
+        public PlaceMarkersAlignmentInfo(
+            IReadOnlyList<string> refs,
+            IReadOnlyList<string> sourceTokens,
+            IReadOnlyList<string> translationTokens,
+            WordAlignmentMatrix alignment
+        )
+        {
+            Refs = refs;
+            SourceTokens = sourceTokens;
+            TranslationTokens = translationTokens;
+            Alignment = alignment;
+        }
+    }
+
+    /// <summary>
+    /// Update block handler that uses word alignments to move the paragraph and style markers
+    /// of a block to the corresponding positions in the new (translated) text.
+    /// </summary>
+    public class PlaceMarkersUsfmUpdateBlockHandler : IUsfmUpdateBlockHandler
+    {
+        private readonly IDictionary<string, PlaceMarkersAlignmentInfo> _alignmentInfo;
+
+        /// <summary>Indexes the alignment info by the first reference of each entry.</summary>
+        /// <param name="alignmentInfo">Alignment info; every entry needs at least one reference.</param>
+        public PlaceMarkersUsfmUpdateBlockHandler(IEnumerable<PlaceMarkersAlignmentInfo> alignmentInfo)
+        {
+            _alignmentInfo = alignmentInfo.ToDictionary(info => info.Refs.First(), info => info);
+        }
+
+        /// <summary>
+        /// Rebuilds the block so that its preserved paragraph and style markers are interleaved with
+        /// the new text at the positions predicted from the word alignment.
+        /// </summary>
+        /// <param name="block">The block to process.</param>
+        /// <returns>
+        /// The original block when there is nothing to place (no references, no matching alignment,
+        /// no preserved markers or no new text); otherwise a new block with the placed elements
+        /// followed by the elements marked for removal.
+        /// </returns>
+        public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block)
+        {
+            // A block without references cannot be matched to any alignment info
+            if (!block.Refs.Any())
+                return block;
+
+            string reference = block.Refs.First().ToString();
+            var elements = block.Elements.ToList();
+
+            // Nothing to do if there are no markers to place or no alignment to use
+            if (
+                elements.Count == 0
+                || !_alignmentInfo.TryGetValue(reference, out PlaceMarkersAlignmentInfo alignmentInfo)
+                || alignmentInfo.Alignment.RowCount == 0
+                || alignmentInfo.Alignment.ColumnCount == 0
+                || !elements.Any(e =>
+                    e.Type.IsOneOf(UsfmUpdateBlockElementType.Paragraph, UsfmUpdateBlockElementType.Style)
+                    && !e.MarkedForRemoval
+                )
+            )
+            {
+                return block;
+            }
+
+            // Paragraph markers at the end of the block should stay there
+            // Section headers should be ignored but re-inserted in the same position relative to other paragraph markers
+            var endElements = new List<UsfmUpdateBlockElement>();
+            bool eobEmptyParas = true;
+            var headerElements = new List<(int ParaMarkersLeft, UsfmUpdateBlockElement Element)>();
+            int paraMarkersLeft = 0;
+            // Walk backwards so that removing the current element never shifts an index still to be visited
+            for (int i = elements.Count - 1; i >= 0; i--)
+            {
+                UsfmUpdateBlockElement element = elements[i];
+                if (element.Type == UsfmUpdateBlockElementType.Paragraph && !element.MarkedForRemoval)
+                {
+                    if (element.Tokens.Count > 1)
+                    {
+                        headerElements.Insert(0, (paraMarkersLeft, element));
+                        elements.RemoveAt(i);
+                    }
+                    else
+                    {
+                        paraMarkersLeft++;
+
+                        if (eobEmptyParas)
+                        {
+                            endElements.Insert(0, element);
+                            elements.RemoveAt(i);
+                        }
+                    }
+                }
+                else if (
+                    !(
+                        element.Type == UsfmUpdateBlockElementType.Embed
+                        || (
+                            element.Type == UsfmUpdateBlockElementType.Text
+                            && element.Tokens[0].ToUsfm().Trim().Length == 0
+                        )
+                    )
+                )
+                {
+                    eobEmptyParas = false;
+                }
+            }
+
+            IReadOnlyList<string> sourceTokens = alignmentInfo.SourceTokens;
+            IReadOnlyList<string> targetTokens = alignmentInfo.TranslationTokens;
+            int sourceTokenIndex = 0;
+
+            string targetSentence = "";
+            var toPlace = new List<UsfmUpdateBlockElement>();
+            var adjacentSourceTokens = new List<int>();
+            var placedElements = new List<UsfmUpdateBlockElement>();
+            var embedElements = new List<UsfmUpdateBlockElement>();
+            var ignoredElements = new List<UsfmUpdateBlockElement>();
+            foreach (UsfmUpdateBlockElement element in elements)
+            {
+                if (element.Type == UsfmUpdateBlockElementType.Text)
+                {
+                    string text = element.Tokens[0].ToUsfm();
+                    if (element.MarkedForRemoval)
+                    {
+                        // Track seen tokens (ordinal search, so finding and locating a token always agree)
+                        while (sourceTokenIndex < sourceTokens.Count)
+                        {
+                            string sourceToken = sourceTokens[sourceTokenIndex];
+                            int tokenStart = text.IndexOf(sourceToken, StringComparison.Ordinal);
+                            if (tokenStart < 0)
+                                break;
+                            text = text.Substring(tokenStart + sourceToken.Length);
+                            sourceTokenIndex++;
+                        }
+                        // Handle tokens split across text elements
+                        if (text.Trim().Length > 0)
+                            sourceTokenIndex++;
+                    }
+                    else
+                    {
+                        targetSentence += text;
+                    }
+                }
+
+                if (element.MarkedForRemoval)
+                {
+                    ignoredElements.Add(element);
+                }
+                else if (element.Type == UsfmUpdateBlockElementType.Embed)
+                {
+                    embedElements.Add(element);
+                }
+                else if (element.Type.IsOneOf(UsfmUpdateBlockElementType.Paragraph, UsfmUpdateBlockElementType.Style))
+                {
+                    toPlace.Add(element);
+                    adjacentSourceTokens.Add(sourceTokenIndex);
+                }
+            }
+
+            if (targetSentence.Trim().Length == 0)
+                return block;
+
+            // Character offset of each target token within the target sentence
+            var targetTokenStarts = new List<int>();
+            int searchStart = 0;
+            foreach (string token in targetTokens)
+            {
+                int tokenStart = targetSentence.IndexOf(token, searchStart, StringComparison.Ordinal);
+                if (tokenStart < 0)
+                {
+                    // Token not present in the text: fall back to the current position instead of
+                    // recording -1, which would later be used as a substring offset
+                    tokenStart = searchStart;
+                }
+                else
+                {
+                    searchStart = tokenStart + token.Length;
+                }
+                targetTokenStarts.Add(tokenStart);
+            }
+
+            var toInsert = new List<(int Index, UsfmUpdateBlockElement Element)>();
+            for (int k = 0; k < toPlace.Count; k++)
+            {
+                int adjacentTargetToken = PredictMarkerLocation(
+                    alignmentInfo.Alignment,
+                    adjacentSourceTokens[k],
+                    sourceTokens,
+                    targetTokens
+                );
+                int targetStringIndex =
+                    adjacentTargetToken < targetTokenStarts.Count
+                        ? targetTokenStarts[adjacentTargetToken]
+                        : targetSentence.Length;
+                toInsert.Add((targetStringIndex, toPlace[k]));
+            }
+            // OrderBy is a stable sort: markers predicted at the same offset keep their original order
+            // (List.Sort gives no such guarantee)
+            toInsert = toInsert.OrderBy(p => p.Index).ToList();
+            toInsert.AddRange(embedElements.Concat(endElements).Select(e => (targetSentence.Length, e)));
+
+            // Construct new text tokens to put between markers
+            // and reincorporate headers and empty end-of-verse paragraph markers
+            // (if only headers were left to place, all of the text goes first)
+            int firstInsertIndex = toInsert.Count > 0 ? toInsert[0].Index : targetSentence.Length;
+            if (firstInsertIndex > 0)
+            {
+                placedElements.Add(
+                    new UsfmUpdateBlockElement(
+                        UsfmUpdateBlockElementType.Text,
+                        new List<UsfmToken>() { new UsfmToken(targetSentence.Substring(0, firstInsertIndex)) }
+                    )
+                );
+            }
+
+            for (int j = 0; j < toInsert.Count; j++)
+            {
+                (int insertIndex, UsfmUpdateBlockElement element) = toInsert[j];
+                if (element.Type == UsfmUpdateBlockElementType.Paragraph)
+                {
+                    while (headerElements.Count > 0 && headerElements[0].ParaMarkersLeft == paraMarkersLeft)
+                    {
+                        placedElements.Add(headerElements[0].Element);
+                        headerElements.RemoveAt(0);
+                    }
+                    paraMarkersLeft--;
+                }
+
+                placedElements.Add(element);
+                if (
+                    insertIndex < targetSentence.Length
+                    && (j + 1 == toInsert.Count || insertIndex < toInsert[j + 1].Index)
+                )
+                {
+                    UsfmToken textToken;
+                    if (j + 1 < toInsert.Count)
+                    {
+                        textToken = new UsfmToken(
+                            targetSentence.Substring(insertIndex, toInsert[j + 1].Index - insertIndex)
+                        );
+                    }
+                    else
+                    {
+                        textToken = new UsfmToken(targetSentence.Substring(insertIndex));
+                    }
+                    placedElements.Add(
+                        new UsfmUpdateBlockElement(UsfmUpdateBlockElementType.Text, new List<UsfmToken> { textToken })
+                    );
+                }
+            }
+            while (headerElements.Count > 0)
+            {
+                placedElements.Add(headerElements[0].Element);
+                headerElements.RemoveAt(0);
+            }
+
+            var processedBlock = new UsfmUpdateBlock(
+                refs: block.Refs,
+                elements: placedElements.Concat(ignoredElements)
+            );
+            return processedBlock;
+        }
+
+        /// <summary>
+        /// Predicts the index of the target token that should directly follow a marker.
+        /// </summary>
+        /// <param name="alignment">Word alignment (rows are source tokens, columns are target tokens).</param>
+        /// <param name="adjacentSourceToken">Index of the source token that directly follows the marker.</param>
+        /// <param name="sourceTokens">The source tokens.</param>
+        /// <param name="targetTokens">The target tokens.</param>
+        /// <returns>
+        /// The predicted target token index, or <c>targetTokens.Count</c> (end of the sentence) when no
+        /// usable alignment is found.
+        /// </returns>
+        private int PredictMarkerLocation(
+            WordAlignmentMatrix alignment,
+            int adjacentSourceToken,
+            IReadOnlyList<string> sourceTokens,
+            IReadOnlyList<string> targetTokens
+        )
+        {
+            // Gets the number of alignment pairs that "cross the line" between
+            // the src marker position and the potential trg marker position, (src_idx - .5) and (trg_idx - .5)
+            int NumAlignCrossings(int sourceIndex, int targetIndex)
+            {
+                int crossings = 0;
+                for (int i = 0; i < alignment.RowCount; i++)
+                {
+                    for (int j = 0; j < alignment.ColumnCount; j++)
+                    {
+                        if (
+                            alignment[i, j]
+                            && ((i < sourceIndex && j >= targetIndex) || (i >= sourceIndex && j < targetIndex))
+                        )
+                        {
+                            crossings++;
+                        }
+                    }
+                }
+                return crossings;
+            }
+
+            // If the token on either side of a potential target location is punctuation,
+            // use it as the basis for deciding the target marker location
+            int targetHypothesis = -1;
+            int[] punctuationHypotheses = new int[] { -1, 0 };
+            foreach (int punctuationHypothesis in punctuationHypotheses)
+            {
+                int sourceHypothesis = adjacentSourceToken + punctuationHypothesis;
+                if (sourceHypothesis < 0 || sourceHypothesis >= sourceTokens.Count)
+                {
+                    continue;
+                }
+                // Only accept aligned pairs where both the src and trg token are punctuation
+                string hypothesisToken = sourceTokens[sourceHypothesis];
+                if (
+                    hypothesisToken.Length > 0
+                    && !hypothesisToken.Any(char.IsLetter)
+                    && sourceHypothesis < alignment.RowCount
+                )
+                {
+                    List<int> alignedTargetTokens = alignment.GetRowAlignedIndices(sourceHypothesis).ToList();
+                    // If aligning to a token that precedes that marker,
+                    // the trg token predicted to be closest to the marker
+                    // is the last token aligned to the src rather than the first
+                    if (punctuationHypothesis < 0)
+                        alignedTargetTokens.Reverse();
+                    foreach (int targetIndex in alignedTargetTokens)
+                    {
+                        // Ignore alignments that point past the supplied target tokens
+                        if (targetIndex >= targetTokens.Count)
+                            continue;
+                        string targetToken = targetTokens[targetIndex];
+                        if (targetToken.Length > 0 && !targetToken.Any(char.IsLetter))
+                        {
+                            targetHypothesis = targetIndex;
+                            break;
+                        }
+                    }
+                }
+                if (targetHypothesis != -1)
+                {
+                    // Since the marker location is represented by the token after the marker,
+                    // adjust the index when aligning to punctuation that precedes the token
+                    return targetHypothesis + (punctuationHypothesis == -1 ? 1 : 0);
+                }
+            }
+
+            int[] hypotheses = new int[] { 0, 1, 2 };
+            int bestHypothesis = -1;
+            // Sentinel larger than any possible crossing count, so the first candidate is always accepted
+            int bestNumCrossings = int.MaxValue;
+            var checkedHypotheses = new HashSet<int>();
+            foreach (int hypothesis in hypotheses)
+            {
+                int sourceHypothesis = adjacentSourceToken + hypothesis;
+                if (checkedHypotheses.Contains(sourceHypothesis))
+                    continue;
+                targetHypothesis = -1;
+                while (targetHypothesis == -1 && sourceHypothesis >= 0 && sourceHypothesis < alignment.RowCount)
+                {
+                    checkedHypotheses.Add(sourceHypothesis);
+                    List<int> alignedTargetTokens = alignment.GetRowAlignedIndices(sourceHypothesis).ToList();
+                    if (alignedTargetTokens.Count > 0)
+                    {
+                        // If aligning with a source token that precedes the marker,
+                        // the target token predicted to be closest to the marker is the last aligned token rather than the first
+                        targetHypothesis =
+                            hypothesis < 0
+                                ? alignedTargetTokens[alignedTargetTokens.Count - 1]
+                                : alignedTargetTokens[0];
+                    }
+                    else
+                    {
+                        // continue the search outwards
+                        sourceHypothesis += hypothesis < 0 ? -1 : 1;
+                    }
+                }
+                if (targetHypothesis != -1)
+                {
+                    int numCrossings = NumAlignCrossings(adjacentSourceToken, targetHypothesis);
+                    if (numCrossings < bestNumCrossings)
+                    {
+                        bestHypothesis = targetHypothesis;
+                        bestNumCrossings = numCrossings;
+                    }
+                    if (numCrossings == 0)
+                    {
+                        break;
+                    }
+                }
+            }
+
+            // If no alignments found, insert at the end of the sentence
+            return bestHypothesis != -1 ? bestHypothesis : targetTokens.Count;
+        }
+    }
+}
diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index ae28245f..0da338ef 100644 --- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -119,13 +119,17 @@ public override void StartPara( IReadOnlyList attributes ) { - if ( - state.IsVerseText - && (HasNewText() || _textBehavior == UpdateUsfmTextBehavior.StripExisting) - && _paragraphBehavior == UpdateUsfmMarkerBehavior.Strip - ) + if (state.IsVerseText) { - SkipUpdatableTokens(state); + // Only strip paragraph markers in a verse + if (_paragraphBehavior == UpdateUsfmMarkerBehavior.Preserve) + { + CollectUpdatableTokens(state); + } + else + { + SkipUpdatableTokens(state); + } } else { @@ -201,6 +205,16 @@ string pubNumber { UseUpdatedText(); + // Ensure that a paragraph that contains a verse is not marked for removal + if (_updateBlocks.Count > 0) + { + UsfmUpdateBlockElement lastParagraph = _updateBlocks.Peek().GetLastParagraph(); + if (lastParagraph != null) + { + lastParagraph.MarkedForRemoval = false; + } + } + base.Verse(state, number, marker, altNumber, pubNumber); CollectReadonlyTokens(state); @@ -317,12 +331,13 @@ public override void Unmatched(UsfmParserState state, string marker) protected override void StartVerseText(UsfmParserState state, IReadOnlyList scriptureRefs) { + CollectUpdatableTokens(state); StartUpdateBlock(scriptureRefs); } protected override void EndVerseText(UsfmParserState state, IReadOnlyList scriptureRefs) { - EndUpdateBlock(scriptureRefs); + EndUpdateBlock(state, scriptureRefs); } protected override void StartNonVerseText(UsfmParserState state, ScriptureRef scriptureRef) @@ -332,7 +347,7 @@ protected override void StartNonVerseText(UsfmParserState state, ScriptureRef sc protected override void EndNonVerseText(UsfmParserState state, ScriptureRef scriptureRef) { - EndUpdateBlock(new[] { scriptureRef }); + 
EndUpdateBlock(state, new[] { scriptureRef }); } protected override void EndEmbedText(UsfmParserState state, ScriptureRef scriptureRef) @@ -445,10 +460,10 @@ private void SkipUpdatableTokens(UsfmParserState state) { _updateBlocks.Peek().AddToken(token, markedForRemoval: true); } - _tokenIndex++; } + _tokenIndex++; } - _tokenIndex = state.Index + state.SpecialTokenCount + 1; + _tokenIndex = state.Index + 1 + state.SpecialTokenCount; } private bool ReplaceWithNewTokens(UsfmParserState state) @@ -498,26 +513,39 @@ private void StartUpdateBlock(IReadOnlyList scriptureRefs) PushUpdatedText(rowTexts.Select(t => new UsfmToken(t + " "))); } - private void EndUpdateBlock(IReadOnlyList scriptureRefs) + private void EndUpdateBlock(UsfmParserState state, IReadOnlyList scriptureRefs) { UseUpdatedText(); PopNewTokens(); UsfmUpdateBlock updateBlock = _updateBlocks.Pop(); updateBlock.UpdateRefs(scriptureRefs); + + // Strip off any non-verse paragraphs that are at the end of the update block + var paraElems = new List(); + while (updateBlock.Elements.Count > 0 && IsNonverseParagraph(state, updateBlock.Elements.Last())) + { + paraElems.Add(updateBlock.Pop()); + } + foreach (IUsfmUpdateBlockHandler handler in _updateBlockHandlers) { updateBlock = handler.ProcessBlock(updateBlock); } + List tokens = updateBlock.GetTokens(); + foreach (UsfmUpdateBlockElement elem in Enumerable.Reverse(paraElems)) + { + tokens.AddRange(elem.GetTokens()); + } if ( _updateBlocks.Count > 0 && _updateBlocks.Peek().Elements.Last().Type == UsfmUpdateBlockElementType.Paragraph ) { - _updateBlocks.Peek().ExtendLastElement(updateBlock.GetTokens()); + _updateBlocks.Peek().ExtendLastElement(tokens); } else { - _tokens.AddRange(updateBlock.GetTokens()); + _tokens.AddRange(tokens); } } @@ -549,5 +577,16 @@ private bool IsInPreservedParagraph(UsfmParserState state) { return state.ParaTag != null && _preserveParagraphStyles.Contains(state.ParaTag.Marker); } + + private bool IsNonverseParagraph(UsfmParserState state, 
UsfmUpdateBlockElement element) + { + if (element.Type != UsfmUpdateBlockElementType.Paragraph) + return false; + UsfmToken paraToken = element.Tokens[0]; + if (paraToken.Marker is null) + return false; + UsfmTag paraTag = state.Stylesheet.GetTag(paraToken.Marker); + return paraTag.TextType != UsfmTextType.VerseText && paraTag.TextType != UsfmTextType.NotSpecified; + } } } diff --git a/src/SIL.Machine/Corpora/UsfmParserState.cs b/src/SIL.Machine/Corpora/UsfmParserState.cs index 88ade395..1b0952f2 100644 --- a/src/SIL.Machine/Corpora/UsfmParserState.cs +++ b/src/SIL.Machine/Corpora/UsfmParserState.cs @@ -172,7 +172,7 @@ public bool IsVerseText foreach (UsfmTag charTag in CharTags) { // Not specified text type is verse text - if (charTag.TextType != UsfmTextType.VerseText && charTag.TextType != 0) + if (charTag.TextType != UsfmTextType.VerseText && charTag.TextType != UsfmTextType.NotSpecified) return false; } diff --git a/src/SIL.Machine/Corpora/UsfmTextBase.cs b/src/SIL.Machine/Corpora/UsfmTextBase.cs index 9a16b102..117cfe74 100644 --- a/src/SIL.Machine/Corpora/UsfmTextBase.cs +++ b/src/SIL.Machine/Corpora/UsfmTextBase.cs @@ -246,9 +246,12 @@ public override void Text(UsfmParserState state, string text) { if (!text.IsWhiteSpace()) { - foreach (UsfmToken token in _nextParaTokens) - rowText.Append(token); - _nextParaTokens.Clear(); + if (CurrentTextType == ScriptureTextType.Verse) + { + foreach (UsfmToken token in _nextParaTokens) + rowText.Append(token.ToString() + " "); + _nextParaTokens.Clear(); + } _nextParaTextStarted = true; } if (rowText.Length == 0 || char.IsWhiteSpace(rowText[rowText.Length - 1])) @@ -280,6 +283,13 @@ protected override void StartVerseText(UsfmParserState state, IReadOnlyList scriptureRefs) { string text = _rowTexts.Pop().ToString(); + if (_text._includeMarkers) + { + foreach (UsfmToken token in _nextParaTokens) + { + text += token.ToString() + " "; + } + } _rows.AddRange(_text.CreateRows(scriptureRefs, text, _sentenceStart)); 
_sentenceStart = state.Token.Marker == "c" || text.HasSentenceEnding(); } diff --git a/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs b/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs index df64a4d6..6640e96a 100644 --- a/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs +++ b/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs @@ -71,6 +71,23 @@ public void UpdateRefs(IEnumerable refs) _refs.AddRange(refs); } + public UsfmUpdateBlockElement GetLastParagraph() + { + foreach (UsfmUpdateBlockElement element in Enumerable.Reverse(_elements)) + { + if (element.Type == UsfmUpdateBlockElementType.Paragraph) + return element; + } + return null; + } + + public UsfmUpdateBlockElement Pop() + { + UsfmUpdateBlockElement element = _elements.Last(); + _elements.RemoveAt(_elements.Count - 1); + return element; + } + public List GetTokens() { return _elements.SelectMany(e => e.GetTokens()).ToList(); diff --git a/src/SIL.Machine/Corpora/UsfmUpdateBlockElement.cs b/src/SIL.Machine/Corpora/UsfmUpdateBlockElement.cs index cb5a3131..a407af34 100644 --- a/src/SIL.Machine/Corpora/UsfmUpdateBlockElement.cs +++ b/src/SIL.Machine/Corpora/UsfmUpdateBlockElement.cs @@ -15,7 +15,7 @@ public class UsfmUpdateBlockElement { public UsfmUpdateBlockElementType Type { get; } public List Tokens { get; } - public bool MarkedForRemoval { get; } + public bool MarkedForRemoval { get; set; } public UsfmUpdateBlockElement( UsfmUpdateBlockElementType type, diff --git a/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs new file mode 100644 index 00000000..a50e67c0 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs @@ -0,0 +1,694 @@ +namespace SIL.Machine.Corpora; + +using NUnit.Framework; +using SIL.Machine.Tokenization; +using SIL.Machine.Translation; + +[TestFixture] +public class PlaceMarkersUsfmUpdateBlockHandlerTests +{ + private static readonly LatinWordTokenizer Tokenizer = new 
LatinWordTokenizer(); + + [Test] + public void UpdateUsfm_ParagraphMarkers() + { + string source = "This is the first paragraph. This text is in English, and this test is for paragraph markers."; + string pretranslation = + "Este es el primer párrafo. Este texto está en inglés y esta prueba es para marcadores de párrafo."; + IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), pretranslation)]; + string usfm = + @"\id MAT +\c 1 +\v 1 This is the first paragraph. +\p This text is in English, +\p and this test is for paragraph markers. +"; + IReadOnlyList alignInfo = + [ + new PlaceMarkersAlignmentInfo( + refs: ["MAT 1:1"], + sourceTokens: Tokenizer.Tokenize(source).ToList(), + translationTokens: Tokenizer.Tokenize(pretranslation).ToList(), + alignment: ToWordAlignmentMatrix( + "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" + ) + ) + ]; + + string target = UpdateUsfm( + rows, + usfm, + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + ); + + string result = + @"\id MAT +\c 1 +\v 1 Este es el primer párrafo. +\p Este texto está en inglés +\p y esta prueba es para marcadores de párrafo. +"; + + AssertUsfmEquals(target, result); + } + + [Test] + public void UpdateUsfm_StyleMarkers() + { + string source = "This is the first sentence. This text is in English, and this test is for style markers."; + string pretranslation = + "Esta es la primera oración. Este texto está en inglés y esta prueba es para marcadores de estilo."; + IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), pretranslation)]; + string usfm = + @"\id MAT +\c 1 +\v 1 This is the \w first\w* sentence. This text is in \w English\w*, and this test is \w for\w* style markers. 
+"; + IReadOnlyList alignInfo = + [ + new PlaceMarkersAlignmentInfo( + refs: ["MAT 1:1"], + sourceTokens: Tokenizer.Tokenize(source).ToList(), + translationTokens: Tokenizer.Tokenize(pretranslation).ToList(), + alignment: ToWordAlignmentMatrix( + "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" + ) + ) + ]; + + string target = UpdateUsfm( + rows, + usfm, + styleBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + ); + + string result = + @"\id MAT +\c 1 +\v 1 Esta es la \w primera \w*oración. Este texto está en \w inglés \w*y esta prueba es \w para \w*marcadores de estilo. +"; + + // NOTE: the spacing before/after end markers is incorrect, + // but this is an issue with how the is USFM is generated from the tokens + AssertUsfmEquals(target, result); + + target = UpdateUsfm( + rows, + usfm, + styleBehavior: UpdateUsfmMarkerBehavior.Strip, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + ); + + result = + @"\id MAT +\c 1 +\v 1 Esta es la primera oración. Este texto está en inglés y esta prueba es para marcadores de estilo. 
+"; + + AssertUsfmEquals(target, result); + } + + // NOTE: Not currently updating embeds, will need to change test when we do + [Test] + public void UpdateUsfm_EmbedMarkers() + { + IReadOnlyList<(IReadOnlyList, string)> rows = + [ + (ScrRef("MAT 1:1"), "New verse 1"), + (ScrRef("MAT 1:2"), "New verse 2"), + (ScrRef("MAT 1:3"), "New verse 3"), + (ScrRef("MAT 1:4"), "New verse 4"), + (ScrRef("MAT 1:4/1:f"), "New embed text"), + (ScrRef("MAT 1:5"), "New verse 5"), + (ScrRef("MAT 1:6"), "New verse 6"), + (ScrRef("MAT 1:6/1:f"), "New verse 6 embed text") + ]; + string usfm = + @"\id MAT +\c 1 +\v 1 \f \fr 1.1 \ft Some note \f*Start of sentence embed +\v 2 Middle of sentence \f \fr 1.2 \ft Some other note \f*embed +\v 3 End of sentence embed\f \fr 1.3 \ft A third note \f* +\v 4 Updated embed\f \fr 1.4 \ft A fourth note \f* +\v 5 Embed with style markers \f \fr 1.5 \ft A \+w stylish\+w* note \f* +\v 6 Updated embed with style markers \f \fr 1.6 \ft Another \+w stylish\+w* note \f* +"; + IReadOnlyList alignInfo = []; + + string target = UpdateUsfm( + rows, + usfm, + embedBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + ); + + string result = + @"\id MAT +\c 1 +\v 1 New verse 1 \f \fr 1.1 \ft Some note \f* +\v 2 New verse 2 \f \fr 1.2 \ft Some other note \f* +\v 3 New verse 3 \f \fr 1.3 \ft A third note \f* +\v 4 New verse 4 \f \fr 1.4 \ft A fourth note \f* +\v 5 New verse 5 \f \fr 1.5 \ft A \+w stylish\+w* note \f* +\v 6 New verse 6 \f \fr 1.6 \ft Another \+w stylish\+w* note \f* +"; + + AssertUsfmEquals(target, result); + + target = UpdateUsfm( + rows, + usfm, + embedBehavior: UpdateUsfmMarkerBehavior.Strip, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + ); + + result = + @"\id MAT +\c 1 +\v 1 New verse 1 +\v 2 New verse 2 +\v 3 New verse 3 +\v 4 New verse 4 +\v 5 New verse 5 +\v 6 New verse 6 +"; + + AssertUsfmEquals(target, result); + } + + [Test] + public void 
UpdateUsfm_TrailingEmptyParagraphs() + { + IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), "New verse 1")]; + string usfm = + @"\id MAT +\c 1 +\v 1 \f embed 1 \f*Verse 1 +\p +\b +\q1 \f embed 2 \f* +"; + IReadOnlyList alignInfo = + [ + new PlaceMarkersAlignmentInfo( + refs: ["MAT 1:1"], + sourceTokens: ["Verse", "1"], + translationTokens: ["New", "verse", "1"], + alignment: ToWordAlignmentMatrix("0-1 1-2") + ) + ]; + + string target = UpdateUsfm( + rows, + usfm, + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + ); + + string result = + @"\id MAT +\c 1 +\v 1 New verse 1 \f embed 1 \f*\f embed 2 \f* +\p +\b +\q1 +"; + AssertUsfmEquals(target, result); + } + + [Test] + public void UpdateUsfm_Headers() + { + IReadOnlyList<(IReadOnlyList, string)> rows = + [ + (ScrRef("MAT 1:1"), "X Y Z"), + (ScrRef("MAT 1:2"), "X"), + (ScrRef("MAT 1:3"), "Y"), + (ScrRef("MAT 1:3/1:s1"), "Updated header") + ]; + string usfm = + @"\id MAT +\c 1 +\s1 Start of chapter header +\p +\v 1 A +\p B +\s1 Mid-verse header +\p C +\s1 Header between verse text and empty end-of-verse paragraphs +\p +\p +\p +\s1 Header after all verse paragraphs +\p +\v 2 A +\s1 Header followed by a reference +\r (reference) +\p +\v 3 B +\s1 Header to be updated +"; + IReadOnlyList alignInfo = + [ + new PlaceMarkersAlignmentInfo( + refs: ["MAT 1:1"], + sourceTokens: ["A", "B", "C"], + translationTokens: ["X", "Y", "Z"], + alignment: ToWordAlignmentMatrix("0-0 1-1 2-2") + ), + new PlaceMarkersAlignmentInfo( + refs: ["MAT 1:2"], + sourceTokens: ["A"], + translationTokens: ["X"], + alignment: ToWordAlignmentMatrix("0-0") + ) + ]; + + string target = UpdateUsfm( + rows, + usfm, + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + ); + + string result = + @"\id MAT +\c 1 +\s1 Start of chapter header +\p +\v 1 X +\p Y +\s1 Mid-verse 
header +\p Z +\s1 Header between verse text and empty end-of-verse paragraphs +\p +\p +\p +\s1 Header after all verse paragraphs +\p +\v 2 X +\s1 Header followed by a reference +\r (reference) +\p +\v 3 Y +\s1 Updated header +"; + + AssertUsfmEquals(target, result); + } + + [Test] + public void UpdateUsfm_ConsecutiveMarkers() + { + IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), "New verse 1 WORD"),]; + string usfm = + @"\id MAT +\c 1 +\v 1 Old verse 1 +\p \qt \+w word \+w* \qt* +"; + IReadOnlyList alignInfo = + [ + new PlaceMarkersAlignmentInfo( + refs: ["MAT 1:1"], + sourceTokens: ["Old", "verse", "1", "word"], + translationTokens: ["New", "verse", "1", "WORD"], + alignment: ToWordAlignmentMatrix("0-0 1-1 2-2 3-3") + ) + ]; + + string target = UpdateUsfm( + rows, + usfm, + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + ); + + string result = + @"\id MAT +\c 1 +\v 1 New verse 1 +\p \qt \+w WORD \+w*\qt* +"; + + AssertUsfmEquals(target, result); + } + + [Test] + public void UpdateUsfm_VerseRanges() + { + IReadOnlyList<(IReadOnlyList, string)> rows = + [ + ( + Enumerable.Range(1, 6).Select(i => ScriptureRef.Parse($"MAT 1:{i}")).ToList(), + "New verse range text new paragraph 2" + ) + ]; + string usfm = + @"\id MAT +\c 1 +\v 1-5 Verse range +\p old paragraph 2 +"; + IReadOnlyList alignInfo = + [ + new PlaceMarkersAlignmentInfo( + refs: Enumerable.Range(1, 6).Select(i => ScriptureRef.Parse($"MAT 1:{i}").ToString()).ToList(), + sourceTokens: ["Verse", "range", "old", "paragraph", "2"], + translationTokens: ["New", "verse", "range", "text", "new", "paragraph", "2"], + alignment: ToWordAlignmentMatrix("0-1 1-2 2-4 3-5 4-6") + ) + ]; + + string target = UpdateUsfm( + rows, + usfm, + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + 
); + + string result = + @"\id MAT +\c 1 +\v 1-5 New verse range text +\p new paragraph 2 +"; + + AssertUsfmEquals(target, result); + } + + [Test] + public void UpdateUsfm_NoUpdate() + { + IReadOnlyList<(IReadOnlyList, string)> rows = + [ + (ScrRef("MAT 1:1"), "New paragraph 1 New paragraph 2"), + ]; + string usfm = + @"\id MAT +\c 1 +\v 1 Old paragraph 1 +\p Old paragraph 2 +"; + + //Strip paragraphs + IReadOnlyList alignInfo = + [ + new PlaceMarkersAlignmentInfo( + refs: ["MAT 1:1"], + sourceTokens: ["Old", "paragraph", "1", "Old", "paragraph", "2"], + translationTokens: ["New", "paragraph", "1", "New", "paragraph", "2"], + alignment: ToWordAlignmentMatrix("0-0 1-1 2-2 3-3 4-4 5-5") + ) + ]; + + string target = UpdateUsfm( + rows, + usfm, + paragraphBehavior: UpdateUsfmMarkerBehavior.Strip, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + ); + + string result = + @"\id MAT +\c 1 +\v 1 New paragraph 1 New paragraph 2 +"; + + AssertUsfmEquals(target, result); + + //No alignment + alignInfo = + [ + new PlaceMarkersAlignmentInfo( + refs: ["MAT 1:1"], + sourceTokens: [], + translationTokens: [], + alignment: ToWordAlignmentMatrix("") + ) + ]; + + target = UpdateUsfm( + rows, + usfm, + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + ); + + result = + @"\id MAT +\c 1 +\v 1 New paragraph 1 New paragraph 2 +\p +"; + + AssertUsfmEquals(target, result); + + // No text update + rows = []; + alignInfo = []; + target = UpdateUsfm( + rows, + usfm, + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + ); + + result = + @"\id MAT +\c 1 +\v 1 Old paragraph 1 +\p Old paragraph 2 +"; + AssertUsfmEquals(target, result); + } + + [Test] + public void UpdateUsfm_SplitTokens() + { + IReadOnlyList<(IReadOnlyList, string)> rows = + [ + (ScrRef("MAT 1:1"), "words split words split words 
split"), + ]; + string usfm = + @"\id MAT +\c 1 +\v 1 words spl +\p it words spl +\p it words split +"; + + IReadOnlyList alignInfo = + [ + new PlaceMarkersAlignmentInfo( + refs: ["MAT 1:1"], + sourceTokens: ["words", "split", "words", "split", "words", "split"], + translationTokens: ["words", "split", "words", "split", "words", "split"], + alignment: ToWordAlignmentMatrix("0-0 1-1 2-2 3-3 4-4 5-5") + ) + ]; + + string target = UpdateUsfm( + rows, + usfm, + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + ); + + string result = + @"\id MAT +\c 1 +\v 1 words split +\p words split +\p words split +"; + + AssertUsfmEquals(target, result); + } + + [Test] + public void UpdateUsfm_NoText() + { + IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), ""),]; + string usfm = + @"\id MAT +\c 1 +\v 1 \w \w* +"; + + IReadOnlyList alignInfo = + [ + new PlaceMarkersAlignmentInfo( + refs: ["MAT 1:1"], + sourceTokens: [], + translationTokens: [], + alignment: ToWordAlignmentMatrix("") + ) + ]; + + string target = UpdateUsfm( + rows, + usfm, + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + ); + + string result = + @"\id MAT +\c 1 +\v 1 \w \w* +"; + + AssertUsfmEquals(target, result); + } + + [Test] + public void UpdateUsfm_ConsecutiveSubstring() + { + IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), "string ring"),]; + string usfm = + @"\id MAT +\c 1 +\v 1 string +\p ring +"; + + IReadOnlyList alignInfo = + [ + new PlaceMarkersAlignmentInfo( + refs: ["MAT 1:1"], + sourceTokens: ["string", "ring"], + translationTokens: ["string", "ring"], + alignment: ToWordAlignmentMatrix("0-0 1-1") + ) + ]; + + string target = UpdateUsfm( + rows, + usfm, + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [new 
PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + ); + + string result = + @"\id MAT +\c 1 +\v 1 string +\p ring +"; + + AssertUsfmEquals(target, result); + } + + [Test] + public void UpdateUsfm_VersesOutOfOrder() + { + IReadOnlyList<(IReadOnlyList, string)> rows = + [ + (ScrRef("MAT 1:1"), "new verse 1 new paragraph 2"), + (ScrRef("MAT 1:2"), "new verse 2") + ]; + string usfm = + @"\id MAT +\c 1 +\v 2 verse 2 +\v 1 verse 1 +\p paragraph 2 +"; + + IReadOnlyList alignInfo = + [ + new PlaceMarkersAlignmentInfo( + refs: ["MAT 1:1"], + sourceTokens: ["verse", "1", "paragraph", "2"], + translationTokens: ["new", "verse", "1", "new", "paragraph", "2"], + alignment: ToWordAlignmentMatrix("0-1 1-2 2-4 3-5") + ), + new PlaceMarkersAlignmentInfo( + refs: ["MAT 1:2"], + sourceTokens: ["verse", "2"], + translationTokens: ["new", "verse", "2"], + alignment: ToWordAlignmentMatrix("0-1 1-2") + ) + ]; + + string target = UpdateUsfm( + rows, + usfm, + textBehavior: UpdateUsfmTextBehavior.StripExisting, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + ); + + string result = + @"\id MAT +\c 1 +\v 2 new verse 2 +\v 1 +\p +"; + + AssertUsfmEquals(target, result); + } + + private static ScriptureRef[] ScrRef(params string[] refs) + { + return refs.Select(r => ScriptureRef.Parse(r)).ToArray(); + } + + private static WordAlignmentMatrix ToWordAlignmentMatrix(string alignment) + { + IReadOnlyList wordPairs = AlignedWordPair.Parse(alignment).ToList(); + int rowCount = 0; + int columnCount = 0; + foreach (AlignedWordPair pair in wordPairs) + { + if (pair.SourceIndex + 1 > rowCount) + rowCount = pair.SourceIndex + 1; + if (pair.TargetIndex + 1 > columnCount) + columnCount = pair.TargetIndex + 1; + } + return new WordAlignmentMatrix(rowCount, columnCount, wordPairs.Select(wp => (wp.SourceIndex, wp.TargetIndex))); + } + + private static string UpdateUsfm( + IReadOnlyList<(IReadOnlyList, string)> rows, + string source, + string? 
idText = null, + UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferNew, + UpdateUsfmMarkerBehavior paragraphBehavior = UpdateUsfmMarkerBehavior.Preserve, + UpdateUsfmMarkerBehavior embedBehavior = UpdateUsfmMarkerBehavior.Preserve, + UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip, + IEnumerable? preserveParagraphStyles = null, + IEnumerable? usfmUpdateBlockHandlers = null + ) + { + source = source.Trim().ReplaceLineEndings("\r\n") + "\r\n"; + var updater = new UpdateUsfmParserHandler( + rows, + idText, + textBehavior, + paragraphBehavior, + embedBehavior, + styleBehavior, + preserveParagraphStyles, + usfmUpdateBlockHandlers + ); + UsfmParser.Parse(source, updater); + return updater.GetUsfm(); + } + + private static void AssertUsfmEquals(string target, string truth) + { + Assert.That(target, Is.Not.Null); + var target_lines = target.Split(["\n"], StringSplitOptions.None); + var truth_lines = truth.Split(["\n"], StringSplitOptions.None); + for (int i = 0; i < truth_lines.Length; i++) + { + Assert.That(target_lines[i].Trim(), Is.EqualTo(truth_lines[i].Trim()), message: $"Line {i}"); + } + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index 2b24b167..9430621b 100644 --- a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -91,7 +91,9 @@ public void GetUsfm_StripAllText() \rem and this reference too \ip \v 1 Update 1 +\p \v 2 +\p \v 3 Update 3 \v 4 "; @@ -220,8 +222,7 @@ public void GetUsfm_ParagraphInVerse() \p paragraph not in a verse \v 1 Update 1 \s1 Section Header -\v 2 Verse 2 -\p inner verse paragraph +\v 2 Verse 2 inner verse paragraph "; AssertUsfmEquals(target, result); @@ -684,7 +685,8 @@ public void GetUsfm_StripParagraphs() \p This is a paragraph before any verses \p This is a second paragraph before any verses \v 1 Hello -\p World 
+\q1 World +\p \v 2 Hello \p World "; @@ -696,6 +698,7 @@ public void GetUsfm_StripParagraphs() \p This is a paragraph before any verses \p Update Paragraph \v 1 Update Verse 1 +\q1 \p \v 2 Hello \p World @@ -709,8 +712,8 @@ public void GetUsfm_StripParagraphs() \p This is a paragraph before any verses \p Update Paragraph \v 1 Update Verse 1 -\v 2 Hello -\p World +\p +\v 2 Hello World "; AssertUsfmEquals(target, resultS); } @@ -850,7 +853,6 @@ public void UpdateBlock_Verse_PreserveParas() AssertUpdateBlockEquals( usfmUpdateBlock, ["MAT 1:1"], - (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "verse 1 ", true), (UsfmUpdateBlockElementType.Paragraph, "\\p ", false), @@ -881,7 +883,6 @@ public void UpdateBlock_Verse_StripParas() AssertUpdateBlockEquals( usfmUpdateBlock, ["MAT 1:1"], - (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "verse 1 ", true), (UsfmUpdateBlockElementType.Paragraph, "\\p ", true), @@ -912,7 +913,6 @@ public void UpdateBlock_Verse_Range() AssertUpdateBlockEquals( usfmUpdateBlock, ["MAT 1:1", "MAT 1:2", "MAT 1:3"], - (UsfmUpdateBlockElementType.Other, "\\v 1-3 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "verse 1 through 3 ", true) ); @@ -941,7 +941,6 @@ public void UpdateBlock_Footnote_PreserveEmbeds() AssertUpdateBlockEquals( usfmUpdateBlock, ["MAT 1:1"], - (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "verse", true), (UsfmUpdateBlockElementType.Embed, "\\f \\fr 1.1 \\ft Some note \\f*", false), @@ -972,7 +971,6 @@ public void UpdateBlock_Footnote_StripEmbeds() AssertUpdateBlockEquals( usfmUpdateBlock, ["MAT 1:1"], - (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 
", false), (UsfmUpdateBlockElementType.Text, "verse", true), (UsfmUpdateBlockElementType.Embed, "\\f \\fr 1.1 \\ft Some note \\f*", true), @@ -1030,7 +1028,6 @@ public void UpdateBlock_Verse_PreserveStyles() AssertUpdateBlockEquals( usfmUpdateBlock, ["MAT 1:1"], - (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "verse ", true), (UsfmUpdateBlockElementType.Style, "\\bd ", false), @@ -1063,7 +1060,6 @@ public void UpdateBlock_Verse_StripStyles() AssertUpdateBlockEquals( usfmUpdateBlock, ["MAT 1:1"], - (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "verse ", true), (UsfmUpdateBlockElementType.Style, "\\bd ", true), @@ -1099,7 +1095,6 @@ public void UpdateBlock_Verse_SectionHeader() AssertUpdateBlockEquals( usfmUpdateBlockHandler.Blocks[2], ["MAT 1:1"], - (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "Verse 1 ", true), (UsfmUpdateBlockElementType.Paragraph, "\\s Section header ", false), @@ -1108,7 +1103,6 @@ public void UpdateBlock_Verse_SectionHeader() AssertUpdateBlockEquals( usfmUpdateBlockHandler.Blocks[3], ["MAT 1:2"], - (UsfmUpdateBlockElementType.Other, "\\v 2 ", false), (UsfmUpdateBlockElementType.Text, "Verse 2 ", false) ); } @@ -1138,7 +1132,6 @@ public void UpdateBlock_Verse_SectionHeaderInVerse() AssertUpdateBlockEquals( usfmUpdateBlockHandler.Blocks[2], ["MAT 1:1"], - (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "Beginning of verse ", true), (UsfmUpdateBlockElementType.Paragraph, "\\s Section header ", false), @@ -1147,6 +1140,90 @@ public void UpdateBlock_Verse_SectionHeaderInVerse() ); } + [Test] + public void UpdateBlock_NonVerse_ParagraphEndOfVerse() + { + var rows = new 
List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var usfm = + @"\id MAT - Test +\c 1 +\p +\v 1 Verse 1 +\s Section header +"; + TestUsfmUpdateBlockHandler usfmUpdateBlockHandler = new TestUsfmUpdateBlockHandler(); + UpdateUsfm(rows, usfm, usfmUpdateBlockHandlers: [usfmUpdateBlockHandler]); + + Assert.That(usfmUpdateBlockHandler.Blocks.Count, Is.EqualTo(3)); + AssertUpdateBlockEquals(usfmUpdateBlockHandler.Blocks[0], ["MAT 1:0/1:p"]); + AssertUpdateBlockEquals( + usfmUpdateBlockHandler.Blocks[1], + ["MAT 1:1/1:s"], + (UsfmUpdateBlockElementType.Text, "Section header ", false) + ); + AssertUpdateBlockEquals( + usfmUpdateBlockHandler.Blocks[2], + ["MAT 1:1"], + (UsfmUpdateBlockElementType.Text, "Update 1 ", false), + (UsfmUpdateBlockElementType.Text, "Verse 1 ", true) + ); + } + + [Test] + public void GetUsfm_HeaderReferenceParagraphs() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 1:1"), "new verse 1"), + (ScrRef("MAT 1:2"), "new verse 2"), + (ScrRef("MAT 1:3"), "new verse 3"), + (ScrRef("MAT 2:1"), "new verse 1"), + (ScrRef("MAT 2:2"), "new verse 2") + }; + + var usfm = + @"\id MAT +\c 1 +\s1 beginning-of-chapter header +\p +\v 1 verse 1 +\s1 header between verses +\p +\v 2 verse 2 +\s1 mid-verse header +\p more verse 2 +\v 3 verse 3 +\c 2 +\v 1 consecutive elements +\s1 header +\r reference +\p +\v 2 verse 2 +"; + + string target = UpdateUsfm(rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Strip); + var resultP = + @"\id MAT +\c 1 +\s1 beginning-of-chapter header +\p +\v 1 new verse 1 +\s1 header between verses +\p +\v 2 new verse 2 +\s1 mid-verse header +\p +\v 3 new verse 3 +\c 2 +\v 1 new verse 1 +\s1 header +\r reference +\p +\v 2 new verse 2 +"; + AssertUsfmEquals(target, resultP); + } + private static ScriptureRef[] ScrRef(params string[] refs) { return refs.Select(r => ScriptureRef.Parse(r)).ToArray(); diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs 
b/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs index 14288705..129003bc 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs @@ -173,7 +173,7 @@ public void GetRows_IncludeMarkers() Assert.That( rows[0].Text, Is.EqualTo( - "Chapter \\pn one\\+pro WON\\+pro*\\pn*, verse one.\\f + \\fr 1:1: \\ft This is a footnote for v1.\\f*" + "Chapter \\pn one\\+pro WON\\+pro*\\pn*, verse one.\\f + \\fr 1:1: \\ft This is a footnote for v1.\\f* \\li1" ) ); @@ -216,16 +216,19 @@ public void GetRows_IncludeMarkers() Assert.That(rows[11].IsRangeStart, Is.False); Assert.That(rows[12].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:4b", corpus.Versification))); - Assert.That(rows[12].Text, Is.EqualTo("Chapter two, verse four.")); + Assert.That(rows[12].Text, Is.EqualTo("Chapter two, verse four. \\p")); Assert.That(rows[13].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:5", corpus.Versification))); Assert.That(rows[13].Text, Is.EqualTo("Chapter two, verse five \\rq (MAT 3:1)\\rq*.")); Assert.That(rows[14].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:6", corpus.Versification))); - Assert.That(rows[14].Text, Is.EqualTo("Chapter two, verse \\w six|strong=\"12345\" \\w*.")); + Assert.That(rows[14].Text, Is.EqualTo("Chapter two, verse \\w six|strong=\"12345\" \\w*. 
\\p")); + + Assert.That(rows[17].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:8", corpus.Versification))); + Assert.That(rows[17].Text, Is.EqualTo("This is a list: \\b \\tr \\tc1")); Assert.That(rows[18].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:9", corpus.Versification))); - Assert.That(rows[18].Text, Is.EqualTo("Chapter\\tcr2 2\\tc3 verse\\tcr4 9")); + Assert.That(rows[18].Text, Is.EqualTo("Chapter\\tcr2 2\\tc3 verse\\tcr4 9 \\tr \\tc1-2")); Assert.That(rows[19].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:10", corpus.Versification))); Assert.That(rows[19].Text, Is.EqualTo("\\tc3-4 Chapter 2 verse 10")); @@ -253,7 +256,7 @@ public void GetRows_IncludeMarkers_AllText() Assert.That( rows[8].Text, Is.EqualTo( - "Chapter \\pn one\\+pro WON\\+pro*\\pn*, verse one.\\f + \\fr 1:1: \\ft This is a footnote for v1.\\f*" + "Chapter \\pn one\\+pro WON\\+pro*\\pn*, verse one.\\f + \\fr 1:1: \\ft This is a footnote for v1.\\f* \\li1" ) ); diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs index 028968b3..11f8e62b 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs @@ -230,6 +230,30 @@ public void GetRows_OptBreak_OutsideOfSegment() }); } + [Test] + public void GetRows_ParagraphBeforeNonVerseParagraph() + { + TextRow[] rows = GetRows( + @"\id MAT - Test +\c 1 +\p +\v 1 verse 1 +\b +\s1 header +\q1 +\v 2 verse 2 +", + includeAllText: true, + includeMarkers: true + ); + Assert.Multiple(() => + { + Assert.That(rows, Has.Length.EqualTo(4), string.Join(",", rows.Select(tr => tr.Text))); + Assert.That(rows[1].Text, Is.EqualTo("verse 1 \\b \\q1")); + Assert.That(rows[2].Text, Is.EqualTo("header")); + }); + } + private static TextRow[] GetRows(string usfm, bool includeMarkers = false, bool includeAllText = false) { UsfmMemoryText text =