Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ await client.BuildStartedAsync(
List<InsertPretranslationsRequest> pretranslationsRequests = [];
await _parallelCorpusPreprocessingService.PreprocessAsync(
request.Corpora.Select(Map).ToList(),
row => Task.CompletedTask,
(_, _) => Task.CompletedTask,
(row, _, corpus) =>
{
string[] tokens = row.SourceSegment.Split();
Expand Down
2 changes: 1 addition & 1 deletion src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ await client.BuildStartedAsync(
List<InsertWordAlignmentsRequest> wordAlignmentsRequests = [];
await _parallelCorpusPreprocessingService.PreprocessAsync(
request.Corpora.Select(Map).ToList(),
row => Task.CompletedTask,
(_, _) => Task.CompletedTask,
(row, _, corpus) =>
{
wordAlignmentsRequests.Add(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,14 @@ CancellationToken cancellationToken
await DownloadDataAsync(buildId, corpusDir, cancellationToken);

// assemble corpus
ITextCorpus sourceCorpus = new TextFileTextCorpus(Path.Combine(corpusDir, "train.src.txt"));
ITextCorpus targetCorpus = new TextFileTextCorpus(Path.Combine(corpusDir, "train.trg.txt"));
ITextCorpus sourceCorpus = new TextFileTextCorpus(
Path.Combine(corpusDir, "train.src.txt"),
Path.Combine(corpusDir, "train.key-terms.src.txt")
);
ITextCorpus targetCorpus = new TextFileTextCorpus(
Path.Combine(corpusDir, "train.trg.txt"),
Path.Combine(corpusDir, "train.key-terms.trg.txt")
);
IParallelTextCorpus parallelCorpus = sourceCorpus.AlignRows(targetCorpus);

// train SMT model
Expand Down Expand Up @@ -106,6 +112,20 @@ private async Task DownloadDataAsync(string buildId, string corpusDir, Cancellat
);
await using FileStream tgtFileStream = File.Create(Path.Combine(corpusDir, "train.trg.txt"));
await tgtText.CopyToAsync(tgtFileStream, cancellationToken);

await using Stream srcKeyTermsText = await _sharedFileService.OpenReadAsync(
$"builds/{buildId}/train.key-terms.src.txt",
cancellationToken
);
await using FileStream srcKeyTermsFileStream = File.Create(Path.Combine(corpusDir, "train.key-terms.src.txt"));
await srcKeyTermsText.CopyToAsync(srcKeyTermsFileStream, cancellationToken);

// Download the target-side key-terms training file from shared storage into the local
// corpus directory, mirroring the source-side key-terms download just above.
await using Stream tgtKeyTermsText = await _sharedFileService.OpenReadAsync(
    $"builds/{buildId}/train.key-terms.trg.txt",
    cancellationToken
);
await using FileStream tgtKeyTermsFileStream = File.Create(Path.Combine(corpusDir, "train.key-terms.trg.txt"));
// Copy FROM the shared-storage read stream INTO the newly created local file. The previous
// code had the two streams swapped, which copied the empty local file into the read stream
// and left train.key-terms.trg.txt empty on disk.
await tgtKeyTermsText.CopyToAsync(tgtKeyTermsFileStream, cancellationToken);
}

private async Task<(int TrainCorpusSize, double Confidence)> TrainAsync(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,14 @@ CancellationToken cancellationToken
await DownloadDataAsync(buildId, corpusDir, cancellationToken);

// assemble corpus
ITextCorpus sourceCorpus = new TextFileTextCorpus(Path.Combine(corpusDir, "train.src.txt"));
ITextCorpus targetCorpus = new TextFileTextCorpus(Path.Combine(corpusDir, "train.trg.txt"));
ITextCorpus sourceCorpus = new TextFileTextCorpus(
Path.Combine(corpusDir, "train.src.txt"),
Path.Combine(corpusDir, "train.key-terms.src.txt")
);
ITextCorpus targetCorpus = new TextFileTextCorpus(
Path.Combine(corpusDir, "train.trg.txt"),
Path.Combine(corpusDir, "train.key-terms.trg.txt")
);
IParallelTextCorpus parallelCorpus = sourceCorpus.AlignRows(targetCorpus);

// train word alignment model
Expand Down Expand Up @@ -100,6 +106,20 @@ private async Task DownloadDataAsync(string buildId, string corpusDir, Cancellat
);
await using FileStream tgtFileStream = File.Create(Path.Combine(corpusDir, "train.trg.txt"));
await tgtText.CopyToAsync(tgtFileStream, cancellationToken);

await using Stream srcKeyTermsText = await _sharedFileService.OpenReadAsync(
$"builds/{buildId}/train.key-terms.src.txt",
cancellationToken
);
await using FileStream srcKeyTermsFileStream = File.Create(Path.Combine(corpusDir, "train.key-terms.src.txt"));
await srcKeyTermsText.CopyToAsync(srcKeyTermsFileStream, cancellationToken);

// Download the target-side key-terms training file from shared storage into the local
// corpus directory, mirroring the source-side key-terms download just above.
await using Stream tgtKeyTermsText = await _sharedFileService.OpenReadAsync(
    $"builds/{buildId}/train.key-terms.trg.txt",
    cancellationToken
);
await using FileStream tgtKeyTermsFileStream = File.Create(Path.Combine(corpusDir, "train.key-terms.trg.txt"));
// Copy FROM the shared-storage read stream INTO the newly created local file. The previous
// code had the two streams swapped, which copied the empty local file into the read stream
// and left train.key-terms.trg.txt empty on disk.
await tgtKeyTermsText.CopyToAsync(tgtKeyTermsFileStream, cancellationToken);
}

private async Task<int> TrainAsync(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ CancellationToken cancellationToken
await using StreamWriter targetTrainWriter =
new(await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.trg.txt", cancellationToken));

await using StreamWriter sourceKeyTermsTrainWriter =
new(await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.key-terms.src.txt", cancellationToken));
await using StreamWriter targetKeyTermsTrainWriter =
new(await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.key-terms.trg.txt", cancellationToken));

await using Stream pretranslateStream = await SharedFileService.OpenWriteAsync(
$"builds/{buildId}/pretranslate.src.json",
cancellationToken
Expand All @@ -46,12 +51,20 @@ CancellationToken cancellationToken
pretranslateWriter.WriteStartArray();
await ParallelCorpusPreprocessingService.PreprocessAsync(
corpora,
async row =>
async (row, trainingDataType) =>
{
if (row.SourceSegment.Length > 0 || row.TargetSegment.Length > 0)
{
await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n");
await targetTrainWriter.WriteAsync($"{row.TargetSegment}\n");
if (trainingDataType == TrainingDataType.KeyTerms)
{
await sourceKeyTermsTrainWriter.WriteAsync($"{row.SourceSegment}\n");
await targetKeyTermsTrainWriter.WriteAsync($"{row.TargetSegment}\n");
}
else
{
await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n");
await targetTrainWriter.WriteAsync($"{row.TargetSegment}\n");
}
}
if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
trainCount++;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ CancellationToken cancellationToken
await using StreamWriter targetTrainWriter =
new(await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.trg.txt", cancellationToken));

await using StreamWriter sourceKeyTermsTrainWriter =
new(await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.key-terms.src.txt", cancellationToken));
await using StreamWriter targetKeyTermsTrainWriter =
new(await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.key-terms.trg.txt", cancellationToken));

await using Stream wordAlignmentStream = await SharedFileService.OpenWriteAsync(
$"builds/{buildId}/word_alignments.inputs.json",
cancellationToken
Expand All @@ -46,12 +51,21 @@ CancellationToken cancellationToken
wordAlignmentWriter.WriteStartArray();
await ParallelCorpusPreprocessingService.PreprocessAsync(
corpora,
async row =>
async (row, trainingDataType) =>
{
if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
{
await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n");
await targetTrainWriter.WriteAsync($"{row.TargetSegment}\n");
if (trainingDataType == TrainingDataType.KeyTerms)
{
await sourceKeyTermsTrainWriter.WriteAsync($"{row.SourceSegment}\n");
await targetKeyTermsTrainWriter.WriteAsync($"{row.TargetSegment}\n");
}
else
{
await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n");
await targetTrainWriter.WriteAsync($"{row.TargetSegment}\n");
}

trainCount++;
}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -978,6 +978,10 @@ public async Task<string> GetTargetExtractAsync()
{
using StreamReader srcReader = new(await SharedFileService.OpenReadAsync("builds/build1/train.src.txt"));
using StreamReader trgReader = new(await SharedFileService.OpenReadAsync("builds/build1/train.trg.txt"));
using StreamReader srcTermReader =
new(await SharedFileService.OpenReadAsync("builds/build1/train.key-terms.src.txt"));
using StreamReader trgTermReader =
new(await SharedFileService.OpenReadAsync("builds/build1/train.key-terms.trg.txt"));
int src1Count = 0;
int src2Count = 0;
int trgCount = 0;
Expand All @@ -998,8 +1002,17 @@ public async Task<string> GetTargetExtractAsync()
else if (srcLine.Length == 0)
trgCount++;
else
termCount++;
throw new ArgumentException("Unexpected line in test output");
}

while (
(srcLine = await srcTermReader.ReadLineAsync()) is not null
&& (trgLine = await trgTermReader.ReadLineAsync()) is not null
)
{
termCount++;
}

return (src1Count, src2Count, trgCount, termCount);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
namespace SIL.ServiceToolkit.Models;

/// <summary>
/// Identifies the kind of training row handed to the <c>train</c> callback of
/// <c>PreprocessAsync</c>, so callers can route rows by origin (for example, writing
/// key-term rows to separate files from ordinary text rows).
/// </summary>
public enum TrainingDataType
{
/// <summary>A row taken from the corpus training rows.</summary>
Text = 0,
/// <summary>A row taken from the key-term training data.</summary>
KeyTerms = 1,
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ ParallelCorpus corpus

Task PreprocessAsync(
IReadOnlyList<ParallelCorpus> corpora,
Func<Row, Task> train,
Func<Row, TrainingDataType, Task> train,
Func<Row, bool, ParallelCorpus, Task> inference,
bool useKeyTerms = false,
HashSet<string>? ignoreUsfmMarkers = null
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ MonolingualCorpus monolingualCorpus in parallelCorpus.SourceCorpora.Concat(paral

public async Task PreprocessAsync(
IReadOnlyList<ParallelCorpus> corpora,
Func<Row, Task> train,
Func<Row, TrainingDataType, Task> train,
Func<Row, bool, ParallelCorpus, Task> inference,
bool useKeyTerms = false,
HashSet<string>? ignoreUsfmMarkers = null
Expand Down Expand Up @@ -128,7 +128,7 @@ public async Task PreprocessAsync(

foreach (Row row in CollapseRanges(trainingRows))
{
await train(row);
await train(row, TrainingDataType.Text);
if (!parallelTrainingDataPresent && row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
{
parallelTrainingDataPresent = true;
Expand Down Expand Up @@ -177,7 +177,7 @@ ParallelTextRow row in parallelKeyTermsCorpus.DistinctBy(row =>
{
foreach (Row row in keyTermTrainingData)
{
await train(row);
await train(row, TrainingDataType.KeyTerms);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ public async Task TestParallelCorpusPreprocessor_FileFormatText()
int inferenceCount = 0;
await env.Processor.PreprocessAsync(
corpora,
row =>
(row, _) =>
{
if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
trainCount++;
Expand Down Expand Up @@ -83,7 +83,7 @@ public async Task TestParallelCorpusPreprocessor_FileFormatParatext()
var inferenceRefs = new List<string>();
await env.Processor.PreprocessAsync(
corpora,
row =>
(row, _) =>
{
if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
{
Expand Down
Loading