Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/Machine/src/Serval.Machine.Shared/Models/Build.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@ public record Build
public required BuildJobRunnerType BuildJobRunner { get; init; }
public required BuildStage Stage { get; init; }
public string? Options { get; set; }
public IReadOnlyDictionary<string, string> ExecutionData { get; init; } = new Dictionary<string, string>();
public IReadOnlyDictionary<string, object> ExecutionData { get; init; } = new Dictionary<string, object>();
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@
<PackageReference Include="Hangfire.Mongo" Version="1.11.6" />
<PackageReference Include="Microsoft.AspNetCore.Mvc.NewtonsoftJson" Version="8.0.8" />
<PackageReference Include="Microsoft.Extensions.Http.Polly" Version="8.0.8" />
<PackageReference Include="SIL.Machine" Version="3.7.7" Condition="!Exists('..\..\..\..\..\machine\src\SIL.Machine\SIL.Machine.csproj')" />
<PackageReference Include="SIL.Machine.Morphology.HermitCrab" Version="3.7.7" Condition="!Exists('..\..\..\..\..\machine\src\SIL.Machine.Morphology.HermitCrab\SIL.Machine.Morphology.HermitCrab.csproj')" />
<PackageReference Include="SIL.Machine.Translation.Thot" Version="3.7.7" Condition="!Exists('..\..\..\..\..\machine\src\SIL.Machine.Translation.Thot\SIL.Machine.Translation.Thot.csproj')" />
<PackageReference Include="SIL.Machine" Version="3.7.8" Condition="!Exists('..\..\..\..\..\machine\src\SIL.Machine\SIL.Machine.csproj')" />
<PackageReference Include="SIL.Machine.Morphology.HermitCrab" Version="3.7.8" Condition="!Exists('..\..\..\..\..\machine\src\SIL.Machine.Morphology.HermitCrab\SIL.Machine.Morphology.HermitCrab.csproj')" />
<PackageReference Include="SIL.Machine.Translation.Thot" Version="3.7.8" Condition="!Exists('..\..\..\..\..\machine\src\SIL.Machine.Translation.Thot\SIL.Machine.Translation.Thot.csproj')" />
<PackageReference Include="SIL.WritingSystems" Version="14.1.1" />
<PackageReference Include="System.Linq.Async" Version="6.0.1" />
<PackageReference Include="YamlDotNet" Version="11.2.1" />
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
namespace Serval.Machine.Shared.Services;

/// <summary>
/// Describes how much of a language tag is covered by the Flores-200 lists.
/// </summary>
public enum Flores200Support
{
    /// <summary>
    /// The script is in the Flores-200 script list and the combined
    /// language/script code is in the Flores-200 language list.
    /// </summary>
    LanguageAndScript = 0,

    /// <summary>
    /// The script is in the Flores-200 script list, but the combined
    /// language/script code is not in the Flores-200 language list.
    /// </summary>
    OnlyScript = 1,

    /// <summary>
    /// The script is not in the Flores-200 script list.
    /// </summary>
    None = 2
}

/// <summary>
/// Converts language tags to Flores-200 codes.
/// </summary>
public interface ILanguageTagService
{
    /// <summary>
    /// Converts a language tag to a Flores-200 code.
    /// </summary>
    /// <param name="languageTag">The language tag to convert.</param>
    /// <param name="flores200Code">Receives the converted Flores-200 code.</param>
    /// <returns>
    /// A <see cref="Flores200Support"/> value telling whether the language and script,
    /// only the script, or neither is in the Flores-200 lists.
    /// </returns>
    // The former bool-returning declaration is dropped: two members may not differ
    // only by return type, so it cannot coexist with this one.
    Flores200Support ConvertToFlores200Code(string languageTag, out string flores200Code);
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ Task InsertInferenceResultsAsync(
/// <summary>
/// Updates the execution data of a build.
/// </summary>
/// <param name="engineId">The id of the engine that owns the build.</param>
/// <param name="buildId">The id of the build to update.</param>
/// <param name="executionData">
/// The execution data entries keyed by name. Values are typed as <see cref="object"/>
/// so that non-string values can be passed.
/// </param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
// The stale string-valued "executionData" parameter is removed; a parameter name
// may only be declared once per signature.
Task UpdateBuildExecutionDataAsync(
    string engineId,
    string buildId,
    IReadOnlyDictionary<string, object> executionData,
    CancellationToken cancellationToken = default
);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@

public class LanguageTagService : ILanguageTagService
{
private readonly Dictionary<string, string> _flores200Languages = InitializeFlores200Languages();
private readonly HashSet<string> _flores200Languages = [];
private readonly HashSet<string> _flores200Scripts = [];
private readonly LanguageTagParser _parser = new();

private static Dictionary<string, string> InitializeFlores200Languages()
/// <summary>
/// Creates the service and fills the Flores-200 language and script sets
/// by calling <see cref="InitializeFlores200Languages"/>.
/// </summary>
public LanguageTagService() => InitializeFlores200Languages();

private void InitializeFlores200Languages()
{
Dictionary<string, string> flores200Languages = [];
using var floresStream = Assembly
.GetExecutingAssembly()
.GetManifestResourceStream("Serval.Machine.Shared.data.flores200languages.csv");
Expand All @@ -21,23 +26,31 @@ private static Dictionary<string, string> InitializeFlores200Languages()
if (line is null)
continue;
string[] values = line.Split(',');
flores200Languages[values[1].Trim()] = values[0].Trim();
_flores200Languages.Add(values[1].Trim());
_flores200Scripts.Add(values[1].Trim().Split('_')[1]);
}
return flores200Languages;
}

/**
* Converts a language tag to a Flores 200 code
* @param {string} languageTag - The language tag to convert
* @param out {string} flores200Code - The converted Flores 200 code
* @returns {bool} is the language is the Flores 200 list
*/
public bool ConvertToFlores200Code(string languageTag, out string flores200Code)
/// <summary>
/// Converts a language tag to a Flores-200 code and reports how much of it is in the Flores-200 lists.
/// </summary>
/// <param name="languageTag">The language tag to convert.</param>
/// <param name="flores200Code">
/// Receives <c>{languageCode}_{scriptCode}</c> when the tag parses; otherwise receives
/// <paramref name="languageTag"/> unchanged.
/// </param>
/// <returns>
/// <see cref="Flores200Support.LanguageAndScript"/> when both the script and the resulting code are known,
/// <see cref="Flores200Support.OnlyScript"/> when only the script is known, and
/// <see cref="Flores200Support.None"/> when the script is not known.
/// </returns>
public Flores200Support ConvertToFlores200Code(string languageTag, out string flores200Code)
{
    bool parsed = _parser.TryParse(languageTag, out string? languageCode, out string? scriptCode);
    flores200Code = parsed ? $"{languageCode}_{scriptCode}" : languageTag;

    // A missing script is looked up as the empty string, exactly as before.
    // The stale "return _flores200Languages.ContainsKey(...)" that preceded this check is removed:
    // it bypassed the script check and HashSet has no ContainsKey method.
    if (!_flores200Scripts.Contains(scriptCode ?? ""))
    {
        return Flores200Support.None;
    }

    return _flores200Languages.Contains(flores200Code)
        ? Flores200Support.LanguageAndScript
        : Flores200Support.OnlyScript;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,8 @@ public int GetQueueSize()

/// <summary>
/// Tells whether the language tag service reports both the language and the script
/// of <paramref name="language"/> as being in the Flores-200 lists.
/// </summary>
/// <param name="language">The language tag to check.</param>
/// <param name="internalCode">Receives the code produced by the language tag service.</param>
/// <returns>
/// <c>true</c> only when the service returns <see cref="Flores200Support.LanguageAndScript"/>.
/// </returns>
public bool IsLanguageNativeToModel(string language, out string internalCode)
{
    // The service now returns a three-state enum; the stale statement that returned
    // its result directly is removed so the comparison below is actually reached.
    Flores200Support support = _languageTagService.ConvertToFlores200Code(language, out internalCode);
    return support == Flores200Support.LanguageAndScript;
}

private async Task<string?> CancelBuildJobAsync(string engineId, CancellationToken cancellationToken)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
{
private readonly ILanguageTagService _languageTagService = languageTagService;

protected override bool ResolveLanguageCodeForBaseModel(string languageCode, out string resolvedCode)
/// <summary>
/// Resolves a language code through the language tag service and tells whether both
/// its language and script are in the Flores-200 lists.
/// </summary>
/// <param name="languageCode">The language code to resolve.</param>
/// <param name="resolvedCode">Receives the code produced by the language tag service.</param>
/// <returns>
/// <c>true</c> only when the service returns <see cref="Flores200Support.LanguageAndScript"/>.
/// </returns>
private bool ResolveLanguageCode(string languageCode, out string resolvedCode)
{
    // The stale statement that returned the enum directly from this bool method is removed.
    Flores200Support support = _languageTagService.ConvertToFlores200Code(languageCode, out resolvedCode);
    return support == Flores200Support.LanguageAndScript;
}

protected override async Task UpdateParallelCorpusAnalysisAsync(
Expand Down Expand Up @@ -56,4 +57,97 @@ await PlatformService.UpdateParallelCorpusAnalysisAsync(
cancellationToken
);
}

/// <summary>
/// Validates the language codes against the training data size, logs a JSON summary
/// of the preprocessing step, and sends the build execution data to the platform service.
/// </summary>
/// <param name="engineId">The id of the engine being built.</param>
/// <param name="buildId">The id of the build.</param>
/// <param name="trainCount">The number of training rows.</param>
/// <param name="pretranslateCount">The number of pretranslation rows.</param>
/// <param name="sourceLanguageTag">The engine's source language tag.</param>
/// <param name="targetLanguageTag">The engine's target language tag.</param>
/// <param name="corpora">The parallel corpora, passed on to <c>GetWarnings</c>.</param>
/// <param name="cancellationToken">Token forwarded to the platform service call.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when there is no training data and at least one language code does not resolve
/// with full language and script support.
/// </exception>
protected override async Task UpdateBuildExecutionData(
    string engineId,
    string buildId,
    int trainCount,
    int pretranslateCount,
    string sourceLanguageTag,
    string targetLanguageTag,
    IReadOnlyList<ParallelCorpus> corpora,
    CancellationToken cancellationToken
)
{
    bool sourceIsNative = ResolveLanguageCode(sourceLanguageTag, out string sourceCode);
    bool targetIsNative = ResolveLanguageCode(targetLanguageTag, out string targetCode);
    bool bothLanguagesNative = sourceIsNative && targetIsNative;

    // Without training rows, an unresolved language code is treated as a fatal build error.
    if (trainCount == 0 && !bothLanguagesNative)
    {
        throw new InvalidOperationException(
            $"At least one language code in build {buildId} is unknown to the base model, and the data specified for training was empty. Build canceled."
        );
    }

    IReadOnlyList<string> buildWarnings = GetWarnings(
        trainCount,
        pretranslateCount,
        sourceLanguageTag,
        targetLanguageTag,
        corpora
    );

    // Emit the build data summary as a single JSON log entry.
    var warningNodes = new JsonArray(buildWarnings.Select(text => JsonValue.Create(text)).ToArray());
    var summary = new JsonObject
    {
        { "Event", "BuildPreprocess" },
        { "EngineId", engineId },
        { "BuildId", buildId },
        { "NumTrainRows", trainCount },
        { "NumPretranslateRows", pretranslateCount },
        { "EngineSourceLanguageTag", sourceLanguageTag },
        { "EngineTargetLanguageTag", targetLanguageTag },
        { "SourceLanguageResolved", sourceCode },
        { "TargetLanguageResolved", targetCode },
        { "Warnings", warningNodes }
    };
    Logger.LogInformation("{summary}", summary.ToJsonString());

    // The same figures are passed to the platform service as the build's execution data.
    Dictionary<string, object> executionData =
        new()
        {
            ["trainCount"] = trainCount,
            ["pretranslateCount"] = pretranslateCount,
            ["warnings"] = buildWarnings,
            ["engineSourceLanguageTag"] = sourceLanguageTag,
            ["engineTargetLanguageTag"] = targetLanguageTag,
            ["resolvedSourceLanguage"] = sourceCode,
            ["resolvedTargetLanguage"] = targetCode,
        };
    await PlatformService.UpdateBuildExecutionDataAsync(engineId, buildId, executionData, cancellationToken);
}

/// <summary>
/// Collects the base class warnings and appends warnings about a small training set
/// and about source/target scripts that are not in Flores-200.
/// </summary>
/// <param name="trainCount">The number of segments selected for training.</param>
/// <param name="inferenceCount">The number of inference rows; only forwarded to the base implementation.</param>
/// <param name="sourceLanguageTag">The source language tag.</param>
/// <param name="targetLanguageTag">The target language tag.</param>
/// <param name="corpora">The parallel corpora; only forwarded to the base implementation.</param>
/// <returns>The combined list of warning messages.</returns>
protected override IReadOnlyList<string> GetWarnings(
    int trainCount,
    int inferenceCount,
    string sourceLanguageTag,
    string targetLanguageTag,
    IReadOnlyList<ParallelCorpus> corpora
)
{
    var messages = new List<string>(
        base.GetWarnings(trainCount, inferenceCount, sourceLanguageTag, targetLanguageTag, corpora)
    );

    // Warn below roughly a Gospel of Mark's worth of data. Zero rows is a special case
    // that is caught elsewhere, so it is excluded here.
    bool hasSparseTrainingData = trainCount != 0 && trainCount < 600;
    if (hasSparseTrainingData)
    {
        messages.Add($"Only {trainCount} segments were selected for training.");
    }

    Flores200Support sourceSupport = _languageTagService.ConvertToFlores200Code(
        sourceLanguageTag,
        out string sourceCode
    );
    if (sourceSupport == Flores200Support.None)
    {
        messages.Add($"The script for the source language '{sourceCode}' is not in Flores-200");
    }

    Flores200Support targetSupport = _languageTagService.ConvertToFlores200Code(
        targetLanguageTag,
        out string targetCode
    );
    if (targetSupport == Flores200Support.None)
    {
        messages.Add($"The script for the target language '{targetCode}' is not in Flores-200");
    }

    return messages;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,6 @@ CancellationToken cancellationToken
if (engine is null)
throw new OperationCanceledException($"Engine {engineId} does not exist. Build canceled.");

bool sourceTagInBaseModel = ResolveLanguageCodeForBaseModel(engine.SourceLanguage, out string srcLang);
bool targetTagInBaseModel = ResolveLanguageCodeForBaseModel(engine.TargetLanguage, out string trgLang);

(int trainCount, int inferenceCount) = await WriteDataFilesAsync(
buildId,
data,
Expand All @@ -56,20 +53,14 @@ await UpdateBuildExecutionData(
buildId,
trainCount,
inferenceCount,
srcLang,
trgLang,
engine.SourceLanguage,
engine.TargetLanguage,
data,
cancellationToken
);

await UpdateParallelCorpusAnalysisAsync(engineId, buildId, data, cancellationToken);

if (trainCount == 0 && (!sourceTagInBaseModel || !targetTagInBaseModel))
{
throw new InvalidOperationException(
$"At least one language code in build {buildId} is unknown to the base model, and the data specified for training was empty. Build canceled."
);
}

if (inferenceCount == 0 && engine is TranslationEngine { IsModelPersisted: false })
{
throw new InvalidOperationException(
Expand Down Expand Up @@ -97,8 +88,9 @@ protected abstract Task UpdateBuildExecutionData(
string buildId,
int trainCount,
int inferenceCount,
string srcLang,
string trgLang,
string sourceLanguageTag,
string targetLanguageTag,
IReadOnlyList<ParallelCorpus> corpora,
CancellationToken cancellationToken
);

Expand Down Expand Up @@ -136,9 +128,31 @@ JobCompletionStatus completionStatus
}
}

protected virtual bool ResolveLanguageCodeForBaseModel(string languageCode, out string resolvedCode)
/// <summary>
/// Builds the list of warnings for a build. This base implementation reports every USFM
/// versification error found in the given parallel corpora.
/// </summary>
/// <param name="trainCount">The number of training rows; not used by this implementation.</param>
/// <param name="inferenceCount">The number of inference rows; not used by this implementation.</param>
/// <param name="sourceLanguageTag">The source language tag; not used by this implementation.</param>
/// <param name="targetLanguageTag">The target language tag; not used by this implementation.</param>
/// <param name="corpora">The parallel corpora to analyze.</param>
/// <returns>One warning message per versification error, in the order the errors were reported.</returns>
protected virtual IReadOnlyList<string> GetWarnings(
    int trainCount,
    int inferenceCount,
    string sourceLanguageTag,
    string targetLanguageTag,
    IReadOnlyList<ParallelCorpus> corpora
)
{
    // The stale "resolvedCode = languageCode; return true;" statements are removed: they
    // referenced an undeclared name and returned before any warning was collected.
    List<string> warnings = [];

    foreach (ParallelCorpus parallelCorpus in corpora)
    {
        IReadOnlyList<(string MonolingualCorpusId, IReadOnlyList<UsfmVersificationError> errors)> errorsPerCorpus =
            ParallelCorpusPreprocessingService.AnalyzeUsfmVersification(parallelCorpus);

        foreach ((string monolingualCorpusId, IReadOnlyList<UsfmVersificationError> errors) in errorsPerCorpus)
        {
            foreach (UsfmVersificationError error in errors)
            {
                warnings.Add(
                    $"USFM does not match project versification for parallel corpus {parallelCorpus.Id}, monolingual corpus {monolingualCorpusId}: Expected verse {error.ExpectedVerseRef}, Actual verse {error.ActualVerseRef}, Mismatch type {error.Type}"
                );
            }
        }
    }

    return warnings;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -149,12 +149,36 @@ await _outboxService.EnqueueMessageAsync(
public async Task UpdateBuildExecutionDataAsync(
string engineId,
string buildId,
IReadOnlyDictionary<string, string> executionData,
IReadOnlyDictionary<string, object> executionData,
CancellationToken cancellationToken = default
)
{
var request = new UpdateBuildExecutionDataRequest { EngineId = engineId, BuildId = buildId };
request.ExecutionData.Add((IDictionary<string, string>)executionData);
var request = new UpdateBuildExecutionDataRequest
{
EngineId = engineId,
BuildId = buildId,
ExecutionData = new Google.Protobuf.WellKnownTypes.Struct()
};
foreach (KeyValuePair<string, object> kvp in executionData)
{
var value = new Google.Protobuf.WellKnownTypes.Value();
if (kvp.Value is string stringValue)
{
value.StringValue = stringValue;
}
else if (kvp.Value is int numberValue)
{
value.NumberValue = numberValue;
}
else if (kvp.Value is List<string> listValue)
{
value.ListValue = new Google.Protobuf.WellKnownTypes.ListValue();
value.ListValue.Values.AddRange(
listValue.Select(s => new Google.Protobuf.WellKnownTypes.Value() { StringValue = s })
);
}
request.ExecutionData.Fields.Add(kvp.Key, value);
}
await _outboxService.EnqueueMessageAsync(
outboxId: ServalTranslationPlatformOutboxConstants.OutboxId,
method: ServalTranslationPlatformOutboxConstants.UpdateBuildExecutionData,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ await _outboxService.EnqueueMessageAsync(
public async Task UpdateBuildExecutionDataAsync(
string engineId,
string buildId,
IReadOnlyDictionary<string, string> executionData,
IReadOnlyDictionary<string, object> executionData,
CancellationToken cancellationToken = default
)
{
Expand Down
Loading
Loading