From 628db56a482f5fb0d7fe2fc155c355714c734321 Mon Sep 17 00:00:00 2001 From: Russ Cam Date: Wed, 1 May 2024 13:33:07 +1000 Subject: [PATCH] Address all warnings --- Directory.Build.props | 4 +- build/UnicodeScriptGenerator/Program.cs | 5 +- src/Lingua/Api/CharExtensions.cs | 2 +- src/Lingua/Api/IO/FilesWriter.cs | 25 +++ ...lFilesWriter.cs => LanguageModelWriter.cs} | 24 +-- src/Lingua/Api/Language.cs | 171 +++++++++++++++--- src/Lingua/Api/LanguageDetector.cs | 2 + src/Lingua/Api/LanguageDetectorBuilder.cs | 58 +++++- src/Lingua/{ => Internal}/Alphabet.cs | 46 +++-- src/Lingua/Internal/FractionConverter.cs | 14 ++ src/Lingua/Internal/IO/FilesWriter.cs | 33 ---- src/Lingua/Internal/Ngram.cs | 18 +- src/Lingua/Internal/SpanSplitter.cs | 85 +++++++++ src/Lingua/Internal/TestDataLanguageModel.cs | 2 +- .../Internal/TrainingDataLanguageModel.cs | 113 +----------- src/Lingua/Lingua.csproj | 4 +- src/Lingua/UnicodeScript.g.cs | 5 +- tests/Lingua.Tests/LanguageDetectorTests.cs | 6 +- 18 files changed, 397 insertions(+), 220 deletions(-) create mode 100644 src/Lingua/Api/IO/FilesWriter.cs rename src/Lingua/Api/IO/{LanguageModelFilesWriter.cs => LanguageModelWriter.cs} (87%) rename src/Lingua/{ => Internal}/Alphabet.cs (76%) create mode 100644 src/Lingua/Internal/FractionConverter.cs delete mode 100644 src/Lingua/Internal/IO/FilesWriter.cs create mode 100644 src/Lingua/Internal/SpanSplitter.cs diff --git a/Directory.Build.props b/Directory.Build.props index cdb7dac..2b2a8e4 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -10,12 +10,12 @@ https://github.com/russcam/lingua-dotnet/releases lingua, language detection latest - - false + true False enable enable 1.0 + true diff --git a/build/UnicodeScriptGenerator/Program.cs b/build/UnicodeScriptGenerator/Program.cs index 44a57ef..7fe89ba 100644 --- a/build/UnicodeScriptGenerator/Program.cs +++ b/build/UnicodeScriptGenerator/Program.cs @@ -52,7 +52,10 @@ public enum UnicodeScript Unknown, } -public static class 
CharExtensions +/// +/// Extension method for to determine its property. +/// +public static class UnicodeScriptInfo { private static readonly int[] ScriptStarts = {"); diff --git a/src/Lingua/Api/CharExtensions.cs b/src/Lingua/Api/CharExtensions.cs index 4145061..4dc3778 100644 --- a/src/Lingua/Api/CharExtensions.cs +++ b/src/Lingua/Api/CharExtensions.cs @@ -2,7 +2,7 @@ internal static class CharExtensions { - private static readonly HashSet AlphabetsWithLogograms = LanguageExtensions.LanguagesSupportingLogograms + private static readonly HashSet AlphabetsWithLogograms = LanguageInfo.LanguagesSupportingLogograms .SelectMany(l => l.Alphabets()) .ToHashSet(); diff --git a/src/Lingua/Api/IO/FilesWriter.cs b/src/Lingua/Api/IO/FilesWriter.cs new file mode 100644 index 0000000..bf72ecb --- /dev/null +++ b/src/Lingua/Api/IO/FilesWriter.cs @@ -0,0 +1,25 @@ +namespace Lingua.Api.IO; + +internal abstract class PathValidation +{ + public static void CheckInputFilePath(string inputFilePath) { + if (!Path.IsPathRooted(inputFilePath)) + throw new ArgumentException($"Input file path '{inputFilePath}' is not absolute"); + + if (!File.Exists(inputFilePath)) + throw new DirectoryNotFoundException($"Input file '{inputFilePath}' does not exist"); + + var attributes = File.GetAttributes(inputFilePath); + if (!attributes.HasFlag(FileAttributes.Normal)) + throw new ArgumentException($"Input file path '{inputFilePath}' does not represent a regular file"); + } + + public static void CheckOutputDirectoryPath(string outputDirectoryPath) + { + if (!Path.IsPathRooted(outputDirectoryPath)) + throw new ArgumentException($"Output directory path '{outputDirectoryPath}' is not absolute"); + + if (!Directory.Exists(outputDirectoryPath)) + throw new DirectoryNotFoundException($"Output directory '{outputDirectoryPath}' does not exist"); + } +} diff --git a/src/Lingua/Api/IO/LanguageModelFilesWriter.cs b/src/Lingua/Api/IO/LanguageModelWriter.cs similarity index 87% rename from 
src/Lingua/Api/IO/LanguageModelFilesWriter.cs rename to src/Lingua/Api/IO/LanguageModelWriter.cs index 7cc378c..9ad9fd9 100644 --- a/src/Lingua/Api/IO/LanguageModelFilesWriter.cs +++ b/src/Lingua/Api/IO/LanguageModelWriter.cs @@ -1,23 +1,25 @@ using System.Text; -using System.Text.Json; using Lingua.Internal; -using Lingua.Internal.IO; +using static Lingua.Api.IO.PathValidation; namespace Lingua.Api.IO; -public class LanguageModelFilesWriter : FilesWriter +/// +/// Creates language model files and writes them to a directory. +/// +public class LanguageModelWriter { /// /// Creates language model files and writes them to a directory. /// /// The path to a txt file used for language model creation. - /// The encoding of . Defaults to + /// The encoding of . Defaults to /// The directory where the language model files are to be written. /// The language for which to create language models. /// A regex character class as supported by public void CreateAndWriteLanguageModelFiles( string inputFilePath, - Encoding? inputFileCharset, + Encoding? 
encoding, string outputDirectoryPath, Language language, string charClass = "\\p{L}" @@ -26,10 +28,10 @@ public void CreateAndWriteLanguageModelFiles( CheckInputFilePath(inputFilePath); CheckOutputDirectoryPath(outputDirectoryPath); - inputFileCharset ??= Encoding.UTF8; + encoding ??= Encoding.UTF8; var unigramModel = CreateLanguageModel( inputFilePath, - inputFileCharset, + encoding, language, 1, charClass, @@ -37,7 +39,7 @@ public void CreateAndWriteLanguageModelFiles( ); var bigramModel = CreateLanguageModel( inputFilePath, - inputFileCharset, + encoding, language, 2, charClass, @@ -45,7 +47,7 @@ public void CreateAndWriteLanguageModelFiles( ); var trigramModel = CreateLanguageModel( inputFilePath, - inputFileCharset, + encoding, language, 3, charClass, @@ -53,7 +55,7 @@ public void CreateAndWriteLanguageModelFiles( ); var quadrigramModel = CreateLanguageModel( inputFilePath, - inputFileCharset, + encoding, language, 4, charClass, @@ -61,7 +63,7 @@ public void CreateAndWriteLanguageModelFiles( ); var fivegramModel = CreateLanguageModel( inputFilePath, - inputFileCharset, + encoding, language, 5, charClass, diff --git a/src/Lingua/Api/Language.cs b/src/Lingua/Api/Language.cs index f50e4da..dd823ce 100644 --- a/src/Lingua/Api/Language.cs +++ b/src/Lingua/Api/Language.cs @@ -7,89 +7,172 @@ namespace Lingua.Api; /// public enum Language { + /// The 'Afrikaans' language Afrikaans, + /// The 'Albanian' language Albanian, + /// The 'Amharic' language Amharic, + /// The 'Arabic' language Arabic, + /// The 'Armenian' language Armenian, + /// The 'Azerbaijani' language Azerbaijani, + /// The 'Basque' language Basque, + /// The 'Belarusian' language Belarusian, + /// The 'Bengali' language Bengali, + /// The 'Bokmal' language Bokmal, + /// The 'Bosnian' language Bosnian, + /// The 'Bulgarian' language Bulgarian, + /// The 'Catalan' language Catalan, + /// The 'Chinese' language Chinese, + /// The 'Croatian' language Croatian, + /// The 'Czech' language Czech, + /// The 
'Danish' language Danish, + /// The 'Dutch' language Dutch, + /// The 'English' language English, + /// The 'Esperanto' language Esperanto, + /// The 'Estonian' language Estonian, + /// The 'Finnish' language Finnish, + /// The 'French' language French, + /// The 'Ganda' language Ganda, + /// The 'Georgian' language Georgian, + /// The 'German' language German, + /// The 'Greek' language Greek, + /// The 'Gujarati' language Gujarati, + /// The 'Hebrew' language Hebrew, + /// The 'Hindi' language Hindi, + /// The 'Hungarian' language Hungarian, + /// The 'Icelandic' language Icelandic, + /// The 'Indonesian' language Indonesian, + /// The 'Irish' language Irish, + /// The 'Italian' language Italian, + /// The 'Japanese' language Japanese, + /// The 'Kazakh' language Kazakh, + /// The 'Korean' language Korean, + /// The 'Latin' language Latin, + /// The 'Latvian' language Latvian, + /// The 'Lithuanian' language Lithuanian, + /// The 'Macedonian' language Macedonian, + /// The 'Malay' language Malay, + /// The 'Maori' language Maori, + /// The 'Marathi' language Marathi, + /// The 'Mongolian' language Mongolian, + /// The 'Nynorsk' language Nynorsk, + /// The 'Oromo' language Oromo, + /// The 'Persian' language Persian, + /// The 'Polish' language Polish, + /// The 'Portuguese' language Portuguese, + /// The 'Punjabi' language Punjabi, + /// The 'Romanian' language Romanian, + /// The 'Russian' language Russian, + /// The 'Serbian' language Serbian, + /// The 'Shona' language Shona, + /// The 'Sinhala' language Sinhala, + /// The 'Slovak' language Slovak, + /// The 'Slovene' language Slovene, + /// The 'Somali' language Somali, + /// The 'Sotho' language Sotho, + /// The 'Spanish' language Spanish, + /// The 'Swahili' language Swahili, + /// The 'Swedish' language Swedish, + /// The 'Tagalog' language Tagalog, + /// The 'Tamil' language Tamil, + /// The 'Telugu' language Telugu, + /// The 'Thai' language Thai, + /// The 'Tigrinya' language Tigrinya, + /// The 
'Tsonga' language Tsonga, + /// The 'Tswana' language Tswana, + /// The 'Turkish' language Turkish, + /// The 'Ukrainian' language Ukrainian, + /// The 'Urdu' language Urdu, + /// The 'Vietnamese' language Vietnamese, + /// The 'Welsh' language Welsh, + /// The 'Xhosa' language Xhosa, + /// The 'Yoruba' language Yoruba, + /// The 'Zulu' language Zulu, + /// The imaginary 'Unknown' language Unknown, } -public static class LanguageExtensions +/// +/// Extension methods for . +/// +public static class LanguageInfo { private static readonly Dictionary LanguageProperties = new(Enum.GetValues(typeof(Language)).Length) @@ -191,13 +274,6 @@ public static class LanguageExtensions /// The ISO639-3 code public static IsoCode6393 IsoCode6393(this Language language) => LanguageProperties[language].IsoCode6393; - /// - /// Gets the alphabets for the language - /// - /// The language - /// The alphabets - public static IReadOnlySet Alphabets(this Language language) => LanguageProperties[language].Alphabets; - /// /// Gets the unique characters for the language, or null if there are no unique characters. /// @@ -205,45 +281,88 @@ public static class LanguageExtensions /// The unique characters for the language, or null if there are no unique characters. public static string? UniqueCharacters(this Language language) => LanguageProperties[language].UniqueCharacters; - public static IList All() => LanguageProperties.Keys - .Where(l => l is not Unknown).ToList(); + /// + /// Gets the list of languages supporting logograms + /// + public static readonly IReadOnlySet LanguagesSupportingLogograms = + new HashSet{ Chinese, Japanese, Korean }; - public static IList AllSpokenOnes() => LanguageProperties.Keys - .Where(l => l is not (Unknown or Latin)).ToList(); + /// + /// Gets a set of all built-in languages. + /// + /// A set of all built-in languages. 
+ public static HashSet All() => LanguageProperties.Keys + .Where(l => l is not Unknown).ToHashSet(); - public static IList AllWithArabicScript() => LanguageProperties + /// + /// Gets a set of all built-in languages that are still spoken today. + /// + /// A set of all built-in languages that are still spoken today. + public static HashSet AllSpokenOnes() => LanguageProperties.Keys + .Where(l => l is not (Unknown or Latin)).ToHashSet(); + + /// + /// Gets a set of all built-in languages supporting the Arabic script. + /// + /// A set of all built-in languages supporting the Arabic script. + public static HashSet AllWithArabicScript() => LanguageProperties .Where(l => l.Value.Alphabets.Contains(Alphabet.Arabic)) .Select(l => l.Key) - .ToList(); + .ToHashSet(); - public static IList AllWithCyrillicScript() => LanguageProperties + /// + /// Gets a set of all built-in languages supporting the Cyrillic script. + /// + /// A set of all built-in languages supporting the Cyrillic script. + public static HashSet AllWithCyrillicScript() => LanguageProperties .Where(l => l.Value.Alphabets.Contains(Alphabet.Cyrillic)) .Select(l => l.Key) - .ToList(); + .ToHashSet(); - public static IList AllWithDevangariScript() => LanguageProperties + /// + /// Gets a set of all built-in languages supporting the Devanagari script. + /// + /// A set of all built-in languages supporting the Devanagari script. + public static HashSet AllWithDevangariScript() => LanguageProperties .Where(l => l.Value.Alphabets.Contains(Alphabet.Devanagari)) .Select(l => l.Key) - .ToList(); + .ToHashSet(); - public static IList AllWithEthiopicScript() => LanguageProperties + /// + /// Gets a set of all built-in languages supporting the Ethiopic script. + /// + /// A set of all built-in languages supporting the Ethiopic script. 
+ public static HashSet AllWithEthiopicScript() => LanguageProperties .Where(l => l.Value.Alphabets.Contains(Alphabet.Ethiopic)) .Select(l => l.Key) - .ToList(); + .ToHashSet(); - public static IList AllWithLatinScript() => LanguageProperties + /// + /// Gets a set of all built-in languages supporting the Latin script. + /// + /// A set of all built-in languages supporting the Latin script. + public static HashSet AllWithLatinScript() => LanguageProperties .Where(l => l.Value.Alphabets.Contains(Alphabet.Latin)) .Select(l => l.Key) - .ToList(); + .ToHashSet(); + /// + /// Gets a language by its code. + /// + /// The ISO 639-1 code + /// The language identified by the given ISO 639-1 code. public static Language GetByIsoCode6391(IsoCode6391 isoCode6391) => - LanguageProperties.Single(l => l.Value.IsoCode6391 == isoCode6391).Key; + LanguageProperties.First(l => l.Value.IsoCode6391 == isoCode6391).Key; + /// + /// Gets a language by its code. + /// + /// The ISO 639-3 code + /// The language identified by the given ISO 639-3 code. public static Language GetByIsoCode6393(IsoCode6393 isoCode6393) => - LanguageProperties.Single(l => l.Value.IsoCode6393 == isoCode6393).Key; + LanguageProperties.First(l => l.Value.IsoCode6393 == isoCode6393).Key; - public static readonly IReadOnlySet LanguagesSupportingLogograms = - new HashSet{ Chinese, Japanese, Korean }; + internal static IReadOnlySet Alphabets(this Language language) => LanguageProperties[language].Alphabets; } internal readonly record struct LanguageProperties(IsoCode6391 IsoCode6391, IsoCode6393 IsoCode6393, HashSet Alphabets, string? 
UniqueCharacters = null) diff --git a/src/Lingua/Api/LanguageDetector.cs b/src/Lingua/Api/LanguageDetector.cs index 6088036..0447ce1 100644 --- a/src/Lingua/Api/LanguageDetector.cs +++ b/src/Lingua/Api/LanguageDetector.cs @@ -612,6 +612,7 @@ private bool Equals(LanguageDetector other) => && _minimumRelativeDistance.Equals(other._minimumRelativeDistance) && _isLowAccuracyModeEnabled == other._isLowAccuracyModeEnabled; + /// public override bool Equals(object? obj) { if (ReferenceEquals(null, obj)) return false; @@ -620,6 +621,7 @@ public override bool Equals(object? obj) return Equals((LanguageDetector)obj); } + /// public override int GetHashCode() { var hashCode = new HashCode(); diff --git a/src/Lingua/Api/LanguageDetectorBuilder.cs b/src/Lingua/Api/LanguageDetectorBuilder.cs index b971338..8101ee5 100644 --- a/src/Lingua/Api/LanguageDetectorBuilder.cs +++ b/src/Lingua/Api/LanguageDetectorBuilder.cs @@ -12,34 +12,76 @@ public class LanguageDetectorBuilder private LanguageDetectorBuilder(HashSet languages) => _languages = languages; + /// + /// Instantiates a new instance of using all built-in languages. + /// + /// A new instance of public static LanguageDetectorBuilder FromAllLanguages() => - new(LanguageExtensions.All().ToHashSet()); + new(LanguageInfo.All()); + /// + /// Instantiates a new instance of using all built-in languages + /// that are still spoken today. + /// + /// A new instance of public static LanguageDetectorBuilder FromAllSpokenLanguages() => - new(LanguageExtensions.AllSpokenOnes().ToHashSet()); + new(LanguageInfo.AllSpokenOnes()); + /// + /// Instantiates a new instance of using all built-in languages + /// supporting Arabic script. + /// + /// A new instance of public static LanguageDetectorBuilder FromAllLanguagesWithArabicScript() => - new(LanguageExtensions.AllWithArabicScript().ToHashSet()); + new(LanguageInfo.AllWithArabicScript()); + /// + /// Instantiates a new instance of using all built-in languages + /// supporting Cyrillic script. 
+ /// + /// A new instance of public static LanguageDetectorBuilder FromAllLanguagesWithCyrillicScript() => - new(LanguageExtensions.AllWithCyrillicScript().ToHashSet()); + new(LanguageInfo.AllWithCyrillicScript()); + /// + /// Instantiates a new instance of using all built-in languages + /// supporting Devanagari script. + /// + /// A new instance of public static LanguageDetectorBuilder FromAllLanguagesWithDevangariScript() => - new(LanguageExtensions.AllWithDevangariScript().ToHashSet()); + new(LanguageInfo.AllWithDevangariScript()); + /// + /// Instantiates a new instance of using all built-in languages + /// supporting Latin script. + /// + /// A new instance of public static LanguageDetectorBuilder FromAllLanguagesWithLatinScript() => - new(LanguageExtensions.AllWithLatinScript().ToHashSet()); + new(LanguageInfo.AllWithLatinScript()); + /// + /// Instantiates a new instance of using all built-in languages + /// except the given languages. + /// + /// The languages to exclude to build a language detector. + /// A new instance of + /// If there are less than 2 languages public static LanguageDetectorBuilder FromAllLanguagesExcept(params Language[] languages) { - var languagesToLoad = Enum.GetValues().ToHashSet(); - languagesToLoad.RemoveWhere(language => language == Language.Unknown || languages.Contains(language)); + var languagesToLoad = LanguageInfo.All(); + languagesToLoad.RemoveWhere(languages.Contains); if (languagesToLoad.Count < 2) throw new ArgumentException("LanguageDetector needs at least 2 languages to choose from"); return new LanguageDetectorBuilder(languagesToLoad); } + /// + /// Instantiates a new instance of using the given languages. + /// + /// The languages to use to build a language detector. 
+ /// A new instance of + /// If there are less than 2 languages public static LanguageDetectorBuilder FromLanguages(params Language[] languages) { var languagesToLoad = languages.ToHashSet(); diff --git a/src/Lingua/Alphabet.cs b/src/Lingua/Internal/Alphabet.cs similarity index 76% rename from src/Lingua/Alphabet.cs rename to src/Lingua/Internal/Alphabet.cs index 21cf17b..8c6f1b9 100644 --- a/src/Lingua/Alphabet.cs +++ b/src/Lingua/Internal/Alphabet.cs @@ -2,32 +2,53 @@ namespace Lingua; -public enum Alphabet +internal enum Alphabet { + /// The 'Arabic' alphabet Arabic, + /// The 'Armenian' alphabet Armenian, + /// The 'Bengali' alphabet Bengali, + /// The 'Cyrillic' alphabet Cyrillic, + /// The 'Devanagari' alphabet Devanagari, + /// The 'Ethiopic' alphabet Ethiopic, + /// The 'Georgian' alphabet Georgian, + /// The 'Greek' alphabet Greek, + /// The 'Gujarati' alphabet Gujarati, + /// The 'Gurmukhi' alphabet Gurmukhi, + /// The 'Han' alphabet Han, + /// The 'Hangul' alphabet Hangul, + /// The 'Hebrew' alphabet Hebrew, + /// The 'Hiragana' alphabet Hiragana, + /// The 'Katakana' alphabet Katakana, + /// The 'Latin' alphabet Latin, + /// The 'Sinhala' alphabet Sinhala, + /// The 'Tamil' alphabet Tamil, + /// The 'Telugu' alphabet Telugu, + /// The 'Thai' alphabet Thai, + /// The imaginary 'None' alphabet None, } -public static class AlphabetExtensions +internal static class AlphabetExtensions { internal static readonly Alphabet[] Values = Enum.GetValues(); @@ -92,18 +113,6 @@ public static bool Matches(this Alphabet alphabet, string text) return text.All(ch => ch.GetScript() == unicodeScript); } - private static HashSet SupportedLanguages(this Alphabet alphabet) - { - var languages = new HashSet(); - foreach (var language in Enum.GetValues()) - { - if (language.Alphabets().Contains(alphabet)) - languages.Add(language); - } - - return languages; - } - /// /// Gets the alphabets that support exactly one language. 
/// @@ -121,9 +130,16 @@ public static IReadOnlyDictionary AllSupportingExactlyOneLan var supportedLanguages = alphabet.SupportedLanguages(); if (supportedLanguages.Count == 1) - alphabets[alphabet] = supportedLanguages.Single(); + alphabets[alphabet] = supportedLanguages.First(); } return alphabets; } + + private static HashSet SupportedLanguages(this Alphabet alphabet) + { + var languages = LanguageInfo.All(); + languages.RemoveWhere(language => !language.Alphabets().Contains(alphabet)); + return languages; + } } diff --git a/src/Lingua/Internal/FractionConverter.cs b/src/Lingua/Internal/FractionConverter.cs new file mode 100644 index 0000000..89f0ce9 --- /dev/null +++ b/src/Lingua/Internal/FractionConverter.cs @@ -0,0 +1,14 @@ +using System.Text.Json; +using System.Text.Json.Serialization; +using Fractions; + +namespace Lingua.Internal; + +internal class FractionConverter : JsonConverter +{ + public override Fraction Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) => + Fraction.FromString(reader.GetString()!); + + public override void Write(Utf8JsonWriter writer, Fraction value, JsonSerializerOptions options) => + writer.WriteStringValue(value.ToString()); +} diff --git a/src/Lingua/Internal/IO/FilesWriter.cs b/src/Lingua/Internal/IO/FilesWriter.cs deleted file mode 100644 index 6e1c385..0000000 --- a/src/Lingua/Internal/IO/FilesWriter.cs +++ /dev/null @@ -1,33 +0,0 @@ -namespace Lingua.Internal.IO; - -public abstract class FilesWriter -{ - protected void CheckInputFilePath(string inputFilePath) { - if (!Path.IsPathRooted(inputFilePath)) - { - throw new ArgumentException("Input file path '$inputFilePath' is not absolute"); - } - if (!File.Exists(inputFilePath)) - { - throw new DirectoryNotFoundException($"Input file '{inputFilePath}' does not exist"); - } - - var attributes = File.GetAttributes(inputFilePath); - if (!attributes.HasFlag(FileAttributes.Normal)) - { - throw new ArgumentException($"Input file path '{inputFilePath}' 
does not represent a regular file"); - } - } - - protected void CheckOutputDirectoryPath(string outputDirectoryPath) - { - if (!Path.IsPathRooted(outputDirectoryPath)) - { - throw new ArgumentException("Output directory path '$outputDirectoryPath' is not absolute"); - } - if (!Directory.Exists(outputDirectoryPath)) - { - throw new DirectoryNotFoundException("Output directory '$outputDirectoryPath' does not exist"); - } - } -} \ No newline at end of file diff --git a/src/Lingua/Internal/Ngram.cs b/src/Lingua/Internal/Ngram.cs index 1eed473..afa0af6 100644 --- a/src/Lingua/Internal/Ngram.cs +++ b/src/Lingua/Internal/Ngram.cs @@ -5,7 +5,7 @@ namespace Lingua.Internal; /// /// A connected string of N items from a sample of text. /// -public readonly struct Ngram : IEquatable +internal readonly struct Ngram : IEquatable { private readonly string _value; @@ -62,6 +62,10 @@ public Ngram Dec() => /// public override int GetHashCode() => _value.GetHashCode(); + /// + /// Enumerates this ngram, producing ngrams of lower order down to unigram. + /// + /// A new instance of . public IEnumerable RangeOfLowerOrderNGrams() => new NgramEnumerable(this, new Ngram(_value[0].ToString())); @@ -77,10 +81,17 @@ internal static string GetNgramNameByLength(int ngramLength) => }; } -public struct NgramEnumerable : IEnumerable +/// +internal readonly struct NgramEnumerable : IEnumerable { private readonly Ngram _start; + /// + /// Initializes a new instance of + /// + /// The start ngram + /// The end ngram + /// If the start ngram is not of higher order than end ngram public NgramEnumerable(Ngram start, Ngram endInclusive) { if (endInclusive.Length > start.Length) @@ -95,7 +106,8 @@ public NgramEnumerable(Ngram start, Ngram endInclusive) IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); } -public struct NgramEnumerator : IEnumerator +/// +internal struct NgramEnumerator : IEnumerator { private readonly Ngram _start; private Ngram? 
_current; diff --git a/src/Lingua/Internal/SpanSplitter.cs b/src/Lingua/Internal/SpanSplitter.cs new file mode 100644 index 0000000..5b89e14 --- /dev/null +++ b/src/Lingua/Internal/SpanSplitter.cs @@ -0,0 +1,85 @@ +using System.Runtime.CompilerServices; + +namespace Lingua.Internal; + +internal readonly ref struct SpanSplitter + where T : IEquatable +{ + private readonly ReadOnlySpan _source; + private readonly ReadOnlySpan _separator; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public SpanSplitter(ReadOnlySpan source, ReadOnlySpan separator) + { + if (separator.Length == 0) + throw new ArgumentException("Requires non-empty value", nameof(separator)); + + _source = source; + _separator = separator; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public SpanSplitEnumerator GetEnumerator() => new(_source, _separator); +} + +internal ref struct SpanSplitEnumerator + where T : IEquatable +{ + private int _nextStartIndex = 0; + private readonly ReadOnlySpan _separator; + private readonly ReadOnlySpan _source; + private SpanSplitValue _current; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public SpanSplitEnumerator(ReadOnlySpan source, ReadOnlySpan separator) + { + if (separator.Length == 0) + throw new ArgumentException("Requires non-empty value", nameof(separator)); + + _source = source; + _separator = separator; + } + + public bool MoveNext() + { + if (_nextStartIndex > _source.Length) + return false; + + var nextSource = _source[_nextStartIndex..]; + var foundIndex = nextSource.IndexOf(_separator); + var length = -1 < foundIndex + ? 
foundIndex + : nextSource.Length; + + _current = new SpanSplitValue { StartIndex = _nextStartIndex, Length = length, Source = _source, }; + _nextStartIndex += _separator.Length + _current.Length; + + return true; + } + + public SpanSplitValue Current + { + [MethodImpl( MethodImplOptions.AggressiveInlining )] + get => _current; + } + + public readonly ref struct SpanSplitValue + { + public int StartIndex { get; init; } + public int Length { get; init; } + public ReadOnlySpan Source { get; init; } + + public ReadOnlySpan AsSpan() => Source.Slice(StartIndex, Length); + + public static implicit operator ReadOnlySpan(SpanSplitValue value) + => value.AsSpan(); + } +} + +internal static class ExtensionMethods +{ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static SpanSplitter Split(this ReadOnlySpan source, ReadOnlySpan separator) + where T : IEquatable => + new(source, separator); +} diff --git a/src/Lingua/Internal/TestDataLanguageModel.cs b/src/Lingua/Internal/TestDataLanguageModel.cs index f8ac314..fd3d539 100644 --- a/src/Lingua/Internal/TestDataLanguageModel.cs +++ b/src/Lingua/Internal/TestDataLanguageModel.cs @@ -2,7 +2,7 @@ namespace Lingua.Internal; -public partial record struct TestDataLanguageModel(HashSet Ngrams) +internal readonly partial record struct TestDataLanguageModel(HashSet Ngrams) { [GeneratedRegex("^\\p{L}+$")] private static partial Regex LetterRegex(); diff --git a/src/Lingua/Internal/TrainingDataLanguageModel.cs b/src/Lingua/Internal/TrainingDataLanguageModel.cs index d2c6cae..c37d11e 100644 --- a/src/Lingua/Internal/TrainingDataLanguageModel.cs +++ b/src/Lingua/Internal/TrainingDataLanguageModel.cs @@ -1,13 +1,12 @@ using System.Runtime.CompilerServices; using System.Text.Json; -using System.Text.Json.Serialization; using System.Text.RegularExpressions; using Fractions; using Lingua.Api; namespace Lingua.Internal; -public class TrainingDataLanguageModel +internal class TrainingDataLanguageModel { private record 
JsonLanguageModel(Language language, Dictionary ngrams); @@ -172,113 +171,3 @@ private static Dictionary ComputeAbsoluteFrequencies(IEnumerable -{ - public override Fraction Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) => - Fraction.FromString(reader.GetString()!); - - public override void Write(Utf8JsonWriter writer, Fraction value, JsonSerializerOptions options) => - writer.WriteStringValue(value.ToString()); -} - -public readonly ref struct SpanSplitter - where T : IEquatable -{ - private readonly ReadOnlySpan _source; - private readonly ReadOnlySpan _separator; - - [MethodImpl( MethodImplOptions.AggressiveInlining )] - public SpanSplitter( ReadOnlySpan source, ReadOnlySpan separator ) - { - if( 0 == separator.Length ) - { - throw new ArgumentException( "Requires non-empty value", nameof( separator ) ); - } - - _source = source; - _separator = separator; - } - - [MethodImpl( MethodImplOptions.AggressiveInlining )] - public SpanSplitEnumerator GetEnumerator() => new( _source, _separator ); -} - -public ref struct SpanSplitEnumerator - where T : IEquatable -{ - private int _nextStartIndex = 0; - private readonly ReadOnlySpan _separator; - private readonly ReadOnlySpan _source; - private SpanSplitValue _current; - - [MethodImpl( MethodImplOptions.AggressiveInlining )] - public SpanSplitEnumerator( ReadOnlySpan source, ReadOnlySpan separator ) - { - _source = source; - _separator = separator; - - if( 0 == separator.Length ) - { - throw new ArgumentException( "Requires non-empty value", nameof( separator ) ); - } - } - - public bool MoveNext() - { - if( _nextStartIndex > _source.Length ) - { - return false; - } - - var nextSource = _source.Slice( _nextStartIndex ); - - var foundIndex = nextSource.IndexOf( _separator ); - - var length = -1 < foundIndex - ? 
foundIndex - : nextSource.Length; - - _current = new SpanSplitValue - { - StartIndex = _nextStartIndex, - Length = length, - Source = _source, - }; - - _nextStartIndex += _separator.Length + _current.Length; - - return true; - } - - public SpanSplitValue Current - { - [MethodImpl( MethodImplOptions.AggressiveInlining )] - get => _current; - } - - public readonly ref struct SpanSplitValue - { - public int StartIndex { get; init; } - public int Length { get; init; } - public ReadOnlySpan Source { get; init; } - - public ReadOnlySpan AsSpan() => Source.Slice( StartIndex, Length ); - - public static implicit operator ReadOnlySpan( SpanSplitValue value ) - => value.AsSpan(); - } -} - - -public static class ExtensionMethods -{ - [MethodImpl( MethodImplOptions.AggressiveInlining )] - public static SpanSplitter Split( this ReadOnlySpan source, ReadOnlySpan separator ) - where T : IEquatable => - new( source, separator ); - - [MethodImpl( MethodImplOptions.AggressiveInlining )] - public static SpanSplitter Split( this Span source, ReadOnlySpan separator ) - where T : IEquatable => - new( source, separator ); -} diff --git a/src/Lingua/Lingua.csproj b/src/Lingua/Lingua.csproj index 8ac82dd..b38bc40 100644 --- a/src/Lingua/Lingua.csproj +++ b/src/Lingua/Lingua.csproj @@ -21,9 +21,7 @@ - - true - + diff --git a/src/Lingua/UnicodeScript.g.cs b/src/Lingua/UnicodeScript.g.cs index 4328421..bf93e0e 100644 --- a/src/Lingua/UnicodeScript.g.cs +++ b/src/Lingua/UnicodeScript.g.cs @@ -336,7 +336,10 @@ public enum UnicodeScript Unknown, } -public static class CharExtensions +/// +/// Extension method for to determine its property. 
+/// +public static class UnicodeScriptInfo { private static readonly int[] ScriptStarts = { diff --git a/tests/Lingua.Tests/LanguageDetectorTests.cs b/tests/Lingua.Tests/LanguageDetectorTests.cs index d821d52..4a36a3c 100644 --- a/tests/Lingua.Tests/LanguageDetectorTests.cs +++ b/tests/Lingua.Tests/LanguageDetectorTests.cs @@ -107,7 +107,7 @@ public class LanguageDetectorTests : IDisposable false); private readonly LanguageDetector _detectorForAllLanguages = new( - LanguageExtensions.All().ToHashSet(), + LanguageInfo.All().ToHashSet(), 0, false, false); @@ -224,7 +224,7 @@ public static IEnumerable NgramProbabilityProvider() [Theory] [MemberData(nameof(NgramProbabilityProvider))] - public void SumOfNgramProbabilitiesComputedCorrectly(HashSet ngrams, float expectedSumOfProbabilities) => + internal void SumOfNgramProbabilitiesComputedCorrectly(HashSet ngrams, float expectedSumOfProbabilities) => LanguageDetector.ComputeSumOfNgramProbabilities(English, ngrams) .Should() .Be(expectedSumOfProbabilities); @@ -264,7 +264,7 @@ public static IEnumerable LanguageProbabilitiesProvider() [Theory] [MemberData(nameof(LanguageProbabilitiesProvider))] - public void LanguageProbabilitiesComputedCorrectly(TestDataLanguageModel model, + internal void LanguageProbabilitiesComputedCorrectly(TestDataLanguageModel model, Dictionary expectedProbabilities) => LanguageDetector.ComputeLanguageProbabilities(model, _detectorForEnglishAndGerman.Languages) .Should().BeEquivalentTo(expectedProbabilities);