Skip to content

Commit

Permalink
Address all warnings
Browse files Browse the repository at this point in the history
  • Loading branch information
russcam committed May 1, 2024
1 parent fb3597f commit 628db56
Show file tree
Hide file tree
Showing 18 changed files with 397 additions and 220 deletions.
4 changes: 2 additions & 2 deletions Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@
<PackageReleaseNotes>https://github.com/russcam/lingua-dotnet/releases</PackageReleaseNotes>
<PackageTags>lingua, language detection</PackageTags>
<LangVersion>latest</LangVersion>
<!-- TODO: fix all warnings and enable -->
<TreatWarningsAsErrors>false</TreatWarningsAsErrors>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<IsPackable>False</IsPackable>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<MinVerMinimumMajorMinor>1.0</MinVerMinimumMajorMinor>
<Deterministic>true</Deterministic>
</PropertyGroup>

<PropertyGroup>
Expand Down
5 changes: 4 additions & 1 deletion build/UnicodeScriptGenerator/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,10 @@ public enum UnicodeScript
Unknown,
}
public static class CharExtensions
/// <summary>
/// Extension method for <see cref=""char""/> to determine its <see cref=""UnicodeScript""/> property.
/// </summary>
public static class UnicodeScriptInfo
{
private static readonly int[] ScriptStarts =
{");
Expand Down
2 changes: 1 addition & 1 deletion src/Lingua/Api/CharExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

internal static class CharExtensions
{
private static readonly HashSet<Alphabet> AlphabetsWithLogograms = LanguageExtensions.LanguagesSupportingLogograms
private static readonly HashSet<Alphabet> AlphabetsWithLogograms = LanguageInfo.LanguagesSupportingLogograms
.SelectMany(l => l.Alphabets())
.ToHashSet();

Expand Down
25 changes: 25 additions & 0 deletions src/Lingua/Api/IO/FilesWriter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
namespace Lingua.Api.IO;

/// <summary>
/// Validation helpers for input file paths and output directory paths.
/// </summary>
internal abstract class PathValidation
{
    /// <summary>
    /// Verifies that <paramref name="inputFilePath"/> is a rooted path to an existing regular file.
    /// </summary>
    /// <param name="inputFilePath">The path of the input file to validate.</param>
    /// <exception cref="ArgumentException">
    /// The path is not rooted, or it refers to something other than a regular file.
    /// </exception>
    /// <exception cref="DirectoryNotFoundException">No file exists at the given path.</exception>
    public static void CheckInputFilePath(string inputFilePath)
    {
        if (!Path.IsPathRooted(inputFilePath))
        {
            throw new ArgumentException($"Input file path '{inputFilePath}' is not absolute");
        }

        if (!File.Exists(inputFilePath))
        {
            // NOTE(review): FileNotFoundException would describe this failure more precisely;
            // the exception type is left unchanged so existing catch blocks keep working.
            throw new DirectoryNotFoundException($"Input file '{inputFilePath}' does not exist");
        }

        // FileAttributes.Normal is only valid when it is the sole attribute, so requiring it
        // rejects ordinary files that are read-only, hidden, archived and so on.
        // Reject only entries that are not regular files instead.
        const FileAttributes NonRegularFile = FileAttributes.Directory | FileAttributes.Device;
        if ((File.GetAttributes(inputFilePath) & NonRegularFile) != 0)
        {
            throw new ArgumentException($"Input file path '{inputFilePath}' does not represent a regular file");
        }
    }

    /// <summary>
    /// Verifies that <paramref name="outputDirectoryPath"/> is a rooted path to an existing directory.
    /// </summary>
    /// <param name="outputDirectoryPath">The path of the output directory to validate.</param>
    /// <exception cref="ArgumentException">The path is not rooted.</exception>
    /// <exception cref="DirectoryNotFoundException">No directory exists at the given path.</exception>
    public static void CheckOutputDirectoryPath(string outputDirectoryPath)
    {
        if (!Path.IsPathRooted(outputDirectoryPath))
        {
            throw new ArgumentException($"Output directory path '{outputDirectoryPath}' is not absolute");
        }

        if (!Directory.Exists(outputDirectoryPath))
        {
            throw new DirectoryNotFoundException($"Output directory '{outputDirectoryPath}' does not exist");
        }
    }
}
Original file line number Diff line number Diff line change
@@ -1,23 +1,25 @@
using System.Text;
using System.Text.Json;
using Lingua.Internal;
using Lingua.Internal.IO;
using static Lingua.Api.IO.PathValidation;

namespace Lingua.Api.IO;

public class LanguageModelFilesWriter : FilesWriter
/// <summary>
/// Creates language model files and writes them to a directory.
/// </summary>
public class LanguageModelWriter
{
/// <summary>
/// Creates language model files and writes them to a directory.
/// </summary>
/// <param name="inputFilePath">The path to a txt file used for language model creation.</param>
/// <param name="inputFileCharset">The encoding of <see cref="inputFilePath"/>. Defaults to <see cref="Encoding.UTF8"/></param>
/// <param name="encoding">The encoding of <paramref name="inputFilePath"/>. Defaults to <see cref="Encoding.UTF8"/>.</param>
/// <param name="outputDirectoryPath">The directory where the language model files are to be written.</param>
/// <param name="language">The language for which to create language models.</param>
/// <param name="charClass">A regex character class as supported by <see cref="System.Text.RegularExpressions.Regex"/>.</param>
public void CreateAndWriteLanguageModelFiles(
string inputFilePath,
Encoding? inputFileCharset,
Encoding? encoding,
string outputDirectoryPath,
Language language,
string charClass = "\\p{L}"
Expand All @@ -26,42 +28,42 @@ public void CreateAndWriteLanguageModelFiles(
CheckInputFilePath(inputFilePath);
CheckOutputDirectoryPath(outputDirectoryPath);

inputFileCharset ??= Encoding.UTF8;
encoding ??= Encoding.UTF8;
var unigramModel = CreateLanguageModel(
inputFilePath,
inputFileCharset,
encoding,
language,
1,
charClass,
new Dictionary<Ngram, int>()
);
var bigramModel = CreateLanguageModel(
inputFilePath,
inputFileCharset,
encoding,
language,
2,
charClass,
unigramModel.AbsoluteFrequencies
);
var trigramModel = CreateLanguageModel(
inputFilePath,
inputFileCharset,
encoding,
language,
3,
charClass,
bigramModel.AbsoluteFrequencies
);
var quadrigramModel = CreateLanguageModel(
inputFilePath,
inputFileCharset,
encoding,
language,
4,
charClass,
trigramModel.AbsoluteFrequencies
);
var fivegramModel = CreateLanguageModel(
inputFilePath,
inputFileCharset,
encoding,
language,
5,
charClass,
Expand Down
Loading

0 comments on commit 628db56

Please sign in to comment.