From f5aebea259fb7cd8a268c1395ea8b47ece8b155c Mon Sep 17 00:00:00 2001
From: Russ Cam
Date: Wed, 1 May 2024 12:16:15 +1000
Subject: [PATCH] Gzip embedded resources

Reduce assembly size from 200MB -> 83MB
---
Review notes (free-form area; not part of the commit message):

 * LanguageDetector.LoadLanguageModel now looks up the embedded resource
   "Lingua.LanguageModels.<iso>.<ngram>s.json.gz", returns an empty
   dictionary when the resource stream is null, and otherwise wraps the
   stream in a decompressing GZipStream before handing it to
   TrainingDataLanguageModel.FromJson.
 * The inline build task writes "<source>.gz" next to every input file.
   The destination is opened with File.Create rather than File.OpenWrite:
   OpenWrite does not truncate an existing file, so a shorter archive
   written over a longer one from a previous build would keep stale
   trailing bytes.
 * NOTE(review): angle-bracketed text (author e-mail, C# generic type
   arguments, the MSBuild XML elements) is absent from this copy of the
   patch and has not been reconstructed. Hunk line counts and the blob ids
   on the "index" lines should be re-checked against the repository before
   applying.

 .gitignore                         |  5 ++-
 src/Lingua/Api/LanguageDetector.cs | 14 +++++--
 src/Lingua/Lingua.csproj           | 64 +++++++++++++++++++++++++++++-
 3 files changed, 77 insertions(+), 6 deletions(-)

diff --git a/.gitignore b/.gitignore
index 83f1ac0..5144bdc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,7 @@ riderModule.iml
 /_ReSharper.Caches/
 
 .idea
-*.sln.DotSettings.User
\ No newline at end of file
+*.sln.DotSettings.User
+
+# Don't check in gzipped files
+/src/Lingua/LanguageModels/**/*.json.gz
diff --git a/src/Lingua/Api/LanguageDetector.cs b/src/Lingua/Api/LanguageDetector.cs
index b75303e..6088036 100644
--- a/src/Lingua/Api/LanguageDetector.cs
+++ b/src/Lingua/Api/LanguageDetector.cs
@@ -1,4 +1,5 @@
 using System.Collections.Concurrent;
+using System.IO.Compression;
 using System.Text.RegularExpressions;
 using Lingua.Internal;
 using static Lingua.Api.Language;
@@ -399,11 +400,16 @@ private static Dictionary LoadLanguageModels(Dictionary
     LoadLanguageModel(Language language, int ngramLength)
     {
-        var file = $"Lingua.LanguageModels.{language.IsoCode6391().ToString().ToLowerInvariant()}.{Ngram.GetNgramNameByLength(ngramLength)}s.json";
+        var isoCode = language.IsoCode6391().ToString().ToLowerInvariant();
+        var nGramName = Ngram.GetNgramNameByLength(ngramLength);
+        var file = $"Lingua.LanguageModels.{isoCode}.{nGramName}s.json.gz";
         using var stream = typeof(LanguageDetector).Assembly.GetManifestResourceStream(file);
-        return stream == null
-            ? new Dictionary()
-            : TrainingDataLanguageModel.FromJson(stream);
+
+        if (stream is null)
+            return new Dictionary();
+
+        using var gzipStream = new GZipStream(stream, CompressionMode.Decompress);
+        return TrainingDataLanguageModel.FromJson(gzipStream);
     }
 
     internal HashSet FilterLanguagesByRules(List words)
diff --git a/src/Lingua/Lingua.csproj b/src/Lingua/Lingua.csproj
index fe321b0..a8e1a64 100644
--- a/src/Lingua/Lingua.csproj
+++ b/src/Lingua/Lingua.csproj
@@ -19,5 +19,67 @@
-
+
+
+
+ true
+
+
+
+
+
+
+
+
+
+
+
+ 0)
+          {
+            Result = new TaskItem[Files.Length];
+            for (int i = 0; i < Files.Length; i++)
+            {
+              ITaskItem item = Files[i];
+              string sourcePath = item.GetMetadata("FullPath");
+              string sourceItemSpec = item.ItemSpec;
+              string destinationSuffix = ".gz";
+              string destinationPath = sourcePath + destinationSuffix;
+              string destinationItemSpec = sourceItemSpec + destinationSuffix;
+
+              //Log.LogMessage(MessageImportance.Normal, "EmbeddedResource Src : " + sourceItemSpec);
+
+              using (var sourceStream = File.OpenRead(sourcePath))
+              using (var destinationStream = File.Create(destinationPath)) // Create truncates; OpenWrite would keep stale trailing bytes of an older, longer .gz
+              using (var destinationGZip = new GZipStream(destinationStream, CompressionLevel.Optimal))
+              {
+                sourceStream.CopyTo(destinationGZip);
+              }
+
+              var destinationItem = new TaskItem(destinationItemSpec);
+              //Log.LogMessage(MessageImportance.Normal, "EmbeddedResource GZip: " + destinationItem.ItemSpec);
+
+              Result[i] = destinationItem;
+            }
+          }
+        ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+