Skip to content

Commit

Permalink
Accuracy reports
Browse files Browse the repository at this point in the history
  • Loading branch information
russcam committed May 1, 2024
1 parent 2669e4e commit 907df63
Show file tree
Hide file tree
Showing 325 changed files with 237,192 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ riderModule.iml
*.sln.DotSettings.User

nuget
accuracy-reports

# Don't check in gzipped files
/src/Lingua/LanguageModels/**/*.json.gz
6 changes: 6 additions & 0 deletions Lingua.sln
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Root", "Solution R
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Build", "build\Build\Build.csproj", "{E8C232A5-D100-47D5-A8CA-8E8927E52CD8}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lingua.AccuracyReport.Tests", "tests\Lingua.AccuracyReport.Tests\Lingua.AccuracyReport.Tests.csproj", "{5D43F878-754F-4AA4-B2E1-D833163240AE}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand All @@ -43,5 +45,9 @@ Global
{E8C232A5-D100-47D5-A8CA-8E8927E52CD8}.Debug|Any CPU.Build.0 = Debug|Any CPU
{E8C232A5-D100-47D5-A8CA-8E8927E52CD8}.Release|Any CPU.ActiveCfg = Release|Any CPU
{E8C232A5-D100-47D5-A8CA-8E8927E52CD8}.Release|Any CPU.Build.0 = Release|Any CPU
{5D43F878-754F-4AA4-B2E1-D833163240AE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{5D43F878-754F-4AA4-B2E1-D833163240AE}.Debug|Any CPU.Build.0 = Debug|Any CPU
{5D43F878-754F-4AA4-B2E1-D833163240AE}.Release|Any CPU.ActiveCfg = Release|Any CPU
{5D43F878-754F-4AA4-B2E1-D833163240AE}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
EndGlobal
133 changes: 133 additions & 0 deletions accuracy-reports/lingua/Afrikaans.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
##### Afrikaans #####

Legend: 'low accuracy mode | high accuracy mode'

>>> Accuracy on average: 64.00% | 78.63%

>> Detection of 1000 single words (average length: 8 chars)
Accuracy: 37.00% | 58.30%
Erroneously classified as
Dutch: 12.80% | 14.10%
German: 2.70% | 2.30%
Latin: 2.60% | 2.00%
English: 2.10% | 1.90%
Danish: 2.10% | 1.90%
Bokmal: 2.70% | 1.60%
Welsh: 0.70% | 1.10%
Nynorsk: 1.50% | 1.00%
Swedish: 1.50% | 0.90%
Estonian: 1.20% | 0.90%
Lithuanian: 1.30% | 0.70%
Italian: 0.40% | 0.70%
Zulu: 1.20% | 0.70%
Tswana: 1.50% | 0.70%
French: 1.40% | 0.60%
Basque: 1.40% | 0.60%
Ganda: 1.10% | 0.60%
Oromo: 2.00% | 0.60%
Turkish: 0.50% | 0.60%
Sotho: 1.00% | 0.60%
Swahili: 0.60% | 0.50%
Portuguese: 0.90% | 0.50%
Romanian: 0.90% | 0.50%
Tsonga: 1.40% | 0.50%
Esperanto: 1.00% | 0.50%
Xhosa: 0.40% | 0.50%
Latvian: 1.30% | 0.40%
Finnish: 2.30% | 0.40%
Yoruba: 0.70% | 0.40%
Polish: 1.10% | 0.40%
Shona: 0.80% | 0.40%
Icelandic: 0.90% | 0.30%
Malay: 0.80% | 0.30%
Indonesian: 0.40% | 0.30%
Irish: 0.40% | 0.30%
Somali: 1.00% | 0.30%
Maori: 0.60% | 0.20%
Catalan: 0.40% | 0.20%
Tagalog: 1.30% | 0.10%
Slovak: 0.50% | 0.10%
Spanish: 0.70% | 0.10%
Bosnian: 0.10% | 0.10%
Hungarian: 0.60% | 0.10%
Croatian: 0.50% | 0.10%
Vietnamese: 0.40% | 0.10%
Azerbaijani: 0.40% | 0.00%
Czech: 0.40% | 0.00%
Albanian: 0.40% | 0.00%
Slovene: 0.10% | 0.00%


>> Detection of 1000 word pairs (average length: 15 chars)
Accuracy: 62.20% | 80.80%
Erroneously classified as
Dutch: 13.30% | 11.00%
English: 0.80% | 1.30%
German: 2.70% | 1.10%
Latin: 1.00% | 0.80%
Danish: 1.30% | 0.70%
Bokmal: 1.90% | 0.40%
Estonian: 1.40% | 0.30%
Sotho: 0.60% | 0.30%
Yoruba: 0.80% | 0.30%
Nynorsk: 0.40% | 0.30%
Swedish: 1.40% | 0.20%
Tsonga: 0.70% | 0.20%
Finnish: 1.70% | 0.20%
Ganda: 0.40% | 0.20%
Italian: 0.40% | 0.20%
Welsh: 0.70% | 0.20%
Oromo: 0.40% | 0.20%
Swahili: 0.00% | 0.10%
Tagalog: 0.40% | 0.10%
French: 0.50% | 0.10%
Hungarian: 0.20% | 0.10%
Portuguese: 0.80% | 0.10%
Malay: 0.20% | 0.10%
Turkish: 0.10% | 0.10%
Esperanto: 0.50% | 0.10%
Shona: 0.50% | 0.10%
Tswana: 0.20% | 0.10%
Catalan: 0.10% | 0.10%
Bosnian: 0.10% | 0.10%
Spanish: 0.10% | 0.10%
Lithuanian: 0.70% | 0.00%
Romanian: 0.40% | 0.00%
Xhosa: 0.20% | 0.00%
Maori: 0.30% | 0.00%
Basque: 0.40% | 0.00%
Indonesian: 0.30% | 0.00%
Somali: 0.20% | 0.00%
Azerbaijani: 0.20% | 0.00%
Czech: 0.10% | 0.00%
Albanian: 0.30% | 0.00%
Latvian: 0.40% | 0.00%
Polish: 0.40% | 0.00%
Slovak: 0.10% | 0.00%
Zulu: 0.10% | 0.00%
Slovene: 0.10% | 0.00%


>> Detection of 1000 sentences (average length: 101 chars)
Accuracy: 92.80% | 96.80%
Erroneously classified as
Dutch: 5.10% | 2.60%
German: 0.20% | 0.20%
Sotho: 0.00% | 0.10%
Latin: 0.10% | 0.10%
Danish: 0.00% | 0.10%
English: 0.20% | 0.10%
Welsh: 0.20% | 0.00%
Tswana: 0.10% | 0.00%
Estonian: 0.40% | 0.00%
Tsonga: 0.10% | 0.00%
Bokmal: 0.20% | 0.00%
Yoruba: 0.10% | 0.00%
Catalan: 0.10% | 0.00%
Ganda: 0.10% | 0.00%
Oromo: 0.10% | 0.00%
Finnish: 0.10% | 0.00%
Hungarian: 0.10% | 0.00%


>> Exact values: 64 37 62.2 92.80000000000001 78.63333333333334 58.3 80.80000000000001 96.8
1 change: 1 addition & 0 deletions src/Lingua/Lingua.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

<ItemGroup>
<InternalsVisibleTo Include="$(AssemblyName).Tests" Key="$(PublicKey)" />
<InternalsVisibleTo Include="$(AssemblyName).AccuracyReport.Tests" Key="$(PublicKey)" />
</ItemGroup>

<!-- Gzip embedded resources to reduce size -->
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
using Lingua.Api;

namespace Lingua.AccuracyReport.Tests;

/// <summary>
/// Base class for per-language accuracy reports. It binds a shared
/// <see cref="LanguageDetectionStatistics"/> instance to one language and one
/// implementation, and gives subclasses helpers that forward test inputs to it.
/// </summary>
public abstract class AbstractLanguageDetectionAccuracyReport
{
    // Statistics collector supplied by the caller; it is configured in the constructor.
    private readonly LanguageDetectionStatistics _statistics;

    /// <summary>
    /// Stores <paramref name="statistics"/> and stamps it with the language and
    /// implementation this report covers.
    /// </summary>
    /// <param name="language">Value assigned to the statistics' <c>Language</c> property.</param>
    /// <param name="implementation">Value assigned to the statistics' <c>Implementation</c> property.</param>
    /// <param name="statistics">
    /// Collector that is mutated here and used by every <c>Compute*</c> helper.
    /// NOTE(review): no null check is performed; a null argument fails on the first property assignment.
    /// </param>
    protected AbstractLanguageDetectionAccuracyReport(
        Language language,
        Implementation implementation,
        LanguageDetectionStatistics statistics)
    {
        _statistics = statistics;

        // Same assignment order as before: Language first, then Implementation.
        statistics.Language = language;
        statistics.Implementation = implementation;
    }

    /// <summary>Implemented by concrete reports for a single-word input.</summary>
    /// <param name="singleWord">The single word handed to the concrete report.</param>
    public abstract void SingleWordsAreIdentifiedCorrectly(string singleWord);

    /// <summary>Implemented by concrete reports for a word-pair input.</summary>
    /// <param name="wordPair">The word pair handed to the concrete report.</param>
    public abstract void WordPairsAreIdentifiedCorrectly(string wordPair);

    /// <summary>Implemented by concrete reports for a sentence input.</summary>
    /// <param name="wordPair">
    /// NOTE(review): judging by the method name this presumably carries a whole sentence;
    /// the parameter name looks like a copy/paste leftover. It is kept as-is because
    /// renaming it would change the signature seen by callers.
    /// </param>
    public abstract void EntireSentencesAreIdentifiedCorrectly(string wordPair);

    /// <summary>Forwards <paramref name="singleWord"/> to the statistics collector.</summary>
    /// <param name="singleWord">Input passed through unchanged.</param>
    protected void ComputeSingleWordStatistics(string singleWord)
    {
        _statistics.ComputeSingleWordStatistics(singleWord);
    }

    /// <summary>Forwards <paramref name="wordPair"/> to the statistics collector.</summary>
    /// <param name="wordPair">Input passed through unchanged.</param>
    protected void ComputeWordPairStatistics(string wordPair)
    {
        _statistics.ComputeWordPairStatistics(wordPair);
    }

    /// <summary>Forwards <paramref name="sentence"/> to the statistics collector.</summary>
    /// <param name="sentence">Input passed through unchanged.</param>
    protected void ComputeSentenceStatistics(string sentence)
    {
        _statistics.ComputeSentenceStatistics(sentence);
    }
}
6 changes: 6 additions & 0 deletions tests/Lingua.AccuracyReport.Tests/Implementation.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
namespace Lingua.AccuracyReport.Tests;

/// <summary>
/// Identifies which language-detection implementation an accuracy report refers to.
/// </summary>
public enum Implementation
{
    /// <summary>The Lingua detector; currently the only member.</summary>
    Lingua = 0,
}
Loading

0 comments on commit 907df63

Please sign in to comment.