From df04b4b80f004a2bc53393b5baaeb6e25c83427d Mon Sep 17 00:00:00 2001 From: Aliaksei Markouski Date: Sat, 15 Jun 2024 00:41:37 +0300 Subject: [PATCH 1/9] Add list of ignored names in transliteration --- .../ImproperTranslationAnalyzer.cs | 28 ++++++++++++++++++- Osmalyzer/Runner.cs | 2 +- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs b/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs index 8410d74..1ebed02 100644 --- a/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs +++ b/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs @@ -53,6 +53,7 @@ public override void Run(IReadOnlyList datas, Report report) knownLanguages.ToDictionary(kl => kl, _ => new LanguageAnalysisResults()); Dictionary ignoredLanguages = new Dictionary(); + List ignoredNames = new(); // Parse @@ -66,7 +67,10 @@ public override void Run(IReadOnlyList datas, Report report) // todo: how many do we not know? if (lvNameRaw == null) + { + ignoredNames.Add(name); continue; // we don't actually know how this name is constructed in Latvian + } // Other languages @@ -310,6 +314,27 @@ public override void Run(IReadOnlyList datas, Report report) ) ); } + + + report.AddGroup( + GenericReportGroup.OtherNames, + "Ignored names", + "List of items that were not checked, because their name was not recognized (for example streets that don't have recovnized translatable nomenclature)" + ); + + foreach (string n in ignoredNames.Distinct().Where(_ => !_.Contains("—"))) + { + report.AddEntry( + GenericReportGroup.OtherNames, + new IssueReportEntry( + "Name '" + n + "' was ignored. " + //number + " " + (number == 1 ? "element has" : "elements have") + " tag `name:" + language + "`", + //new SortEntryDesc(number) + ) + ); + } + + } @@ -553,7 +578,8 @@ private class LanguageAnalysisResults private enum GenericReportGroup { - OtherLanguages + OtherLanguages, + OtherNames // Individual langauges will go to their own group } } \ No newline at end of file diff --git a/Osmalyzer/Runner.cs b/Osmalyzer/Runner.cs index 3ac70b4..a793727 100644 --- a/Osmalyzer/Runner.cs +++ b/Osmalyzer/Runner.cs @@ -72,7 +72,7 @@ public static void Run() new UnknownParcelLockerAnalyzer(), new LatviaPostLockerAnalyzer(), // new LatviaPostMailBoxAnalyzer(), - // new ImproperTranslationAnalyzer(), + new ImproperTranslationAnalyzer(), // new LidlShopAnalyzer(), new UnisendParcelLockerAnalyzer(), // new SpellingAnalyzer() From fd60d597080aaf02fc50f932f92c7a77eb3d13f8 Mon Sep 17 00:00:00 2001 From: Aliaksei Markouski Date: Sat, 15 Jun 2024 01:43:45 +0300 Subject: [PATCH 2/9] Add more recognized highway types --- Osmalyzer/Analyzers/Helpers/FuzzyAddressMatcher.cs | 6 +++++- Osmalyzer/Runner.cs | 14 +++++++------- data/street name qualifiers.tsv | 5 ++++- data/street name suffixes.tsv | 5 ++++- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/Osmalyzer/Analyzers/Helpers/FuzzyAddressMatcher.cs b/Osmalyzer/Analyzers/Helpers/FuzzyAddressMatcher.cs index f45bffa..56b75d0 100644 --- a/Osmalyzer/Analyzers/Helpers/FuzzyAddressMatcher.cs +++ b/Osmalyzer/Analyzers/Helpers/FuzzyAddressMatcher.cs @@ -5,6 +5,7 @@ namespace Osmalyzer; public static class FuzzyAddressMatcher { + // TODO: rewrite to use tsv file from data private static readonly string[] _suffixes = { "iela", @@ -18,7 +19,10 @@ public static class FuzzyAddressMatcher "apvedceļš", "laukums", "prospekts", - "pārvads" + "pārvads", + "līnija", + "šķērslīnija", + "krastmala", }; // Note: ImproperTranslationAnalyzer is doing Russian translations, so add value there if adding here diff --git a/Osmalyzer/Runner.cs b/Osmalyzer/Runner.cs index a793727..97386b0 100644 --- a/Osmalyzer/Runner.cs +++ b/Osmalyzer/Runner.cs @@ -65,16 +65,16 @@ public static void Run() // new WikidataSynchronicityAnalyzer(), -- disabled // new BarrierConnectionAnalyzer(), // new BottleDepositPointsAnalyzer(), - new VenipakParcelLockerAnalyzer(), - new OmnivaParcelLockerAnalyzer(), - new ItellaParcelLockerAnalyzer(), - new DPDParcelLockerAnalyzer(), - new UnknownParcelLockerAnalyzer(), - new LatviaPostLockerAnalyzer(), + // new VenipakParcelLockerAnalyzer(), + // new OmnivaParcelLockerAnalyzer(), + // new ItellaParcelLockerAnalyzer(), + // new DPDParcelLockerAnalyzer(), + // new UnknownParcelLockerAnalyzer(), + // new LatviaPostLockerAnalyzer(), // new LatviaPostMailBoxAnalyzer(), new ImproperTranslationAnalyzer(), // new LidlShopAnalyzer(), - new UnisendParcelLockerAnalyzer(), + // new UnisendParcelLockerAnalyzer(), // new SpellingAnalyzer() }; #endif diff --git a/data/street name qualifiers.tsv b/data/street name qualifiers.tsv index ceb20db..b563c82 100644 --- a/data/street name qualifiers.tsv +++ b/data/street name qualifiers.tsv @@ -11,4 +11,7 @@ aleja аллея alley apvedceļš окружная дорога bypass laukums площадь square prospekts проспект avenue -pārvads переезд crossing \ No newline at end of file +pārvads переезд crossing +līnija линия line +šķērslīnija поперечная линия cross line +krastmala набережная waterfront \ No newline at end of file diff --git a/data/street name suffixes.tsv b/data/street name suffixes.tsv index b02c5ea..469d769 100644 --- a/data/street name suffixes.tsv +++ b/data/street name suffixes.tsv @@ -13,4 +13,7 @@ aleja līnija šoseja aplis -celiņš \ No newline at end of file +celiņš +līnija +šķērslīnija +krastmala \ No newline at end of file From 088fe1364edf9164e2dad3e66a773ac6aeef1cc0 Mon Sep 17 00:00:00 2001 From: Aliaksei Markouski Date: Sat, 15 Jun 2024 01:44:18 +0300 Subject: [PATCH 3/9] Fix translit for numbered objects --- .../ImproperTranslationAnalyzer.cs | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs b/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs index 1ebed02..b9b9999 100644 --- a/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs +++ b/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs @@ -2,6 +2,7 @@ using System.Collections.Generic; using System.IO; using System.Linq; +using System.Text.RegularExpressions; using F23.StringSimilarity; namespace Osmalyzer; @@ -118,11 +119,21 @@ public override void Run(IReadOnlyList datas, Report report) List expectedNames = new List(); + string translit = Transliterator.TransliterateFromLvToRu(lvNameRaw); + foreach (string expectedPrefix in expectedRuPrefixes) { - string translit = Transliterator.TransliterateFromLvToRu(lvNameRaw); - expectedNames.Add(expectedPrefix + " " + translit); - expectedNames.Add(translit + " " + expectedPrefix); + if (Regex.Match(translit, @"\d\s*$").Success) + { + // For names like 'Imantas 1. līnija' -> 'Имантас 1-я линия' + expectedNames.Add(translit + "-я " + expectedPrefix); + expectedNames.Add(translit + "-й " + expectedPrefix); + } + else + { + expectedNames.Add(expectedPrefix + " " + translit); + expectedNames.Add(translit + " " + expectedPrefix); + } } // Match against current value @@ -171,10 +182,17 @@ public override void Run(IReadOnlyList datas, Report report) List expectedNames = new List(); + // Handle names like '12th street' and '2nd Line' + string translit = lvNameRaw; + translit = Regex.Replace(translit, @"(? Date: Sat, 15 Jun 2024 14:02:09 +0300 Subject: [PATCH 4/9] Extract checks into a function --- .../ImproperTranslationAnalyzer.cs | 122 ++++++++---------- 1 file changed, 53 insertions(+), 69 deletions(-) diff --git a/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs b/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs index b9b9999..a0916ed 100644 --- a/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs +++ b/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs @@ -137,40 +137,7 @@ public override void Run(IReadOnlyList datas, Report report) } // Match against current value - - List matches = expectedNames.Select(en => MatchBetweenFuzzy(value, en, CyrillicNameMatcher.Instance)).ToList(); - - Match bestMatch = matches.OrderByDescending(m => m.Quality).First(); - - switch (bestMatch) - { - case ExactMatch: - if (languageResults.fullMatches.ContainsKey(value)) - languageResults.fullMatches[value]++; - else - languageResults.fullMatches[value] = 1; - break; - - case GoodEnoughMatch: - NonExactMatch? existing = languageResults.nonExactButGoodEnoughMatches - .FirstOrDefault(m => - m.Actual == value && - m.Expected == bestMatch.Expected && - m.Source == name); - - if (existing != null) - existing.Count++; - else - languageResults.nonExactButGoodEnoughMatches.Add(new NonExactMatch(value, bestMatch.Expected, name, 1)); - break; - - case NotAMatch: - issues.Add(new TranslitMismatchIssue("Russian", key, bestMatch.Expected, value, "name", name)); - break; - - default: - throw new NotImplementedException(); - } + checkTransliteration(value, expectedNames, name, languageResults, issues, knownLanguage, MatchBetweenFuzzyCyrillic); break; } @@ -195,41 +162,7 @@ public override void Run(IReadOnlyList datas, Report report) expectedNames.Add(translit + " " + expectedPrefix); } - // Match against current value - - List matches = expectedNames.Select(en => MatchBetweenExact(value, en)).ToList(); - - Match bestMatch = matches.OrderByDescending(m => m.Quality).First(); - - switch (bestMatch) - { - case ExactMatch: - if (languageResults.fullMatches.ContainsKey(value)) - languageResults.fullMatches[value]++; - else - languageResults.fullMatches[value] = 1; - break; - - case GoodEnoughMatch: - NonExactMatch? existing = languageResults.nonExactButGoodEnoughMatches - .FirstOrDefault(m => - m.Actual == value && - m.Expected == bestMatch.Expected && - m.Source == name); - - if (existing != null) - existing.Count++; - else - languageResults.nonExactButGoodEnoughMatches.Add(new NonExactMatch(value, bestMatch.Expected, name, 1)); - break; - - case NotAMatch: - issues.Add(new TranslitMismatchIssue("English", key, bestMatch.Expected, value, "name", name)); - break; - - default: - throw new NotImplementedException(); - } + checkTransliteration(value, expectedNames, name, languageResults, issues, knownLanguage, MatchBetweenExact); break; } @@ -355,6 +288,51 @@ public override void Run(IReadOnlyList datas, Report report) } + [Pure] + private static void checkTransliteration( + string value, + List expectedValues, + string originalName, + LanguageAnalysisResults languageResults, + List issues, + KnownLanguage knownLanguage, + Func matcher + ) + { + List matches = expectedValues.Select(ev => matcher(value, ev)).ToList(); + + Match bestMatch = matches.OrderByDescending(m => m.Quality).First(); + + switch (bestMatch) + { + case ExactMatch: + if (languageResults.fullMatches.ContainsKey(value)) + languageResults.fullMatches[value]++; + else + languageResults.fullMatches[value] = 1; + break; + + case GoodEnoughMatch: + NonExactMatch? existing = languageResults.nonExactButGoodEnoughMatches + .FirstOrDefault(m => + m.Actual == value && + m.Expected == bestMatch.Expected && + m.Source == originalName); + + if (existing != null) + existing.Count++; + else + languageResults.nonExactButGoodEnoughMatches.Add(new NonExactMatch(value, bestMatch.Expected, originalName, 1)); + break; + + case NotAMatch: + issues.Add(new TranslitMismatchIssue(knownLanguage.Name, knownLanguage.OsmSuffix, bestMatch.Expected, value, "name", originalName)); + break; + + default: + throw new NotImplementedException(); + } + } [Pure] private static string? ExtractRawLatvianName(string name, out string? suffix) @@ -365,6 +343,12 @@ public override void Run(IReadOnlyList datas, Report report) return null; } + [Pure] + private static Match MatchBetweenFuzzyCyrillic(string actual, string expectedOriginal) + { + return MatchBetweenFuzzy(actual, expectedOriginal, CyrillicNameMatcher.Instance); + } + [Pure] private static Match MatchBetweenFuzzy(string actual, string expectedOriginal, NameMatcher matcher) { From a838009bfc5cab71968efc41c32a55b274ce030a Mon Sep 17 00:00:00 2001 From: Aliaksei Markouski Date: Sat, 15 Jun 2024 14:10:41 +0300 Subject: [PATCH 5/9] Add `route=road` to transliteration check --- .../Misc Analyzers/ImproperTranslationAnalyzer.cs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs b/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs index a0916ed..6776046 100644 --- a/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs +++ b/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs @@ -34,9 +34,12 @@ public override void Run(IReadOnlyList datas, Report report) OsmMasterData osmMasterData = osmData.MasterData; - OsmDataExtract osmElements = osmMasterData.Filter( + OsmDataExtract osmHighwayElements = osmMasterData.Filter( new IsWay(), - new HasKey("highway"), + new OrMatch( + new HasKey("highway"), + new HasValue("route", "road") + ), new HasKey("name"), new HasKeyPrefixed("name:"), new InsidePolygon(BoundaryHelper.GetLatviaPolygon(osmData.MasterData), OsmPolygon.RelationInclusionCheck.Fuzzy) @@ -58,7 +61,7 @@ public override void Run(IReadOnlyList datas, Report report) // Parse - foreach (OsmElement element in osmElements.Elements) + foreach (OsmElement element in osmHighwayElements.Elements) { string name = element.GetValue("name")!; From 5b1ad4aa5f091c2f02b3fe0414bbf57a24770018 Mon Sep 17 00:00:00 2001 From: Aliaksei Markouski Date: Sun, 16 Jun 2024 01:40:03 +0300 Subject: [PATCH 6/9] Extract en transliteration --- Osmalyzer/Misc/Transliterator.cs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Osmalyzer/Misc/Transliterator.cs b/Osmalyzer/Misc/Transliterator.cs index 1f3d98e..892b2fe 100644 --- a/Osmalyzer/Misc/Transliterator.cs +++ b/Osmalyzer/Misc/Transliterator.cs @@ -86,6 +86,17 @@ public static string TransliterateFromLvToRu(string name) } + [Pure] + public static string TransliterateFromLvToEn(string name) + { + string translit = name; + translit = Regex.Replace(translit, @"(? Date: Sun, 16 Jun 2024 01:42:06 +0300 Subject: [PATCH 7/9] Rework how transliteration is checked Code reorganized, Added LV to check Added `place!=city`, `boundary = administrative`, `railway = station` to checks --- .../ImproperTranslationAnalyzer.cs | 359 ++++++++++-------- 1 file changed, 205 insertions(+), 154 deletions(-) diff --git a/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs b/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs index 6776046..0d1fc3b 100644 --- a/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs +++ b/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs @@ -23,10 +23,25 @@ public class ImproperTranslationAnalyzer : Analyzer typeof(OsmAnalysisData), typeof(StreetNameQualifiersAnalysisData) }; + + // These are the languages we check and know about + private List knownLanguages = new List() + { + new KnownLanguage("Russian", "ru"), + new KnownLanguage("English", "en"), + new KnownLanguage("Latvian", "lv") + }; public override void Run(IReadOnlyList datas, Report report) { + // Each language keeps its record of results + Dictionary results = + knownLanguages.ToDictionary(kl => kl, _ => new LanguageAnalysisResults()); + + Dictionary ignoredLanguages = new Dictionary(); + List ignoredNames = new List(); + // Load OSM data OsmAnalysisData osmData = datas.OfType().First(); @@ -44,160 +59,38 @@ public override void Run(IReadOnlyList datas, Report report) new HasKeyPrefixed("name:"), new InsidePolygon(BoundaryHelper.GetLatviaPolygon(osmData.MasterData), OsmPolygon.RelationInclusionCheck.Fuzzy) ); + // place=*, boundary = administrative, railway = station - // These are the languages we check and know about - List knownLanguages = new List() - { - new KnownLanguage("Russian", "ru"), - new KnownLanguage("English", "en") - }; + OsmDataExtract osmPlaceElements = osmMasterData.Filter( + new HasKey("place"), + new DoesntHaveAnyValue("place", "city"), + new HasKey("name"), + new HasKeyPrefixed("name:"), + new InsidePolygon(BoundaryHelper.GetLatviaPolygon(osmData.MasterData), OsmPolygon.RelationInclusionCheck.Fuzzy) + ); + OsmDataExtract osmAdminBoundariesElements = osmMasterData.Filter( + new IsWay(), + new HasValue("boundary", "administrative"), + new HasKey("name"), + // filter out cross border objects + new CustomMatch(_ => _.HasKey("name") && !Regex.Match(_.GetValue("name")!, @" [-—/] ").Success), + new HasKeyPrefixed("name:"), + new InsidePolygon(BoundaryHelper.GetLatviaPolygon(osmData.MasterData), OsmPolygon.RelationInclusionCheck.Fuzzy) + ); + OsmDataExtract osmRwStationsElements = osmMasterData.Filter( + new HasValue("railway", "station"), + new HasKey("name"), + new HasKeyPrefixed("name:"), + new InsidePolygon(BoundaryHelper.GetLatviaPolygon(osmData.MasterData), OsmPolygon.RelationInclusionCheck.Fuzzy) + ); - // Each language keeps its record of results - Dictionary results = - knownLanguages.ToDictionary(kl => kl, _ => new LanguageAnalysisResults()); - - Dictionary ignoredLanguages = new Dictionary(); - List ignoredNames = new(); // Parse - foreach (OsmElement element in osmHighwayElements.Elements) - { - string name = element.GetValue("name")!; - - // Main name - - string? lvNameRaw = ExtractRawLatvianName(name, out string? latvianNameSuffix); - - // todo: how many do we not know? - if (lvNameRaw == null) - { - ignoredNames.Add(name); - continue; // we don't actually know how this name is constructed in Latvian - } - - // Other languages - - List<(string, string)> nameXxs = element.GetPrefixedValues("name:")!; - - foreach ((string key, string value) in nameXxs) - { - if (value == name) // may not be great, but not an error - continue; - - List issues = new List(); - - string? language = ExtractLanguage(key); - - if (language == null) - { - // TODO: report bad language key - continue; - } - - if (language == "lv") - { - // todo: check if mismatch? - continue; // we assume we are by default in Latvian - } - - KnownLanguage? knownLanguage = knownLanguages.FirstOrDefault(kl => kl.OsmSuffix == language); - - if (knownLanguage != null) - { - // We know about this language, so we can check the name - - // Collect new results into the language-specific container - LanguageAnalysisResults languageResults = results[knownLanguage]; - - switch (language) - { - case "ru": - { - // Figure out how the street should look like in Russian transliteration - - List expectedRuPrefixes = nameQualifiersData.Names[latvianNameSuffix!][language]; - // It is acceptable for all object to be named as street (why?) - expectedRuPrefixes = expectedRuPrefixes.Union(nameQualifiersData.Names["iela"][language]).ToList(); - - List expectedNames = new List(); - - string translit = Transliterator.TransliterateFromLvToRu(lvNameRaw); - - foreach (string expectedPrefix in expectedRuPrefixes) - { - if (Regex.Match(translit, @"\d\s*$").Success) - { - // For names like 'Imantas 1. līnija' -> 'Имантас 1-я линия' - expectedNames.Add(translit + "-я " + expectedPrefix); - expectedNames.Add(translit + "-й " + expectedPrefix); - } - else - { - expectedNames.Add(expectedPrefix + " " + translit); - expectedNames.Add(translit + " " + expectedPrefix); - } - } - - // Match against current value - checkTransliteration(value, expectedNames, name, languageResults, issues, knownLanguage, MatchBetweenFuzzyCyrillic); - - break; - } - case "en": - { - List expectedEnPrefixes = nameQualifiersData.Names[latvianNameSuffix!][language]; - // It is acceptable for all object to be named as street (why?) - expectedEnPrefixes = expectedEnPrefixes.Union(nameQualifiersData.Names["iela"][language]).ToList(); - - List expectedNames = new List(); - - // Handle names like '12th street' and '2nd Line' - string translit = lvNameRaw; - translit = Regex.Replace(translit, @"(? 0) - { - // Any element(s) with this exact issue already? - ProblemFeature? existing = languageResults.problemFeatures.FirstOrDefault(f => f.IssuesMatch(issues)); - - // todo: dont add by distance too close - - if (existing != null) - existing.Elements.Add(element); // issues are the same already - else - languageResults.problemFeatures.Add(new ProblemFeature(new List() { element }, issues)); - } - } - else - { - if (ignoredLanguages.ContainsKey(language)) - ignoredLanguages[language]++; - else - ignoredLanguages.Add(language, 1); - } - } - } + method(osmHighwayElements.Elements, true, results, nameQualifiersData, ignoredNames, ignoredLanguages); + method(osmPlaceElements.Elements, false, results, nameQualifiersData, ignoredNames, ignoredLanguages); + method(osmAdminBoundariesElements.Elements, false, results, nameQualifiersData, ignoredNames, ignoredLanguages); + method(osmRwStationsElements.Elements, false, results, nameQualifiersData, ignoredNames, ignoredLanguages); // Report checked languages @@ -291,6 +184,156 @@ public override void Run(IReadOnlyList datas, Report report) } + private void method( + IReadOnlyList elements, + bool nomenclatureRequired, + Dictionary results, + StreetNameQualifiersAnalysisData nameQualifiersData, + List ignoredNames, + Dictionary ignoredLanguages + ) + { + foreach (OsmElement element in elements) + { + string name = element.GetValue("name")!; + + // Main name + bool isSuffixFound = ExtractNomenclature(name, nameQualifiersData.Names.Keys.ToList(), out string lvNameRaw, out string? latvianNameSuffix); + + if (nomenclatureRequired && !isSuffixFound) + { + ignoredNames.Add(name); + // we don't actually know how this name is constructed in Latvian + continue; + } + + // Other languages + + List<(string, string)> nameXxs = element.GetPrefixedValues("name:")!; + + foreach ((string key, string value) in nameXxs) + { + List issues = new List(); + + string? language = ExtractLanguage(key); + + if (language == null) + { + // TODO: report bad language key + continue; + } + + KnownLanguage? knownLanguage = knownLanguages.FirstOrDefault(kl => kl.OsmSuffix == language); + + if (knownLanguage != null) + { + // We know about this language, so we can check the name + + // Collect new results into the language-specific container + LanguageAnalysisResults languageResults = results[knownLanguage]; + + switch (language) + { + case "lv": + { + // Expect exactly the same values as in name + List expectedNames = new List {name}; + checkTransliteration(value, expectedNames, name, languageResults, issues, knownLanguage, MatchBetweenExact); + break; + } + case "ru": + { + // Figure out how the street should look like in Russian transliteration + + List expectedNames = new List(); + + string translit = Transliterator.TransliterateFromLvToRu(lvNameRaw); + + if (latvianNameSuffix != null) + { + List expectedRuPrefixes = nameQualifiersData.Names[latvianNameSuffix!][language]; + + foreach (string expectedPrefix in expectedRuPrefixes) + { + if (Regex.Match(translit, @"\d\s*$").Success) + { + // For names like 'Imantas 1. līnija' -> 'Имантас 1-я линия' + expectedNames.Add(translit + "-я " + expectedPrefix); + expectedNames.Add(translit + "-й " + expectedPrefix); + } + else + { + expectedNames.Add(expectedPrefix + " " + translit); + expectedNames.Add(translit + " " + expectedPrefix); + } + } + } + else + { + expectedNames.Add(translit); + } + + // Match against current value + checkTransliteration(value, expectedNames, name, languageResults, issues, knownLanguage, MatchBetweenFuzzyCyrillic); + + break; + } + case "en": + { + // Handle names like '12th street' and '2nd Line' + string translit = Transliterator.TransliterateFromLvToEn(lvNameRaw); + + List expectedNames = new List(); + + if (latvianNameSuffix != null) + { + List expectedEnPrefixes = nameQualifiersData.Names[latvianNameSuffix!][language]; + + // Expect exact name with only translation for the nomenclature + foreach (string expectedPrefix in expectedEnPrefixes) + { + expectedNames.Add(translit + " " + expectedPrefix); + } + } + else + { + expectedNames.Add(translit); + } + checkTransliteration(value, expectedNames, name, languageResults, issues, knownLanguage, MatchBetweenExact); + + break; + } + + default: + throw new NotImplementedException(); + } + + // Did we find any issues for this element? + + if (issues.Count > 0) + { + // Any element(s) with this exact issue already? + ProblemFeature? existing = languageResults.problemFeatures.FirstOrDefault(f => f.IssuesMatch(issues)); + + // todo: dont add by distance too close + + if (existing != null) + existing.Elements.Add(element); // issues are the same already + else + languageResults.problemFeatures.Add(new ProblemFeature(new List() { element }, issues)); + } + } + else + { + if (ignoredLanguages.ContainsKey(language)) + ignoredLanguages[language]++; + else + ignoredLanguages.Add(language, 1); + } + } + } + } + [Pure] private static void checkTransliteration( string value, @@ -338,12 +381,20 @@ Func matcher } [Pure] - private static string? ExtractRawLatvianName(string name, out string? suffix) + private static bool ExtractNomenclature(string name, List nomenclature, out string rawName, out string? nomenclatureName) { - if (FuzzyAddressMatcher.EndsWithStreetNameSuffix(name, out suffix)) - return name[..^(suffix!.Length + 1)]; // also grab the implied space - - return null; + foreach (string s in nomenclature) + { + if (name.EndsWith(" " + s)) + { + nomenclatureName = s; + rawName = name[..^(s!.Length)].Trim(); + return true; + } + } + rawName = name; + nomenclatureName = null; + return false; } [Pure] From 5f3c0651a6908bff354f9d5d60a95a91f2064bf0 Mon Sep 17 00:00:00 2001 From: Aliaksei Markouski Date: Sun, 16 Jun 2024 01:42:25 +0300 Subject: [PATCH 8/9] Minor translit fixes --- Osmalyzer/Misc/Transliterator.cs | 3 +++ data/street name qualifiers.tsv | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Osmalyzer/Misc/Transliterator.cs b/Osmalyzer/Misc/Transliterator.cs index 892b2fe..b4108ba 100644 --- a/Osmalyzer/Misc/Transliterator.cs +++ b/Osmalyzer/Misc/Transliterator.cs @@ -82,6 +82,9 @@ public static string TransliterateFromLvToRu(string name) translit += newC; } + // Post processing + translit = translit.Replace("ьйо","ё"); + return translit; } diff --git a/data/street name qualifiers.tsv b/data/street name qualifiers.tsv index b563c82..9d65947 100644 --- a/data/street name qualifiers.tsv +++ b/data/street name qualifiers.tsv @@ -14,4 +14,5 @@ prospekts проспект avenue pārvads переезд crossing līnija линия line šķērslīnija поперечная линия cross line -krastmala набережная waterfront \ No newline at end of file +krastmala набережная waterfront +stacija станция station \ No newline at end of file From 004bdfd3c5dfb56e241bd014893561bb141a46cc Mon Sep 17 00:00:00 2001 From: Aliaksei Markouski Date: Sun, 16 Jun 2024 20:52:29 +0300 Subject: [PATCH 9/9] Removed `place!=city` from check - too much false positives --- Osmalyzer/Analyzers/Helpers/BoundaryHelper.cs | 8 ++++++ .../ImproperTranslationAnalyzer.cs | 27 ++++++++++--------- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/Osmalyzer/Analyzers/Helpers/BoundaryHelper.cs b/Osmalyzer/Analyzers/Helpers/BoundaryHelper.cs index cdc849f..a7169cc 100644 --- a/Osmalyzer/Analyzers/Helpers/BoundaryHelper.cs +++ b/Osmalyzer/Analyzers/Helpers/BoundaryHelper.cs @@ -25,6 +25,14 @@ public static OsmPolygon GetRigaPolygon(OsmMasterData osmData) return _rigaPolygon; } + [Pure] + public static OsmPolygon GetDaugavpilsPolygon(OsmMasterData osmData) + { + if (_rigaPolygon == null) + _rigaPolygon = GetAdminRelationPolygon(osmData, "6", "Daugavpils"); + + return _rigaPolygon; + } [Pure] private static OsmPolygon GetAdminRelationPolygon(OsmMasterData osmData, string level, string name) diff --git a/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs b/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs index 0d1fc3b..bdc14ea 100644 --- a/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs +++ b/Osmalyzer/Analyzers/Misc Analyzers/ImproperTranslationAnalyzer.cs @@ -61,13 +61,16 @@ public override void Run(IReadOnlyList datas, Report report) ); // place=*, boundary = administrative, railway = station - OsmDataExtract osmPlaceElements = osmMasterData.Filter( - new HasKey("place"), - new DoesntHaveAnyValue("place", "city"), - new HasKey("name"), - new HasKeyPrefixed("name:"), - new InsidePolygon(BoundaryHelper.GetLatviaPolygon(osmData.MasterData), OsmPolygon.RelationInclusionCheck.Fuzzy) - ); + // Too much of (probably) false positives + // OsmDataExtract osmPlaceElements = osmMasterData.Filter( + // new HasKey("place"), + // new DoesntHaveAnyValue("place", "city"), + // new HasKey("name"), + // new HasKeyPrefixed("name:"), + // // Exclude Daugavpils for the time being + // //new CustomMatch(_ => !BoundaryHelper.GetDaugavpilsPolygon(osmData.MasterData).ContainsElement(_, OsmPolygon.RelationInclusionCheck.Fuzzy)), + // new InsidePolygon(BoundaryHelper.GetLatviaPolygon(osmData.MasterData), OsmPolygon.RelationInclusionCheck.Fuzzy) + // ); OsmDataExtract osmAdminBoundariesElements = osmMasterData.Filter( new IsWay(), new HasValue("boundary", "administrative"), @@ -87,10 +90,10 @@ public override void Run(IReadOnlyList datas, Report report) // Parse - method(osmHighwayElements.Elements, true, results, nameQualifiersData, ignoredNames, ignoredLanguages); - method(osmPlaceElements.Elements, false, results, nameQualifiersData, ignoredNames, ignoredLanguages); - method(osmAdminBoundariesElements.Elements, false, results, nameQualifiersData, ignoredNames, ignoredLanguages); - method(osmRwStationsElements.Elements, false, results, nameQualifiersData, ignoredNames, ignoredLanguages); + checkElementsTranliteration(osmHighwayElements.Elements, true, results, nameQualifiersData, ignoredNames, ignoredLanguages); + // checkElementsTranliteration(osmPlaceElements.Elements, false, results, nameQualifiersData, ignoredNames, ignoredLanguages); + checkElementsTranliteration(osmAdminBoundariesElements.Elements, false, results, nameQualifiersData, ignoredNames, ignoredLanguages); + checkElementsTranliteration(osmRwStationsElements.Elements, false, results, nameQualifiersData, ignoredNames, ignoredLanguages); // Report checked languages @@ -184,7 +187,7 @@ public override void Run(IReadOnlyList datas, Report report) } - private void method( + private void checkElementsTranliteration( IReadOnlyList elements, bool nomenclatureRequired, Dictionary results,