From 91b5e20cae28b2fc077622dcf259dd3da1197b61 Mon Sep 17 00:00:00 2001 From: Lukhnos Liu Date: Sun, 18 Feb 2024 07:32:19 -0800 Subject: [PATCH 1/3] Make user phrase scores fairer via rewriting This fixes #118. To avoid single-syllable user unigrams dominating the grid walk when there are competing multi-syllable unigrams, we assign a fairer score to such user unigrams instead of the default value of 0. --- src/Engine/McBopomofoLM.cpp | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/Engine/McBopomofoLM.cpp b/src/Engine/McBopomofoLM.cpp index 4ade300..989d906 100644 --- a/src/Engine/McBopomofoLM.cpp +++ b/src/Engine/McBopomofoLM.cpp @@ -22,13 +22,15 @@ // OTHER DEALINGS IN THE SOFTWARE. #include "McBopomofoLM.h" + #include -#include #include #include #include #include +#include "gramambular2/reading_grid.h" + namespace McBopomofo { McBopomofoLM::McBopomofoLM() @@ -122,7 +124,30 @@ std::vector McBopomofoLM::getUnig allUnigrams = filterAndTransformUnigrams(rawGlobalUnigrams, excludedValues, insertedValues); } - allUnigrams.insert(allUnigrams.begin(), userUnigrams.begin(), userUnigrams.end()); + // TODO(#118): Leaky abstraction. This relies on the impl. detail that we always use the default separator. + bool isKeyMultiSyllable = key.find(Formosa::Gramambular2::ReadingGrid::kDefaultSeparator) != std::string::npos; + if (isKeyMultiSyllable || allUnigrams.empty()) { + allUnigrams.insert(allUnigrams.begin(), userUnigrams.begin(), userUnigrams.end()); + } else { + // Score rewrite. To ensure fairness, each user unigram is assigned a + // score that is slightly higher than its peer unigrams. 
+ double topScore = std::numeric_limits::lowest(); + for (const auto& unigram : allUnigrams) { + if (unigram.score() > topScore) { + topScore = unigram.score(); + } + } + + constexpr double epsilon = 0.000000001; + topScore += epsilon; + + std::vector rewrittenUserUnigrams; + for (const auto& unigram : userUnigrams) { + rewrittenUserUnigrams.emplace_back(Formosa::Gramambular2::LanguageModel::Unigram(unigram.value(), topScore)); + } + allUnigrams.insert(allUnigrams.begin(), rewrittenUserUnigrams.begin(), rewrittenUserUnigrams.end()); + } + return allUnigrams; } @@ -191,7 +216,6 @@ std::string McBopomofoLM::convertMacro(const std::string& input) return input; } - std::vector McBopomofoLM::filterAndTransformUnigrams(const std::vector unigrams, const std::unordered_set& excludedValues, std::unordered_set& insertedValues) { std::vector results; @@ -237,5 +261,4 @@ bool McBopomofoLM::hasAssociatedPhrasesForKey(const std::string& key) return m_associatedPhrases.hasValuesForKey(key); } - } // namespace McBopomofo From 3f9392ca400391976570126a1b51ab2df73cbc85 Mon Sep 17 00:00:00 2001 From: Lukhnos Liu Date: Sun, 18 Feb 2024 16:13:27 -0800 Subject: [PATCH 2/3] Update the comments --- src/Engine/McBopomofoLM.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Engine/McBopomofoLM.cpp b/src/Engine/McBopomofoLM.cpp index 989d906..cf97131 100644 --- a/src/Engine/McBopomofoLM.cpp +++ b/src/Engine/McBopomofoLM.cpp @@ -124,13 +124,14 @@ std::vector McBopomofoLM::getUnig allUnigrams = filterAndTransformUnigrams(rawGlobalUnigrams, excludedValues, insertedValues); } - // TODO(#118): Leaky abstraction. This relies on the impl. detail that we always use the default separator. + // This relies on the fact that we always use the default separator. 
bool isKeyMultiSyllable = key.find(Formosa::Gramambular2::ReadingGrid::kDefaultSeparator) != std::string::npos; if (isKeyMultiSyllable || allUnigrams.empty()) { allUnigrams.insert(allUnigrams.begin(), userUnigrams.begin(), userUnigrams.end()); } else { // Score rewrite. To ensure fairness, each user unigram is assigned a - // score that is slightly higher than its peer unigrams. + // score that is slightly higher than the highest of the current ones + // in allUnigrams. double topScore = std::numeric_limits::lowest(); for (const auto& unigram : allUnigrams) { if (unigram.score() > topScore) { From 4d9db9ccf075487c86c15885f0086e056f8076d1 Mon Sep 17 00:00:00 2001 From: Lukhnos Liu Date: Mon, 19 Feb 2024 09:51:09 -0800 Subject: [PATCH 3/3] Clarify the motivation for score rewrite --- src/Engine/McBopomofoLM.cpp | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/Engine/McBopomofoLM.cpp b/src/Engine/McBopomofoLM.cpp index cf97131..9431049 100644 --- a/src/Engine/McBopomofoLM.cpp +++ b/src/Engine/McBopomofoLM.cpp @@ -126,12 +126,25 @@ std::vector McBopomofoLM::getUnig // This relies on the fact that we always use the default separator. bool isKeyMultiSyllable = key.find(Formosa::Gramambular2::ReadingGrid::kDefaultSeparator) != std::string::npos; + + // If key is multi-syllabic (for example, ㄉㄨㄥˋ-ㄈㄢˋ), we just + // insert all collected userUnigrams on top of the unigrams fetched from + // the database. If key is mono-syllabic (for example, ㄉㄨㄥˋ), then + // we'll have to rewrite the collected userUnigrams. + // + // This is because, by default, user unigrams have a score of 0, which + // guarantees that grid walks will choose them. This is problematic, + // however, when a single-syllabic user phrase is competing with other + // multisyllabic phrases that start with the same syllable. 
For example, + // if a user has 丼 for ㄉㄨㄥˋ, then because that unigram has a score + // of 0, no other phrase in the database that starts with ㄉㄨㄥˋ would + // be able to compete with it. Without the rewrite, ㄉㄨㄥˋ-ㄗㄨㄛˋ + // would always result in "丼" + "作" instead of "動作" because the + // node for "丼" would dominate the walk. if (isKeyMultiSyllable || allUnigrams.empty()) { allUnigrams.insert(allUnigrams.begin(), userUnigrams.begin(), userUnigrams.end()); - } else { - // Score rewrite. To ensure fairness, each user unigram is assigned a - // score that is slightly higher than the highest of the current ones - // in allUnigrams. + } else if (!userUnigrams.empty()) { + // Find the highest score from the existing allUnigrams. double topScore = std::numeric_limits::lowest(); for (const auto& unigram : allUnigrams) { if (unigram.score() > topScore) { @@ -139,12 +152,13 @@ std::vector McBopomofoLM::getUnig } } + // Boost by a very small number. This is the score for user phrases. constexpr double epsilon = 0.000000001; - topScore += epsilon; + double boostedScore = topScore + epsilon; std::vector rewrittenUserUnigrams; for (const auto& unigram : userUnigrams) { - rewrittenUserUnigrams.emplace_back(Formosa::Gramambular2::LanguageModel::Unigram(unigram.value(), topScore)); + rewrittenUserUnigrams.emplace_back(Formosa::Gramambular2::LanguageModel::Unigram(unigram.value(), boostedScore)); } allUnigrams.insert(allUnigrams.begin(), rewrittenUserUnigrams.begin(), rewrittenUserUnigrams.end()); }