Skip to content

Commit

Permalink
Merge pull request #119 from openvanilla/dev/fairer-user-phrase-scores
Browse files Browse the repository at this point in the history
Make user phrase scores fairer via rewriting
  • Loading branch information
zonble authored Feb 20, 2024
2 parents 12025ca + 4d9db9c commit 66749e8
Showing 1 changed file with 42 additions and 4 deletions.
46 changes: 42 additions & 4 deletions src/Engine/McBopomofoLM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,15 @@
// OTHER DEALINGS IN THE SOFTWARE.

#include "McBopomofoLM.h"

#include <algorithm>
#include <iterator>
#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "gramambular2/reading_grid.h"

namespace McBopomofo {

McBopomofoLM::McBopomofoLM()
Expand Down Expand Up @@ -122,7 +124,45 @@ std::vector<Formosa::Gramambular2::LanguageModel::Unigram> McBopomofoLM::getUnig
allUnigrams = filterAndTransformUnigrams(rawGlobalUnigrams, excludedValues, insertedValues);
}

allUnigrams.insert(allUnigrams.begin(), userUnigrams.begin(), userUnigrams.end());
// This relies on the fact that we always use the default separator.
bool isKeyMultiSyllable = key.find(Formosa::Gramambular2::ReadingGrid::kDefaultSeparator) != std::string::npos;

// If key is multi-syllabic (for example, ㄉㄨㄥˋ-ㄈㄢˋ), or if
// allUnigrams is empty, we just insert all collected userUnigrams on top
// of the unigrams fetched from the database. If key is mono-syllabic
// (for example, ㄉㄨㄥˋ), then we'll have to rewrite the collected
// userUnigrams.
//
// This is because, by default, user unigrams have a score of 0, which
// guarantees that grid walks will choose them. This is problematic,
// however, when a mono-syllabic user phrase is competing with
// multi-syllabic phrases that start with the same syllable. For example,
// if a user has 丼 for ㄉㄨㄥˋ, then because that unigram has a score
// of 0, no other phrase in the database that starts with ㄉㄨㄥˋ would
// be able to compete with it. Without the rewrite, ㄉㄨㄥˋ-ㄗㄨㄛˋ
// would always result in "丼" + "作" instead of "動作" because the
// node for "丼" would dominate the walk.
if (isKeyMultiSyllable || allUnigrams.empty()) {
allUnigrams.insert(allUnigrams.begin(), userUnigrams.begin(), userUnigrams.end());
} else if (!userUnigrams.empty()) {
// Find the highest score from the existing allUnigrams.
double topScore = std::numeric_limits<double>::lowest();
for (const auto& unigram : allUnigrams) {
if (unigram.score() > topScore) {
topScore = unigram.score();
}
}

// Boost by a very small number. This is the score for user phrases.
constexpr double epsilon = 0.000000001;
double boostedScore = topScore + epsilon;

std::vector<Formosa::Gramambular2::LanguageModel::Unigram> rewrittenUserUnigrams;
for (const auto& unigram : userUnigrams) {
rewrittenUserUnigrams.emplace_back(Formosa::Gramambular2::LanguageModel::Unigram(unigram.value(), boostedScore));
}
allUnigrams.insert(allUnigrams.begin(), rewrittenUserUnigrams.begin(), rewrittenUserUnigrams.end());
}

return allUnigrams;
}

Expand Down Expand Up @@ -191,7 +231,6 @@ std::string McBopomofoLM::convertMacro(const std::string& input)
return input;
}


std::vector<Formosa::Gramambular2::LanguageModel::Unigram> McBopomofoLM::filterAndTransformUnigrams(const std::vector<Formosa::Gramambular2::LanguageModel::Unigram> unigrams, const std::unordered_set<std::string>& excludedValues, std::unordered_set<std::string>& insertedValues)
{
std::vector<Formosa::Gramambular2::LanguageModel::Unigram> results;
Expand Down Expand Up @@ -237,5 +276,4 @@ bool McBopomofoLM::hasAssociatedPhrasesForKey(const std::string& key)
return m_associatedPhrases.hasValuesForKey(key);
}


} // namespace McBopomofo

0 comments on commit 66749e8

Please sign in to comment.