From 19050b7b677c70d24bea75a109c577fd776d935c Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Wed, 6 Jul 2022 15:14:20 -0500 Subject: [PATCH] handle weights more correctly (closes #44) * don't lose weights when minimizing * don't lose weights while compiling * don't duplicate weights while compiling * version bump so lexd can depend on this --- configure.ac | 2 +- lttoolbox/compiler.cc | 23 +++++++---- lttoolbox/transducer.cc | 68 +++++++++++++++++++++++++++---- lttoolbox/transducer.h | 8 ++++ tests/data/more-entry-weights.dix | 48 ++++++++++++++++++++++ tests/lt_proc/__init__.py | 6 +++ 6 files changed, 140 insertions(+), 15 deletions(-) create mode 100644 tests/data/more-entry-weights.dix diff --git a/configure.ac b/configure.ac index 1a329446..9a1da36b 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ AC_PREREQ(2.52) m4_define([PKG_VERSION_MAJOR], [3]) m4_define([PKG_VERSION_MINOR], [6]) -m4_define([PKG_VERSION_PATCH], [9]) +m4_define([PKG_VERSION_PATCH], [10]) AC_INIT([lttoolbox], [PKG_VERSION_MAJOR.PKG_VERSION_MINOR.PKG_VERSION_PATCH], [apertium-stuff@lists.sourceforge.net], [lttoolbox], [https://wiki.apertium.org/wiki/Lttoolbox]) diff --git a/lttoolbox/compiler.cc b/lttoolbox/compiler.cc index 71dacaed..0d7120ee 100644 --- a/lttoolbox/compiler.cc +++ b/lttoolbox/compiler.cc @@ -280,7 +280,7 @@ Compiler::matchTransduction(std::vector const &pi, if(pi.size() == 0 && pd.size() == 0) { - state = t.insertNewSingleTransduction(alphabet(0, 0), state, default_weight); + state = t.insertNewSingleTransduction(alphabet(0, 0), state, entry_weight); } else { @@ -318,19 +318,19 @@ Compiler::matchTransduction(std::vector const &pi, right++; } - double weight_value; + int new_state; + double weight_value = default_weight; if(left == limleft && right == limright) { weight_value = entry_weight; + new_state = t.insertNewSingleTransduction(tag, state, entry_weight); } else { - weight_value = default_weight; + new_state = t.insertSingleTransduction(tag, state, default_weight); } - int new_state = t.insertSingleTransduction(tag, state, weight_value); - if (is_separable) { // loop-back symbols for and if (tag == alphabet(0, any_tag) || tag == alphabet(0, any_char)) { @@ -705,12 +705,12 @@ Compiler::insertEntryTokens(std::vector const &elements) // suffix paradigm if(suffix_paradigms[current_section].find(elements[i].paradigmName()) != suffix_paradigms[current_section].end()) { - t.linkStates(e, suffix_paradigms[current_section][elements[i].paradigmName()], 0, elements[i].entryWeight()); + t.linkStates(e, suffix_paradigms[current_section][elements[i].paradigmName()], 0, default_weight); e = postsuffix_paradigms[current_section][elements[i].paradigmName()]; } else { - e = t.insertNewSingleTransduction(alphabet(0, 0), e, elements[i].entryWeight()); + e = t.insertNewSingleTransduction(alphabet(0, 0), e, default_weight); suffix_paradigms[current_section][elements[i].paradigmName()] = e; e = t.insertTransducer(e, paradigms[elements[i].paradigmName()]); postsuffix_paradigms[current_section][elements[i].paradigmName()] = e; @@ -849,14 +849,17 @@ Compiler::procEntry() if(name == COMPILER_PAIR_ELEM) { elements.push_back(procTransduction(weight)); + weight = 0.000; } else if(name == COMPILER_IDENTITY_ELEM) { elements.push_back(procIdentity(weight, false)); + weight = 0.000; } else if(name == COMPILER_IDENTITYGROUP_ELEM) { elements.push_back(procIdentity(weight, true)); + weight = 0.000; } else if(name == COMPILER_REGEXP_ELEM) { @@ -890,6 +893,12 @@ Compiler::procEntry() } else if(name == COMPILER_ENTRY_ELEM && type == XML_READER_TYPE_END_ELEMENT) { + if (weight > 0.000) { + EntryToken temp; + std::vector empty; + temp.setSingleTransduction(empty, empty, weight); + elements.push_back(temp); + } // insert elements into letter transducer insertEntryTokens(elements); return; diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index 086b220a..9b57391e 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -261,6 +261,39 @@ Transducer::closure(int const state, std::set const &epsilon_tags) const return result; } +std::map> +Transducer::weighted_closure(const int state, const int epsilon_tag) const +{ + std::map result; + std::set todo; + result[state] = 0.000; + todo.insert(state); + + while (!todo.empty()) { + int st = *todo.begin(); + auto range = transitions.at(st).equal_range(epsilon_tag); + for (auto it = range.first; it != range.second; it++) { + double w = it->second.second + result[st]; + if (w < 0.000) { + w = 0.000; + // ignore negative epsilon cycles + } + if (result.find(it->second.first) == result.end() || + w < result[it->second.first]) { // we want the lowest weight path + todo.insert(it->second.first); + result[it->second.first] = w; + } + } + todo.erase(st); + } + + std::map> ret; + for (auto& it : result) { + ret[it.second].insert(it.first); + } + return ret; +} + void Transducer::joinFinals(int const epsilon_tag) { @@ -321,10 +354,30 @@ Transducer::determinize(int const epsilon_tag) std::map > > transitions_prime; unsigned int size_Q_prime = 0; - Q_prime[0] = closure(initial, epsilon_tag); - - Q_prime_inv[Q_prime[0]] = 0; - R[0].insert(0); + { + // If there are any weighted epsilon transitions from the initial state + // then we need to keep them so as to have the correct total weight. + auto init_prime = weighted_closure(initial, epsilon_tag); + if (init_prime.size() > 1 || init_prime.find(default_weight) == init_prime.end()) { + std::set empty; + Q_prime[0] = empty; + Q_prime_inv[empty] = 0; + transitions_prime[0].clear(); + for (auto& it : init_prime) { + int state = transitions_prime.size(); + Q_prime[state] = it.second; + Q_prime_inv[it.second] = state; + R[0].insert(state); + transitions_prime[state].clear(); + transitions_prime[0].insert(std::make_pair(epsilon_tag, + std::make_pair(state, it.first))); + } + } else { + Q_prime[0] = init_prime[default_weight]; + Q_prime_inv[Q_prime[0]] = 0; + R[0].insert(0); + } + } int initial_prime = 0; std::map finals_prime; @@ -366,11 +419,12 @@ Transducer::determinize(int const epsilon_tag) { if(it3.first != epsilon_tag) { - auto c = closure(it3.second.first, epsilon_tag); + auto c = weighted_closure(it3.second.first, epsilon_tag); for(auto& it4 : c) { - mymap[std::make_pair(it3.first, it3.second.second)].insert(it4); + auto& mp = mymap[std::make_pair(it3.first, it3.second.second+it4.first)]; + mp.insert(it4.second.begin(), it4.second.end()); } } } @@ -384,7 +438,7 @@ Transducer::determinize(int const epsilon_tag) int tag = Q_prime.size(); Q_prime[tag] = it2.second; Q_prime_inv[it2.second] = tag; - R[(t+1)%2].insert(Q_prime_inv[it2.second]); + R[(t+1)%2].insert(tag); transitions_prime[tag].clear(); } transitions_prime[it].insert(std::make_pair(it2.first.first, diff --git a/lttoolbox/transducer.h b/lttoolbox/transducer.h index 154588e3..ab14f9a7 100644 --- a/lttoolbox/transducer.h +++ b/lttoolbox/transducer.h @@ -221,6 +221,14 @@ class Transducer */ std::set closure(int const state, std::set const &epsilon_tags) const; + /** + * Returns the epsilon closure of a given state with weights + * Each set in the returned map can be reached from state via + * 0 or more epsilon_tag transitions, where the key is the minimum + * non-negative sum of edge weights to get there. + */ + std::map> weighted_closure(const int state, const int epsilon_tag) const; + /** * Join all finals in one using epsilon transductions * @param epsilon_tag the tag to take as epsilon diff --git a/tests/data/more-entry-weights.dix b/tests/data/more-entry-weights.dix new file mode 100644 index 00000000..38628fd8 --- /dev/null +++ b/tests/data/more-entry-weights.dix @@ -0,0 +1,48 @@ + + + ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- + + + + + + + + + + + + + + + + + + + + + +

e e

+

e e

+

ed e

+

ing e

+

ing e

+

ing e

+

e e

+

es e

+

ed e

+
+ +

+

's 's

+

s

+

s' 's

+
+ +
+ +
+ house + hous +
+
diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py index 12bd1b1e..f5e7974b 100644 --- a/tests/lt_proc/__init__.py +++ b/tests/lt_proc/__init__.py @@ -419,5 +419,11 @@ class BufferIndex(ProcTest): "^ab/*ab$", ] +class ConflictingEntryWeights(ProcTest): + procflags = ['-W'] + procdix = 'data/more-entry-weights.dix' + inputs = ['house'] + expectedOutputs = ['^house/house/house/house/house$'] + # These fail on some systems: #from null_flush_invalid_stream_format import *