Skip to content

Commit

Permalink
handle weights more correctly (closes #44)
Browse files Browse the repository at this point in the history
* don't lose weights when minimizing
* don't lose weights while compiling
* don't duplicate weights while compiling
* ensure that final weight after joinFinals() is always 0
* version bump so lexd can depend on this
  • Loading branch information
mr-martian committed Jul 6, 2022
1 parent 655955b commit b040536
Show file tree
Hide file tree
Showing 7 changed files with 149 additions and 16 deletions.
2 changes: 1 addition & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ AC_PREREQ(2.52)

m4_define([PKG_VERSION_MAJOR], [3])
m4_define([PKG_VERSION_MINOR], [6])
m4_define([PKG_VERSION_PATCH], [9])
m4_define([PKG_VERSION_PATCH], [10])

AC_INIT([lttoolbox], [PKG_VERSION_MAJOR.PKG_VERSION_MINOR.PKG_VERSION_PATCH], [apertium-stuff@lists.sourceforge.net], [lttoolbox], [https://wiki.apertium.org/wiki/Lttoolbox])

Expand Down
23 changes: 16 additions & 7 deletions lttoolbox/compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ Compiler::matchTransduction(std::vector<int> const &pi,

if(pi.size() == 0 && pd.size() == 0)
{
state = t.insertNewSingleTransduction(alphabet(0, 0), state, default_weight);
state = t.insertNewSingleTransduction(alphabet(0, 0), state, entry_weight);
}
else
{
Expand Down Expand Up @@ -318,19 +318,19 @@ Compiler::matchTransduction(std::vector<int> const &pi,
right++;
}

double weight_value;
int new_state;
double weight_value = default_weight;

if(left == limleft && right == limright)
{
weight_value = entry_weight;
new_state = t.insertNewSingleTransduction(tag, state, entry_weight);
}
else
{
weight_value = default_weight;
new_state = t.insertSingleTransduction(tag, state, default_weight);
}

int new_state = t.insertSingleTransduction(tag, state, weight_value);

if (is_separable) {
// loop-back symbols for <ANY_TAG> and <ANY_CHAR>
if (tag == alphabet(0, any_tag) || tag == alphabet(0, any_char)) {
Expand Down Expand Up @@ -705,12 +705,12 @@ Compiler::insertEntryTokens(std::vector<EntryToken> const &elements)
// suffix paradigm
if(suffix_paradigms[current_section].find(elements[i].paradigmName()) != suffix_paradigms[current_section].end())
{
t.linkStates(e, suffix_paradigms[current_section][elements[i].paradigmName()], 0, elements[i].entryWeight());
t.linkStates(e, suffix_paradigms[current_section][elements[i].paradigmName()], 0, default_weight);
e = postsuffix_paradigms[current_section][elements[i].paradigmName()];
}
else
{
e = t.insertNewSingleTransduction(alphabet(0, 0), e, elements[i].entryWeight());
e = t.insertNewSingleTransduction(alphabet(0, 0), e, default_weight);
suffix_paradigms[current_section][elements[i].paradigmName()] = e;
e = t.insertTransducer(e, paradigms[elements[i].paradigmName()]);
postsuffix_paradigms[current_section][elements[i].paradigmName()] = e;
Expand Down Expand Up @@ -849,14 +849,17 @@ Compiler::procEntry()
if(name == COMPILER_PAIR_ELEM)
{
elements.push_back(procTransduction(weight));
weight = 0.000;
}
else if(name == COMPILER_IDENTITY_ELEM)
{
elements.push_back(procIdentity(weight, false));
weight = 0.000;
}
else if(name == COMPILER_IDENTITYGROUP_ELEM)
{
elements.push_back(procIdentity(weight, true));
weight = 0.000;
}
else if(name == COMPILER_REGEXP_ELEM)
{
Expand Down Expand Up @@ -890,6 +893,12 @@ Compiler::procEntry()
}
else if(name == COMPILER_ENTRY_ELEM && type == XML_READER_TYPE_END_ELEMENT)
{
if (weight > 0.000) {
EntryToken temp;
std::vector<int> empty;
temp.setSingleTransduction(empty, empty, weight);
elements.push_back(temp);
}
// insert elements into letter transducer
insertEntryTokens(elements);
return;
Expand Down
75 changes: 68 additions & 7 deletions lttoolbox/transducer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,39 @@ Transducer::closure(int const state, std::set<int> const &epsilon_tags) const
return result;
}

/**
 * Compute the weighted epsilon closure of a state.
 *
 * Starting from `state` (at weight 0), follows transitions labelled
 * `epsilon_tag` and records, for every state reached, the lowest
 * accumulated weight found. An accumulated weight that would drop below
 * zero is clamped to zero, so a negative epsilon cycle cannot keep
 * lowering weights forever.
 *
 * @param state the state to start from
 * @param epsilon_tag the tag to take as epsilon
 * @return the reached states (including `state` itself, under key 0),
 *         grouped by their minimum accumulated weight
 */
std::map<double, std::set<int>>
Transducer::weighted_closure(const int state, const int epsilon_tag) const
{
  std::map<int, double> best;   // state -> lowest weight found so far
  std::set<int> todo;           // states whose outgoing edges need relaxing
  best[state] = 0.000;
  todo.insert(state);

  while (!todo.empty()) {
    // Dequeue *before* relaxing: if an epsilon self-loop lowers this state's
    // own weight, the insert below must genuinely re-queue it so that its
    // successors are relaxed again with the improved weight.
    const int st = *todo.begin();
    todo.erase(todo.begin());
    const double base = best.at(st);

    const auto range = transitions.at(st).equal_range(epsilon_tag);
    for (auto it = range.first; it != range.second; ++it) {
      const int target = it->second.first;
      double w = it->second.second + base;
      if (w < 0.000) {
        // ignore negative epsilon cycles
        w = 0.000;
      }

      auto known = best.find(target);
      if (known == best.end()) {
        best.insert(std::make_pair(target, w));
        todo.insert(target);
      } else if (w < known->second) { // we want the lowest weight path
        known->second = w;
        todo.insert(target);
      }
    }
  }

  // Invert the state -> weight table into weight -> {states}.
  std::map<double, std::set<int>> ret;
  for (const auto& entry : best) {
    ret[entry.second].insert(entry.first);
  }
  return ret;
}

void
Transducer::joinFinals(int const epsilon_tag)
{
Expand All @@ -281,6 +314,13 @@ Transducer::joinFinals(int const epsilon_tag)
std::cerr << "Error: empty set of final states" << std::endl;
exit(EXIT_FAILURE);
}
else if (finals.begin()->second > default_weight) {
int state = newState();
linkStates(finals.begin()->first, state, epsilon_tag,
finals.begin()->second);
finals.clear();
finals.insert(std::make_pair(state, default_weight));
}
}

bool
Expand Down Expand Up @@ -321,10 +361,30 @@ Transducer::determinize(int const epsilon_tag)
std::map<int, std::multimap<int, std::pair<int, double> > > transitions_prime;

unsigned int size_Q_prime = 0;
Q_prime[0] = closure(initial, epsilon_tag);

Q_prime_inv[Q_prime[0]] = 0;
R[0].insert(0);
{
// If there are any weighted epsilon transitions from the initial state
// then we need to keep them so as to have the correct total weight.
auto init_prime = weighted_closure(initial, epsilon_tag);
if (init_prime.size() > 1 || init_prime.find(default_weight) == init_prime.end()) {
std::set<int> empty;
Q_prime[0] = empty;
Q_prime_inv[empty] = 0;
transitions_prime[0].clear();
for (auto& it : init_prime) {
int state = transitions_prime.size();
Q_prime[state] = it.second;
Q_prime_inv[it.second] = state;
R[0].insert(state);
transitions_prime[state].clear();
transitions_prime[0].insert(std::make_pair(epsilon_tag,
std::make_pair(state, it.first)));
}
} else {
Q_prime[0] = init_prime[default_weight];
Q_prime_inv[Q_prime[0]] = 0;
R[0].insert(0);
}
}

int initial_prime = 0;
std::map<int, double> finals_prime;
Expand Down Expand Up @@ -366,11 +426,12 @@ Transducer::determinize(int const epsilon_tag)
{
if(it3.first != epsilon_tag)
{
auto c = closure(it3.second.first, epsilon_tag);
auto c = weighted_closure(it3.second.first, epsilon_tag);

for(auto& it4 : c)
{
mymap[std::make_pair(it3.first, it3.second.second)].insert(it4);
auto& mp = mymap[std::make_pair(it3.first, it3.second.second+it4.first)];
mp.insert(it4.second.begin(), it4.second.end());
}
}
}
Expand All @@ -384,7 +445,7 @@ Transducer::determinize(int const epsilon_tag)
int tag = Q_prime.size();
Q_prime[tag] = it2.second;
Q_prime_inv[it2.second] = tag;
R[(t+1)%2].insert(Q_prime_inv[it2.second]);
R[(t+1)%2].insert(tag);
transitions_prime[tag].clear();
}
transitions_prime[it].insert(std::make_pair(it2.first.first,
Expand Down
9 changes: 9 additions & 0 deletions lttoolbox/transducer.h
Original file line number Diff line number Diff line change
Expand Up @@ -221,8 +221,17 @@ class Transducer
*/
std::set<int> closure(int const state, std::set<int> const &epsilon_tags) const;

/**
 * Returns the epsilon closure of a given state with weights.
 * Each set in the returned map can be reached from state via
 * 0 or more epsilon_tag transitions, where the key is the minimum
 * non-negative sum of edge weights to get there.
 * The starting state itself is always included, under weight 0.
 * @param state the state to compute the closure from
 * @param epsilon_tag the tag to take as epsilon
 * @return map from accumulated weight to the set of states whose
 *         lowest-weight epsilon path from state has that weight
 */
std::map<double, std::set<int>> weighted_closure(const int state, const int epsilon_tag) const;

/**
 * Join all finals into one using epsilon transductions.
 * Ensures that the resulting transducer has exactly 1 final state,
 * and that this final state has weight 0.
 * @param epsilon_tag the tag to take as epsilon (defaults to 0)
 */
void joinFinals(int const epsilon_tag = 0);
Expand Down
48 changes: 48 additions & 0 deletions tests/data/more-entry-weights.dix
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
<?xml version="1.0" encoding="UTF-8"?>
<dictionary>
<alphabet>ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­-</alphabet>
<sdefs>
<sdef n="n"/>
<sdef n="vblex"/>
<sdef n="m"/>
<sdef n="pr"/>
<sdef n="def"/>
<sdef n="inf"/>
<sdef n="imp"/>
<sdef n="pp"/>
<sdef n="pprs"/>
<sdef n="ger"/>
<sdef n="subs"/>
<sdef n="pres"/>
<sdef n="past"/>
<sdef n="p3"/>
<sdef n="sg"/>
<sdef n="pl"/>
<sdef n="gen"/>
</sdefs>
<pardefs>
<pardef n="liv/e__vblex">
<e w="1"> <p><l>e</l> <r>e<s n="vblex"/><s n="inf"/></r></p></e>
<e w="2"> <p><l>e</l> <r>e<s n="vblex"/><s n="imp"/></r></p></e>
<e> <p><l>ed</l> <r>e<s n="vblex"/><s n="pp"/></r></p></e>
<e w="1"> <p><l>ing</l> <r>e<s n="vblex"/><s n="pprs"/></r></p></e>
<e w="3"> <p><l>ing</l> <r>e<s n="vblex"/><s n="ger"/></r></p></e>
<e w="2"> <p><l>ing</l> <r>e<s n="vblex"/><s n="subs"/></r></p></e>
<e> <p><l>e</l> <r>e<s n="vblex"/><s n="pres"/></r></p></e>
<e> <p><l>es</l> <r>e<s n="vblex"/><s n="pres"/><s n="p3"/><s n="sg"/></r></p></e>
<e> <p><l>ed</l> <r>e<s n="vblex"/><s n="past"/></r></p></e>
</pardef>
<pardef n="house__n">
<e> <p><l></l> <r><s n="n"/><s n="sg"/></r></p></e>
<e r="RL"><p><l>'s</l> <r><s n="n"/><s n="sg"/><j/>'s<s n="gen"/></r></p></e>
<e> <p><l>s</l> <r><s n="n"/><s n="pl"/></r></p></e>
<e r="RL"><p><l>s'</l> <r><s n="n"/><s n="pl"/><j/>'s<s n="gen"/></r></p></e>
</pardef>

</pardefs>

<section id="main" type="standard">
<e lm="house" w="1"> <i>house</i><par n="house__n"/></e>
<e lm="house" w="2"> <i>hous</i><par n="liv/e__vblex"/></e>
</section>
</dictionary>
2 changes: 1 addition & 1 deletion tests/lt_print/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class NonWeightedFst(unittest.TestCase, PrintTest):
class WeightedFst(unittest.TestCase, PrintTest):
printdix = "data/cat-weight.att"
printdir = "lr"
expectedOutput = "0\t1\tc\tc\t4.567895\t\n1\t2\ta\ta\t0.989532\t\n2\t3\tt\tt\t2.796193\t\n3\t4\tε\t+\t0.824564\t\n4\t5\tε\tn\t1.824564\t\n4\t5\tε\tv\t2.856296\t\n5\t0.525487\n"
expectedOutput = "0\t1\tc\tc\t4.567895\t\n1\t2\ta\ta\t0.989532\t\n2\t3\tt\tt\t2.796193\t\n3\t4\tε\t+\t0.824564\t\n4\t5\tε\tn\t1.824564\t\n4\t5\tε\tv\t2.856296\t\n5\t6\tε\tε\t0.525487\t\n6\t0.000000\n"


class NegativeWeightedFst(unittest.TestCase, PrintTest):
Expand Down
6 changes: 6 additions & 0 deletions tests/lt_proc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,5 +419,11 @@ class BufferIndex(ProcTest):
"^ab/*ab$",
]

# Regression test for entries whose own weight and whose paradigm's entry
# weights are both set (see data/more-entry-weights.dix).
# Each expected <W:...> value is the section entry's w plus the matching
# pardef entry's w, e.g. "hous" (w=2) + "e ... <inf>" (w=1) -> 3.
class ConflictingEntryWeights(ProcTest):
# NOTE(review): '-W' presumably turns on weight display in the output — confirm
procflags = ['-W']
procdix = 'data/more-entry-weights.dix'
inputs = ['house']
# Analyses are expected in ascending weight order: 1 (n sg), 2 (pres), 3 (inf), 4 (imp).
expectedOutputs = ['^house/house<n><sg><W:1.000000>/house<vblex><pres><W:2.000000>/house<vblex><inf><W:3.000000>/house<vblex><imp><W:4.000000>$']

# These fail on some systems:
#from null_flush_invalid_stream_format import *

0 comments on commit b040536

Please sign in to comment.