Skip to content

Commit

Permalink
Split multichar non-tag symbols into multiple transitions in ATT
Browse files Browse the repository at this point in the history
Multicharacter symbols (e.g. with combining diacritics) were previously
interpreted as tags, and thus had their first and last characters
removed on the assumption that they were `<` and `>`. This led to
nonsensical output, in addition to the issue of `lt-proc` completely
ignoring them on the input side.

Closes #111
  • Loading branch information
mr-martian committed Apr 22, 2022
1 parent 44c8c94 commit 6c2486a
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 48 deletions.
2 changes: 1 addition & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ AC_PREREQ(2.52)

m4_define([PKG_VERSION_MAJOR], [3])
m4_define([PKG_VERSION_MINOR], [6])
m4_define([PKG_VERSION_PATCH], [3])
m4_define([PKG_VERSION_PATCH], [4])

AC_INIT([lttoolbox], [PKG_VERSION_MAJOR.PKG_VERSION_MINOR.PKG_VERSION_PATCH], [apertium-stuff@lists.sourceforge.net], [lttoolbox], [https://wiki.apertium.org/wiki/Lttoolbox])

Expand Down
87 changes: 52 additions & 35 deletions lttoolbox/att_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,7 @@
using namespace std;
using namespace icu;

AttCompiler::AttCompiler() :
starting_state(0),
default_weight(0.0000)
AttCompiler::AttCompiler()
{}

AttCompiler::~AttCompiler()
Expand Down Expand Up @@ -87,40 +85,63 @@ AttCompiler::is_word_punct(UChar32 symbol)
return false;
}

/**
* Returns the code of the symbol in the alphabet. Run after convert_hfst has
* run.
*
* Also adds all non-multicharacter symbols (letters) to the @p letters set.
*
* @return the code of the symbol, if @p symbol is multichar; its first (and
* only) character otherwise.
*/
int
AttCompiler::symbol_code(const UString& symbol)
/**
 * Adds @p c to the `letters` set if it should be part of the alphabet.
 *
 * A character is left out only when it is punctuation or whitespace and is
 * not accepted by is_word_punct(); every other character is inserted. For a
 * cased character the opposite-case form is inserted as well.
 *
 * @param c the code point to consider.
 */
void
AttCompiler::update_alphabet(UChar32 c)
{
  // The inner test must be `||`: "punctuation OR whitespace" is what gets
  // excluded. With `&&` the negation would be true for virtually every
  // character, so punctuation and spaces would all be added to `letters`.
  if (is_word_punct(c) || !(u_ispunct(c) || u_isspace(c))) {
    letters.insert(c);
    if (u_islower(c)) {
      letters.insert(u_toupper(c));
    } else if (u_isupper(c)) {
      letters.insert(u_tolower(c));
    }
  }
}

/**
 * Converts @p symbol into alphabet codes and appends them to @p split:
 *  - an empty symbol contributes the single code 0;
 *  - a symbol that starts with '<' and ends with '>' (length >= 2) is
 *    registered through alphabet.includeSymbol() and contributes its one
 *    alphabet code;
 *  - any other symbol is walked one code point at a time with U16_NEXT; each
 *    code point is handed to update_alphabet() and appended as its own entry.
 *
 * NOTE(review): this span is a diff hunk. Lines belonging to the removed
 * int-returning version (the `return ...` statements, the second
 * `symbol.empty()` branch and the U16_GET branch) are interleaved with the
 * lines described above.
 */
void
AttCompiler::symbol_code(const UString& symbol, vector<int32_t>& split)
{
if (symbol.empty()) {
split.push_back(0);
} else if (symbol.size() >= 2 && symbol[0] == '<' && symbol.back() == '>') {
alphabet.includeSymbol(symbol);
return alphabet(symbol);
} else if (symbol.empty()) {
return 0;
split.push_back(alphabet(symbol));
} else {
size_t i = 0;
size_t end = symbol.size();
UChar32 c;
U16_GET(symbol, 0, 0, symbol.size(), c);
if ((u_ispunct(c) || u_isspace(c)) && !is_word_punct(c)) {
return c;
} else {
letters.insert(c);
if(u_islower(c)) {
letters.insert(u_toupper(c));
} else if(u_isupper(c)) {
letters.insert(u_tolower(c));
}
return c;
while (i < end) {
U16_NEXT(symbol.c_str(), i, end, c);
update_alphabet(c);
split.push_back(c);
}
}
}

void
AttCompiler::add_transition(int from, int to,
const UString& upper, const UString& lower,
double weight)
{
AttNode* src = get_node(from);
vector<int32_t> lsplit, rsplit;
symbol_code(upper, lsplit);
symbol_code(lower, rsplit);
for (size_t i = 0; i < lsplit.size() || i < rsplit.size(); i++) {
int32_t l = (lsplit.size() > i ? lsplit[i] : 0);
int32_t r = (rsplit.size() > i ? rsplit[i] : 0);
bool last = (i+1 >= lsplit.size() && i+1 >= rsplit.size());
int dest = (last ? to : -(++phantom_count));
UString ls, rs;
alphabet.getSymbol(ls, l);
alphabet.getSymbol(rs, r);
src->transductions.push_back(Transduction(dest, ls, rs, alphabet(l, r),
(last ? weight : default_weight)));
classify_single_transition(src->transductions.back());
src = get_node(dest);
}
}

void
AttCompiler::parse(string const &file_name, bool read_rl)
{
Expand Down Expand Up @@ -185,7 +206,7 @@ AttCompiler::parse(string const &file_name, bool read_rl)
from = StringUtils::stoi(tokens[0]) + state_id_offset;
largest_seen_state_id = max(largest_seen_state_id, from);

AttNode* source = get_node(from);
get_node(from);
/* First line: the initial state is of both types. */
if (first_line_in_fst)
{
Expand Down Expand Up @@ -226,7 +247,6 @@ AttCompiler::parse(string const &file_name, bool read_rl)
}
convert_hfst(upper);
convert_hfst(lower);
int tag = alphabet(symbol_code(upper), symbol_code(lower));
if(tokens.size() > 4)
{
weight = StringUtils::stod(tokens[4]);
Expand All @@ -235,10 +255,7 @@ AttCompiler::parse(string const &file_name, bool read_rl)
{
weight = default_weight;
}
source->transductions.push_back(Transduction(to, upper, lower, tag, weight));
classify_single_transition(source->transductions.back());

get_node(to);
add_transition(from, to, upper, lower, weight);
}
}

Expand Down
24 changes: 12 additions & 12 deletions lttoolbox/att_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,14 @@ class AttCompiler
* Id of the starting state. We assume it is the source state of the first
* transduction in the file.
*/
int starting_state;
int starting_state = 0;
/**
* Default value of weight of a transduction unless specified.
*/
double default_weight;
double default_weight = 0.0000;

// how many phantom states have we created to split multichar symbols
int phantom_count = 0;

Alphabet alphabet;
/** All non-multicharacter symbols. */
Expand Down Expand Up @@ -181,16 +184,13 @@ class AttCompiler
*/
void convert_hfst(UString& symbol);

/**
* Returns the code of the symbol in the alphabet. Run after convert_hfst has
* run.
*
* Also adds all non-multicharacter symbols (letters) to the @p letters set.
*
* @return the code of the symbol, if @p symbol is multichar; its first (and
* only) character otherwise.
*/
int symbol_code(const UString& symbol);
/**
 * If @p c should be in the alphabet, adds it to the set of letters, together
 * with its opposite-case form when it has one.
 */
void update_alphabet(UChar32 c);
/**
 * Converts @p symbol to symbol code(s), appending them to @p split. An empty
 * symbol yields the single code 0 and a `<...>` tag yields one alphabet code;
 * any other multicharacter symbol is split into one entry per code point.
 */
void symbol_code(const UString& symbol, vector<int32_t>& split);
/**
 * Adds the transduction @p upper : @p lower from state @p from to state
 * @p to with the given @p weight. When a symbol expands to several codes the
 * transduction is chained through intermediate phantom states.
 */
void add_transition(int from, int to,
const UString& upper, const UString& lower,
double weight);
};

#endif /* _MYATT_COMPILER_ */
3 changes: 3 additions & 0 deletions tests/data/multichar.att
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
0 1 א אַן
1 2 @0@ <blah>
2
5 changes: 5 additions & 0 deletions tests/lt_comp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,8 @@ class CompAttEpsilonLoopShouldError(ProcTest):
class CompAttEpsilonToFinalShouldError(ProcTest):
procdix = "data/cat-epsilon-to-final.att"
expectedCompRetCodeFail = True

# Compiles data/multichar.att, whose first transition has a multicharacter
# non-tag output symbol, and checks that analysing the single input letter
# yields the complete output string followed by the <blah> tag.
class CompSplitMultichar(ProcTest):
procdix = "data/multichar.att"
inputs = ["א"]
expectedOutputs = ["^א/אַן<blah>$"]
5 changes: 5 additions & 0 deletions tests/lt_print/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,8 @@ class NegativeWeightedFst(unittest.TestCase, PrintTest):
printdix = "data/cat-weight-negative.att"
printdir = "lr"
expectedOutput = "0\t1\tc\tc\t4.567895\t\n1\t2\ta\ta\t0.989532\t\n2\t3\tt\tt\t2.796193\t\n3\t4\tε\t+\t-0.824564\t\n4\t5\tε\tn\t1.824564\t\n4\t5\tε\tv\t2.856296\t\n5\t-0.525487\n"

# Prints the transducer compiled from data/multichar.att in "lr" direction and
# expects the multicharacter output symbol to appear as separate transitions:
# the extra output characters each sit on their own ε-input transition, and
# the <blah> tag follows on a further ε-input transition.
class MulticharCompFst(unittest.TestCase, PrintTest):
printdix = "data/multichar.att"
printdir = "lr"
expectedOutput = "0\t1\tא\tא\t0.000000\t\n1\t2\tε\tַ\t0.000000\t\n2\t3\tε\tן\t0.000000\t\n3\t4\tε\t<blah>\t0.000000\t\n4\t0.000000\n"

0 comments on commit 6c2486a

Please sign in to comment.