From 6c2486ab55ee3bb2b5ba1595c47681aa1d5e5bfe Mon Sep 17 00:00:00 2001
From: Daniel Swanson <awesomeevildudes@gmail.com>
Date: Fri, 22 Apr 2022 19:14:09 -0400
Subject: [PATCH] Split multichar non-tag symbols into multiple transitions in
 ATT

Multicharacter symbols (e.g. with combining diacritics) were previously
interpreted as tags, so their first and last characters were removed on
the assumption that they were `<` and `>`. This produced nonsensical
output, in addition to `lt-proc` completely ignoring such symbols on
the input side.

Closes #111
---
 configure.ac               |  2 +-
 lttoolbox/att_compiler.cc  | 87 +++++++++++++++++++++++---------------
 lttoolbox/att_compiler.h   | 24 +++++------
 tests/data/multichar.att   |  3 ++
 tests/lt_comp/__init__.py  |  5 +++
 tests/lt_print/__init__.py |  5 +++
 6 files changed, 78 insertions(+), 48 deletions(-)
 create mode 100644 tests/data/multichar.att

diff --git a/configure.ac b/configure.ac
index dcabc22b..90ac2a6f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@ AC_PREREQ(2.52)
 
 m4_define([PKG_VERSION_MAJOR], [3])
 m4_define([PKG_VERSION_MINOR], [6])
-m4_define([PKG_VERSION_PATCH], [3])
+m4_define([PKG_VERSION_PATCH], [4])
 
 AC_INIT([lttoolbox], [PKG_VERSION_MAJOR.PKG_VERSION_MINOR.PKG_VERSION_PATCH], [apertium-stuff@lists.sourceforge.net], [lttoolbox], [https://wiki.apertium.org/wiki/Lttoolbox])
 
diff --git a/lttoolbox/att_compiler.cc b/lttoolbox/att_compiler.cc
index 16c922b7..98dca788 100644
--- a/lttoolbox/att_compiler.cc
+++ b/lttoolbox/att_compiler.cc
@@ -31,9 +31,7 @@
 using namespace std;
 using namespace icu;
 
-AttCompiler::AttCompiler() :
-starting_state(0),
-default_weight(0.0000)
+AttCompiler::AttCompiler() // members now get their defaults from the in-class initializers in att_compiler.h
 {}
 
 AttCompiler::~AttCompiler()
@@ -87,40 +85,63 @@ AttCompiler::is_word_punct(UChar32 symbol)
   return false;
 }
 
-/**
- * Returns the code of the symbol in the alphabet. Run after convert_hfst has
- * run.
- *
- * Also adds all non-multicharacter symbols (letters) to the @p letters set.
- *
- * @return the code of the symbol, if @p symbol is multichar; its first (and
- *         only) character otherwise.
- */
-int
-AttCompiler::symbol_code(const UString& symbol)
+void
+AttCompiler::update_alphabet(UChar32 c)
 {
-  if (u_strHasMoreChar32Than(symbol.c_str(), -1, 1)) {
+  // Punctuation and whitespace are not letters unless is_word_punct()
+  // accepts them; `||` (not `&&`) so that either class alone excludes c.
+  if (is_word_punct(c) || !(u_ispunct(c) || u_isspace(c))) {
+    letters.insert(c);
+    // Also register the opposite-case form of cased characters.
+    if(u_islower(c)) letters.insert(u_toupper(c));
+    else if(u_isupper(c)) letters.insert(u_tolower(c));
+  }
+}
+
+void
+AttCompiler::symbol_code(const UString& symbol, vector<int32_t>& split)
+{
+  if (symbol.empty()) { // empty symbol: append the single code 0
+    split.push_back(0);
+  } else if (symbol.size() >= 2 && symbol[0] == '<' && symbol.back() == '>') { // <...> tag: kept whole as one alphabet symbol
     alphabet.includeSymbol(symbol);
-    return alphabet(symbol);
-  } else if (symbol.empty()) {
-    return 0;
+    split.push_back(alphabet(symbol));
   } else {
+    size_t i = 0; // UTF-16 offset of the next code point to decode
+    size_t end = symbol.size();
     UChar32 c;
-    U16_GET(symbol, 0, 0, symbol.size(), c);
-    if ((u_ispunct(c) || u_isspace(c)) && !is_word_punct(c)) {
-      return c;
-    } else {
-      letters.insert(c);
-      if(u_islower(c)) {
-        letters.insert(u_toupper(c));
-      } else if(u_isupper(c)) {
-        letters.insert(u_tolower(c));
-      }
-      return c;
+    while (i < end) { // any other symbol: append one code per code point
+      U16_NEXT(symbol.c_str(), i, end, c); // decodes into c and advances i past it
+      update_alphabet(c);
+      split.push_back(c);
     }
   }
 }
 
+void
+AttCompiler::add_transition(int from, int to,
+                            const UString& upper, const UString& lower,
+                            double weight)
+{
+  // Expand one ATT arc into a chain of single-symbol arcs linked by phantom states.
+  AttNode* node = get_node(from);
+  vector<int32_t> in_codes, out_codes;
+  symbol_code(upper, in_codes);
+  symbol_code(lower, out_codes);
+  for (size_t k = 0, steps = max(in_codes.size(), out_codes.size()); k < steps; k++) {
+    const int32_t in_sym = (k < in_codes.size()) ? in_codes[k] : 0;   // shorter side is padded with 0
+    const int32_t out_sym = (k < out_codes.size()) ? out_codes[k] : 0;
+    const bool is_final = (k + 1 == steps);
+    const int target = is_final ? to : -(++phantom_count);            // phantom states get fresh negative ids
+    UString in_str, out_str;
+    alphabet.getSymbol(in_str, in_sym);
+    alphabet.getSymbol(out_str, out_sym);
+    node->transductions.push_back(Transduction(target, in_str, out_str, alphabet(in_sym, out_sym), is_final ? weight : default_weight));
+    classify_single_transition(node->transductions.back());
+    node = get_node(target);  // continue the chain from the state just reached
+  }
+}
+
 void
 AttCompiler::parse(string const &file_name, bool read_rl)
 {
@@ -185,7 +206,7 @@ AttCompiler::parse(string const &file_name, bool read_rl)
     from = StringUtils::stoi(tokens[0]) + state_id_offset;
     largest_seen_state_id = max(largest_seen_state_id, from);
 
-    AttNode* source = get_node(from);
+    get_node(from);
     /* First line: the initial state is of both types. */
     if (first_line_in_fst)
     {
@@ -226,7 +247,6 @@ AttCompiler::parse(string const &file_name, bool read_rl)
       }
       convert_hfst(upper);
       convert_hfst(lower);
-      int tag = alphabet(symbol_code(upper), symbol_code(lower));
       if(tokens.size() > 4)
       {
         weight = StringUtils::stod(tokens[4]);
@@ -235,10 +255,7 @@ AttCompiler::parse(string const &file_name, bool read_rl)
       {
         weight = default_weight;
       }
-      source->transductions.push_back(Transduction(to, upper, lower, tag, weight));
-      classify_single_transition(source->transductions.back());
-
-      get_node(to);
+      add_transition(from, to, upper, lower, weight);
     }
   }
 
diff --git a/lttoolbox/att_compiler.h b/lttoolbox/att_compiler.h
index 7b4f0e78..dc545394 100644
--- a/lttoolbox/att_compiler.h
+++ b/lttoolbox/att_compiler.h
@@ -98,11 +98,14 @@ class AttCompiler
    * Id of the starting state. We assume it is the source state of the first
    * transduction in the file.
    */
-  int starting_state;
+  int starting_state = 0;
   /**
    * Default value of weight of a transduction unless specified.
    */
-  double default_weight;
+  double default_weight = 0.0000;
+
+  // how many phantom states have we created to split multichar symbols
+  int phantom_count = 0;
 
   Alphabet alphabet;
   /** All non-multicharacter symbols. */
@@ -181,16 +184,13 @@ class AttCompiler
    */
   void convert_hfst(UString& symbol);
 
-  /**
-   * Returns the code of the symbol in the alphabet. Run after convert_hfst has
-   * run.
-   *
-   * Also adds all non-multicharacter symbols (letters) to the @p letters set.
-   *
-   * @return the code of the symbol, if @p symbol is multichar; its first (and
-   *         only) character otherwise.
-   */
-  int symbol_code(const UString& symbol);
+  // if a character should be in the alphabet, add it
+  void update_alphabet(UChar32 c);
+  // convert a string to a symbol code, splitting non-tag multichars
+  void symbol_code(const UString& symbol, vector<int32_t>& split);
+  void add_transition(int from, int to,
+                      const UString& upper, const UString& lower,
+                      double weight);
 };
 
 #endif /* _MYATT_COMPILER_ */
diff --git a/tests/data/multichar.att b/tests/data/multichar.att
new file mode 100644
index 00000000..4d219e4d
--- /dev/null
+++ b/tests/data/multichar.att
@@ -0,0 +1,3 @@
+0	1	א	אַן
+1	2	@0@	<blah>
+2
diff --git a/tests/lt_comp/__init__.py b/tests/lt_comp/__init__.py
index 31303d7f..53280a69 100644
--- a/tests/lt_comp/__init__.py
+++ b/tests/lt_comp/__init__.py
@@ -43,3 +43,8 @@ class CompAttEpsilonLoopShouldError(ProcTest):
 class CompAttEpsilonToFinalShouldError(ProcTest):
     procdix = "data/cat-epsilon-to-final.att"
     expectedCompRetCodeFail = True
+
+class CompSplitMultichar(ProcTest):  # regression test for #111: a non-tag multichar symbol must survive on the output side
+    procdix = "data/multichar.att"
+    inputs = ["א"]
+    expectedOutputs = ["^א/אַן<blah>$"]
diff --git a/tests/lt_print/__init__.py b/tests/lt_print/__init__.py
index dd6b1299..7f78989f 100644
--- a/tests/lt_print/__init__.py
+++ b/tests/lt_print/__init__.py
@@ -19,3 +19,8 @@ class NegativeWeightedFst(unittest.TestCase, PrintTest):
     printdix = "data/cat-weight-negative.att"
     printdir = "lr"
     expectedOutput = "0\t1\tc\tc\t4.567895\t\n1\t2\ta\ta\t0.989532\t\n2\t3\tt\tt\t2.796193\t\n3\t4\tε\t+\t-0.824564\t\n4\t5\tε\tn\t1.824564\t\n4\t5\tε\tv\t2.856296\t\n5\t-0.525487\n"
+
+class MulticharCompFst(unittest.TestCase, PrintTest):  # multichar.att must print with the multichar symbol as one arc per code point
+    printdix = "data/multichar.att"
+    printdir = "lr"
+    expectedOutput = "0\t1\tא\tא\t0.000000\t\n1\t2\tε\tַ\t0.000000\t\n2\t3\tε\tן\t0.000000\t\n3\t4\tε\t<blah>\t0.000000\t\n4\t0.000000\n"