lt-reweight: assign weights to a compiled transducer based on a corpus

Write the utility to process tagged corpus and the binary lttoolbox file and return weighted analyses. Closes apertium#16
Techievena · Jul 14, 2018 · 847e2cc · 847e2cc
1 parent 1b44b8a
commit 847e2cc
Show file tree

Hide file tree

Showing 6 changed files with 319 additions and 28 deletions.
diff --git a/README b/README
@@ -2,32 +2,35 @@ README FILE OF LTTOOLBOX
 
 1. Programs
 
-	lt-comp   -> compiler, execute without parameters to show the
-                     instructions of use.
+    lt-comp     -> compiler, execute without parameters to show the
+                   instructions of use.
 
-	lt-proc   -> processor, works with options -a (lexical analyser, 
-                     default option), -g (lexical generator) and -p 
-                    (lexical post-generator).  Using -h will show all 
-                    flags.
+    lt-proc     -> processor, works with options -a (lexical analyser, 
+                   default option), -g (lexical generator) and -p 
+                   (lexical post-generator).  Using -h will show all 
+                   flags.
 
-	lt-expand -> generates all the pairs of transductions of a given
-	             dictionary.  Execute without parameters to show the
-                     instructions of use.
+    lt-expand   -> generates all the pairs of transductions of a given
+                   dictionary.  Execute without parameters to show the
+                   instructions of use.
 
-	lt-trim   -> (experimental) trims a compiled analyser to only
-	             contain entries which would pass through a compiled
-		     bidix, creating a new compiled and trimmed analyser.
+    lt-trim     -> (experimental) trims a compiled analyser to only
+                   contain entries which would pass through a compiled
+                   bidix, creating a new compiled and trimmed analyser.
+
+    lt-reweight -> (experimental) utility to assign weights to a
+                   compiled transducer based on a corpus.
 
 
 2. Install
 
-	a. Requirements:
-	        - i686, ppc, SPARC, etc.
-		- g++ >= 2.95
-		- gnu make
-		- libxml2 (last version)
+    a. Requirements:
+        - i686, ppc, SPARC, etc.
+        - g++ >= 2.95
+        - gnu make
+        - libxml2 (last version)
 
-	b. Building & installing
-		- ./configure
-		- make
-		- make install (o make-install script)
+    b. Building & installing
+        - ./configure
+        - make
+        - make install (or make-install script)
diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am
@@ -14,7 +14,7 @@ cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.
 library_includedir = $(includedir)/$(GENERIC_LIBRARY_NAME)-$(GENERIC_API_VERSION)/$(GENERIC_LIBRARY_NAME)
 library_include_HEADERS = $(h_sources)
 
-bin_PROGRAMS = lt-comp lt-proc lt-expand lt-tmxcomp lt-tmxproc lt-print lt-trim
+bin_PROGRAMS = lt-comp lt-proc lt-expand lt-tmxcomp lt-tmxproc lt-print lt-trim lt-reweight
 instdir = lttoolbox
 
 lib_LTLIBRARIES= liblttoolbox3.la
@@ -55,6 +55,10 @@ lt_tmxproc_SOURCES = lt_tmxproc.cc
 lt_tmxproc_LDADD = liblttoolbox$(GENERIC_MAJOR_VERSION).la
 lt_tmxproc_LDFLAGS = -llttoolbox$(GENERIC_MAJOR_VERSION) $(LTTOOLBOX_LIBS)
 
+lt_reweight_SOURCES = lt_reweight.cc
+lt_reweight_LDADD = liblttoolbox$(GENERIC_MAJOR_VERSION).la
+lt_reweight_LDFLAGS = -llttoolbox$(GENERIC_MAJOR_VERSION) $(LTTOOLBOX_LIBS)
+
 #lt-validate-dictionary: Makefile.am validate-header.sh
 #	@echo "Creating lt-validate-dictionary script"
 #	@echo "#!$(BASH)" > $@
@@ -65,7 +69,7 @@ lt_tmxproc_LDFLAGS = -llttoolbox$(GENERIC_MAJOR_VERSION) $(LTTOOLBOX_LIBS)
 
 
 
-man_MANS = lt-comp.1 lt-expand.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1
+man_MANS = lt-comp.1 lt-expand.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 lt-reweight.1
 
 INCLUDES = -I$(top_srcdir) $(LTTOOLBOX_CFLAGS)
 if WINDOWS

diff --git a/lttoolbox/lt-reweight.1 b/lttoolbox/lt-reweight.1
@@ -0,0 +1,66 @@
+.TH lt-reweight 1 2014-02-07 "" ""
+.SH NAME
+lt-reweight \- This application is part of the lexical processing modules
+and tools (
+.B lttoolbox
+)
+.PP
+This tool is part of the apertium machine translation
+architecture: \fBhttp://www.apertium.org\fR.
+.SH SYNOPSIS
+.B lt-reweight
+analyser_binary bidix_binary trimmed_analyser_binary
+.PP
+.SH DESCRIPTION
+.BR lt-trim
+is the application responsible for trimming compiled dictionaries. The
+analyses (right-side when compiling lr) of analyser_binary are trimmed
+to the input side of bidix_binary (left-side when compiling lr,
+right-side when compiling rl), such that only analyses which would
+pass through `lt-proc \-b bidix_binary' are kept.
+
+\fBWarning: this program is experimental!\fR It has been tested, but
+not deployed extensively yet.
+
+Both compound tags (`<compound-only-L>', `<compound-R>') and join
+elements (`<j/>' in XML, `+' in the stream) and the group element
+(`<g/>' in XML, `#' in the stream) should be handled correctly, even
+combinations of + followed by # in monodix are handled.
+
+Some minor caveats: If you have the capitalised lemma "Foo" in the
+monodix, but "foo" in the bidix, an analysis "^Foo<tag>$" would pass
+through bidix when doing lt-proc \-b, but will not make it through
+trimming. Make sure your lemmas have the same capitalisation in the
+different dictionaries. Also, you should not have literal `+' or `#'
+in your lemmas. Since lt-comp doesn't escape these, lt-trim cannot
+know that they are different from `<j/>' or `<g/>', and you may get
+@-marked output this way. You can analyse `+' or `#' by having the
+literal symbol in the `<l>' part and some other string (e.g. "plus")
+in the `<r>'.
+
+You should not trim a generator unless you have a \fBvery\fR simple
+translator pipeline, since the output of bidix seldom goes unchanged
+through transfer.
+.PP
+.SH FILES
+.B analyser_binary
+The untrimmed analyser dictionary (a finite state transducer).
+.PP
+.B bidix_binary
+The dictionary to use as trimmer (a finite state transducer).
+.PP
+.B trimmed_analyser_binary
+The trimmed analyser dictionary (a finite state transducer).
+
+.SH SEE ALSO
+.I lt-comp\fR(1),
+.I lt-proc\fR(1),
+.I lt-print\fR(1),
+.I lt-expand\fR(1),
+.I lt-trim\fR(1),
+.I apertium-tagger\fR(1),
+.I apertium\fR(1).
+.SH BUGS
+Lots of...lurking in the dark and waiting for you!
+.SH AUTHOR
+(c) 2013--2014 Universitat d'Alacant / Universidad de Alicante.
diff --git a/lttoolbox/lt_comp.cc b/lttoolbox/lt_comp.cc
@@ -36,7 +36,7 @@ using namespace std;
 void errorFunc(void *ctx, const char *msg, ...) 
 {
   return;
-} 
+}
 
 void endProgram(char *name)
 {
@@ -166,7 +166,7 @@ int main(int argc, char *argv[])
     exit(EXIT_FAILURE);
   }
   initGenericErrorDefaultFunc(NULL);
-  
+
 
   if(opc == "lr")
   {
@@ -218,7 +218,7 @@ int main(int argc, char *argv[])
     {
       LtLocale::tryToSetLocale();
       c.parse(infile, Compiler::COMPILER_RESTRICTION_RL_VAL);
-    }    
+    }
   }
   else
   {

diff --git a/lttoolbox/lt_reweight.cc b/lttoolbox/lt_reweight.cc
@@ -0,0 +1,218 @@
+/*
+ * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include <lttoolbox/transducer.h>
+#include <lttoolbox/compression.h>
+#include <lttoolbox/lttoolbox_config.h>
+
+#include <lttoolbox/my_stdio.h>
+#include <lttoolbox/lt_locale.h>
+
+#include <cstdlib>
+#include <iostream>
+#include <libgen.h>
+#include <string>
+
+/*
+ * Print the version banner and the usage line to standard output, then
+ * terminate the process with EXIT_FAILURE.  When name is NULL nothing is
+ * printed and the process still exits with EXIT_FAILURE.
+ *
+ * name: program path (normally argv[0]); only its basename is shown.
+ *
+ * NOTE(review): the usage line lists two arguments, but main() also
+ * reads an output path from argv[3] -- confirm the intended interface.
+ */
+void endProgram(char *name)
+{
+  if(name == NULL)
+  {
+    exit(EXIT_FAILURE);
+  }
+
+  // Resolve the basename once; both output lines show the same value.
+  const char *program = basename(name);
+  cout << program << " v" << PACKAGE_VERSION <<": assign weights to a compiled transducer based on a corpus." << endl;
+  cout << "USAGE: " << program << " analyser_bin_file tagged_corpus" << endl;
+  exit(EXIT_FAILURE);
+}
+
+/*
+ * Deserialize a compiled dictionary from bin_file.
+ *
+ * The stream is consumed in this order:
+ *   1. a count followed by that many letters,
+ *   2. the alphabet (via Alphabet::read),
+ *   3. a section count, then for each section a length-prefixed name
+ *      followed by the section's transducer (via Transducer::read).
+ *
+ * Returns ((alphabet, letters), sections keyed by name).  No end-of-file
+ * or read-error checking is done in this function.
+ */
+std::pair<std::pair<Alphabet, wstring>, std::map<wstring, Transducer> >
+read_fst(FILE *bin_file)
+{
+  typedef std::pair<Alphabet, wstring> AlphabetAndLetters;
+  typedef std::map<wstring, Transducer> SectionMap;
+
+  Alphabet alphabet;
+  wstring letters;
+  SectionMap sections;
+
+  // Letters: a count, then one multibyte-encoded character per entry.
+  for(int remaining = Compression::multibyte_read(bin_file); remaining > 0; remaining--)
+  {
+    letters += static_cast<wchar_t>(Compression::multibyte_read(bin_file));
+  }
+
+  // Symbols.
+  alphabet.read(bin_file);
+
+  // Sections: a count, then (name, transducer) records.
+  for(int n_sections = Compression::multibyte_read(bin_file); n_sections > 0; n_sections--)
+  {
+    wstring section_name;
+    for(int name_len = Compression::multibyte_read(bin_file); name_len > 0; name_len--)
+    {
+      section_name.push_back(static_cast<wchar_t>(Compression::multibyte_read(bin_file)));
+    }
+    // operator[] creates the entry, which is then filled from the stream.
+    sections[section_name].read(bin_file);
+  }
+
+  return std::pair<AlphabetAndLetters, SectionMap>(AlphabetAndLetters(alphabet, letters), sections);
+}
+
+/*
+ * Read two compiled dictionaries and intersect every section of the
+ * first (file_mono) with a prefix transducer built from the second
+ * (file_bi).
+ *
+ * For each section of the first dictionary its name, size and number of
+ * transitions are written to wcout.  A section with no transitions, or
+ * whose intersection has no final states, is cleared and a warning is
+ * written to wcerr; otherwise the section is replaced by the minimized
+ * intersection.
+ *
+ * Returns the first dictionary's ((alphabet, letters), sections) with
+ * the sections updated as described above.
+ */
+std::pair<std::pair<Alphabet, wstring>, std::map<wstring, Transducer> >
+trim(FILE *file_mono, FILE *file_bi)
+{
+  typedef std::map<wstring, Transducer> SectionMap;
+  typedef std::pair<std::pair<Alphabet, wstring>, SectionMap> Dictionary;
+
+  Dictionary mono = read_fst(file_mono);
+  Alphabet alph_mono = mono.first.first;
+  SectionMap mono_sections = mono.second;
+
+  Dictionary bi = read_fst(file_bi);
+  Alphabet alph_bi = bi.first.first;
+  SectionMap bi_sections = bi.second;
+
+  // Union of all bidix sections; ".*" is appended to it further down.
+  Transducer bidix_union;
+
+  // The "." of ".*": identity pairs over the output symbols of the
+  // monodix alphabet (<n>:<n> etc.).  The ints index into alph_prefix.
+  Alphabet alph_prefix = alph_bi;
+  set<int> loopback_symbols;
+  alph_prefix.createLoopbackSymbols(loopback_symbols, alph_mono, Alphabet::right);
+
+  SectionMap::iterator bi_it = bi_sections.begin();
+  while(bi_it != bi_sections.end())
+  {
+    Transducer section = bi_it->second;
+    if(!bidix_union.isEmpty())
+    {
+      bidix_union.unionWith(alph_bi, section);
+    }
+    else
+    {
+      // First non-empty contribution simply becomes the union.
+      bidix_union = section;
+    }
+    bi_it++;
+  }
+  bidix_union.minimize();
+
+  // The prefix transducer is deliberately not minimized (useless here
+  // and very slow).
+  Transducer prefix = bidix_union.appendDotStar(loopback_symbols);
+  Transducer moved = prefix.moveLemqsLast(alph_prefix);
+
+  for(SectionMap::iterator sec = mono_sections.begin(); sec != mono_sections.end(); sec++)
+  {
+    Transducer trimmed = sec->second.intersect(moved,
+                                               alph_mono,
+                                               alph_prefix);
+
+    // Statistics describe the section as read, before it is replaced.
+    wcout << sec->first << " " << sec->second.size();
+    wcout << " " << sec->second.numberOfTransitions() << endl;
+
+    if(sec->second.numberOfTransitions() == 0)
+    {
+      wcerr << L"Warning: empty section! Skipping it ..."<<endl;
+      sec->second.clear();
+      continue;
+    }
+
+    if(trimmed.hasNoFinals())
+    {
+      wcerr << L"Warning: section had no final state after trimming! Skipping it ..."<<endl;
+      sec->second.clear();
+      continue;
+    }
+
+    trimmed.minimize();
+    sec->second = trimmed;
+  }
+
+  mono.second = mono_sections;
+  return mono;
+}
+
+
+/*
+ * Entry point.  Opens the compiled analyser (argv[1]) and a second
+ * compiled dictionary (argv[2]), runs trim() on the pair and writes the
+ * non-empty resulting sections to argv[3] in this order: letters,
+ * alphabet, section count, then each section name followed by its
+ * transducer.
+ *
+ * Exits with EXIT_FAILURE when the argument count is wrong, when a file
+ * cannot be opened, when every section is empty after trimming, or when
+ * the output file cannot be closed cleanly.  Returns 0 on success.
+ *
+ * NOTE(review): the usage text printed by endProgram() names only two
+ * arguments, while this function reads an output path from argv[3];
+ * the two should be reconciled.
+ */
+int main(int argc, char *argv[])
+{
+  // argv[1], argv[2] and argv[3] are all used below, so exactly three
+  // arguments are required.  Checking argc != 3 would let argv[3], which
+  // is then the terminating null pointer, reach fopen().
+  if(argc != 4)
+  {
+    wcerr << "Error: expected 3 arguments (analyser file, second input file, output file)." << endl;
+    endProgram(argv[0]);
+  }
+
+  LtLocale::tryToSetLocale();
+
+  FILE *analyser = fopen(argv[1], "rb");
+  if(!analyser)
+  {
+    wcerr << "Error: Cannot open file '" << argv[1] << "'." << endl << endl;
+    exit(EXIT_FAILURE);
+  }
+  FILE *bidix = fopen(argv[2], "rb");
+  if(!bidix)
+  {
+    wcerr << "Error: Cannot open file '" << argv[2] << "'." << endl << endl;
+    exit(EXIT_FAILURE);
+  }
+
+  std::pair<std::pair<Alphabet, wstring>, std::map<wstring, Transducer> > trimmed = trim(analyser, bidix);
+  // Bind by reference: the result is only read and written out, so
+  // copying the alphabet and every transducer again is unnecessary.
+  Alphabet &alph_t = trimmed.first.first;
+  wstring &letters = trimmed.first.second;
+  std::map<wstring, Transducer> &trans_t = trimmed.second;
+
+  // Only non-empty sections are written, so count them first; the count
+  // is stored in the file ahead of the sections themselves.
+  int n_transducers = 0;
+  for(std::map<wstring, Transducer>::iterator it = trans_t.begin(); it != trans_t.end(); it++)
+  {
+    if(!(it->second.isEmpty()))
+    {
+      n_transducers++;
+    }
+  }
+
+  if(n_transducers == 0)
+  {
+    wcerr << L"Error: Trimming gave empty transducer!" << endl;
+    exit(EXIT_FAILURE);
+  }
+
+  // Write the file.  It is opened only now so that a failed run does not
+  // create or truncate the output.
+  FILE *output = fopen(argv[3], "wb");
+  if(!output)
+  {
+    wcerr << "Error: Cannot open file '" << argv[3] << "'." << endl << endl;
+    exit(EXIT_FAILURE);
+  }
+
+  // letters
+  Compression::wstring_write(letters, output);
+
+  // symbols
+  alph_t.write(output);
+
+  // transducers
+  Compression::multibyte_write(n_transducers, output);
+  for(std::map<wstring, Transducer>::iterator it = trans_t.begin(); it != trans_t.end(); it++)
+  {
+    if(!(it->second.isEmpty()))
+    {
+      Compression::wstring_write(it->first, output);
+      it->second.write(output);
+    }
+  }
+
+  fclose(analyser);
+  fclose(bidix);
+  // fclose() flushes buffered data; a failure here means the output file
+  // may be incomplete, so report it instead of returning success.
+  if(fclose(output) != 0)
+  {
+    wcerr << "Error: Could not finish writing file '" << argv[3] << "'." << endl << endl;
+    exit(EXIT_FAILURE);
+  }
+
+  return 0;
+}