From 9b99a07221992b4f17e31258f87efd037fbaae18 Mon Sep 17 00:00:00 2001 From: Techievena Date: Tue, 31 Jul 2018 16:33:23 +0530 Subject: [PATCH 1/2] lt-reweight: assign weights to a compiled transducer based on a corpus Write the utility to process tagged corpus and the binary lttoolbox file and return weighted analyses. Closes https://github.com/apertium/lttoolbox/issues/16 --- README | 45 ++++---- lttoolbox/Makefile.am | 8 +- lttoolbox/lt-reweight.1 | 66 ++++++++++++ lttoolbox/lt_comp.cc | 6 +- lttoolbox/lt_reweight.cc | 218 +++++++++++++++++++++++++++++++++++++++ lttoolbox/lt_trim.cc | 4 +- 6 files changed, 319 insertions(+), 28 deletions(-) create mode 100644 lttoolbox/lt-reweight.1 create mode 100644 lttoolbox/lt_reweight.cc diff --git a/README b/README index 1915429d..caf3d92f 100644 --- a/README +++ b/README @@ -2,32 +2,35 @@ README FILE OF LTTOOLBOX 1. Programs - lt-comp -> compiler, execute without parameters to show the - instructions of use. + lt-comp -> compiler, execute without parameters to show the + instructions of use. - lt-proc -> processor, works with options -a (lexical analyser, - default option), -g (lexical generator) and -p - (lexical post-generator). Using -h will show all - flags. + lt-proc -> processor, works with options -a (lexical analyser, + default option), -g (lexical generator) and -p + (lexical post-generator). Using -h will show all + flags. - lt-expand -> generates all the pairs of transductions of a given - dictionary. Execute without parameters to show the - instructions of use. + lt-expand -> generates all the pairs of transductions of a given + dictionary. Execute without parameters to show the + instructions of use. - lt-trim -> (experimental) trims a compiled analyser to only - contain entries which would pass through a compiled - bidix, creating a new compiled and trimmed analyser. 
+ lt-trim -> (experimental) trims a compiled analyser to only + contain entries which would pass through a compiled + bidix, creating a new compiled and trimmed analyser. + + lt-reweight -> (experimental) utility to assign weights to a + compiled transducer based on a corpus. 2. Install - a. Requirements: - - i686, ppc, SPARC, etc. - - g++ >= 2.95 - - gnu make - - libxml2 (last version) + a. Requirements: + - i686, ppc, SPARC, etc. + - g++ >= 2.95 + - gnu make + - libxml2 (last version) - b. Building & installing - - ./configure - - make - - make install (o make-install script) + b. Building & installing + - ./configure + - make + - make install (o make-install script) diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index 6500f1de..d7ff8760 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -14,7 +14,7 @@ cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token. library_includedir = $(includedir)/$(GENERIC_LIBRARY_NAME)-$(GENERIC_API_VERSION)/$(GENERIC_LIBRARY_NAME) library_include_HEADERS = $(h_sources) -bin_PROGRAMS = lt-comp lt-proc lt-expand lt-tmxcomp lt-tmxproc lt-print lt-trim +bin_PROGRAMS = lt-comp lt-proc lt-expand lt-tmxcomp lt-tmxproc lt-print lt-trim lt-reweight instdir = lttoolbox lib_LTLIBRARIES= liblttoolbox3.la @@ -55,6 +55,10 @@ lt_tmxproc_SOURCES = lt_tmxproc.cc lt_tmxproc_LDADD = liblttoolbox$(GENERIC_MAJOR_VERSION).la lt_tmxproc_LDFLAGS = -llttoolbox$(GENERIC_MAJOR_VERSION) $(LTTOOLBOX_LIBS) +lt_reweight_SOURCES = lt_reweight.cc +lt_reweight_LDADD = liblttoolbox$(GENERIC_MAJOR_VERSION).la +lt_reweight_LDFLAGS = -llttoolbox$(GENERIC_MAJOR_VERSION) $(LTTOOLBOX_LIBS) + #lt-validate-dictionary: Makefile.am validate-header.sh # @echo "Creating lt-validate-dictionary script" # @echo "#!$(BASH)" > $@ @@ -65,7 +69,7 @@ lt_tmxproc_LDFLAGS = -llttoolbox$(GENERIC_MAJOR_VERSION) $(LTTOOLBOX_LIBS) -man_MANS = lt-comp.1 lt-expand.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 +man_MANS = lt-comp.1 
lt-expand.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 lt-reweight.1 INCLUDES = -I$(top_srcdir) $(LTTOOLBOX_CFLAGS) if WINDOWS diff --git a/lttoolbox/lt-reweight.1 b/lttoolbox/lt-reweight.1 new file mode 100644 index 00000000..8d5bfe68 --- /dev/null +++ b/lttoolbox/lt-reweight.1 @@ -0,0 +1,66 @@ +.TH lt-reweight 1 2014-02-07 "" "" +.SH NAME +lt-reweight \- This application is part of the lexical processing modules +and tools ( +.B lttoolbox +) +.PP +This tool is part of the apertium machine translation +architecture: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B lt-trim +analyser_binary bidix_binary trimmed_analyser_binary +.PP +.SH DESCRIPTION +.BR lt-trim +is the application responsible for trimming compiled dictionaries. The +analyses (right-side when compiling lr) of analyser_binary are trimmed +to the input side of bidix_binary (left-side when compiling lr, +right-side when compiling rl), such that only analyses which would +pass through `lt-proc \-b bidix_binary' are kept. + +\fBWarning: this program is experimental!\fR It has been tested, but +not deployed extensively yet. + +Both compound tags (`<compound-only-L>', `<compound-R>') and join +elements (`<j/>' in XML, `+' in the stream) and the group element +(`<g/>' in XML, `#' in the stream) should be handled correctly, even +combinations of + followed by # in monodix are handled. + +Some minor caveats: If you have the capitalised lemma "Foo" in the +monodix, but "foo" in the bidix, an analysis "^Foo$" would pass +through bidix when doing lt-proc \-b, but will not make it through +trimming. Make sure your lemmas have the same capitalisation in the +different dictionaries. Also, you should not have literal `+' or `#' +in your lemmas. Since lt-comp doesn't escape these, lt-trim cannot +know that they are different from `<j/>' or `<g/>', and you may get +@-marked output this way. You can analyse `+' or `#' by having the +literal symbol in the `<l>' part and some other string (e.g. "plus") +in the `<r>'. 
+ +You should not trim a generator unless you have a \fBvery\fR simple +translator pipeline, since the output of bidix seldom goes unchanged +through transfer. +.PP +.SH FILES +.B analyser_binary +The untrimmed analyser dictionary (a finite state transducer). +.PP +.B bidix_binary +The dictionary to use as trimmer (a finite state transducer). +.PP +.B trimmed_analyser_binary +The trimmed analyser dictionary (a finite state transducer). + +.SH SEE ALSO +.I lt-comp\fR(1), +.I lt-proc\fR(1), +.I lt-print\fR(1), +.I lt-expand\fR(1), +.I lt-trim\fR(1), +.I apertium-tagger\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +(c) 2013--2014 Universitat d'Alacant / Universidad de Alicante. diff --git a/lttoolbox/lt_comp.cc b/lttoolbox/lt_comp.cc index 04358e6f..f60b0262 100644 --- a/lttoolbox/lt_comp.cc +++ b/lttoolbox/lt_comp.cc @@ -36,7 +36,7 @@ using namespace std; void errorFunc(void *ctx, const char *msg, ...) { return; -} +} void endProgram(char *name) { @@ -166,7 +166,7 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } initGenericErrorDefaultFunc(NULL); - + if(opc == "lr") { @@ -218,7 +218,7 @@ int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); c.parse(infile, Compiler::COMPILER_RESTRICTION_RL_VAL); - } + } } else { diff --git a/lttoolbox/lt_reweight.cc b/lttoolbox/lt_reweight.cc new file mode 100644 index 00000000..3afec7db --- /dev/null +++ b/lttoolbox/lt_reweight.cc @@ -0,0 +1,218 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +void endProgram(char *name) +{ + if(name != NULL) + { + cout << basename(name) << " v" << PACKAGE_VERSION <<": assign weights to a compiled transducer based on a corpus." << endl; + cout << "USAGE: " << basename(name) << " analyser_bin_file tagged_corpus" << endl; + } + exit(EXIT_FAILURE); +} + +std::pair, std::map > +read_fst(FILE *bin_file) +{ + Alphabet new_alphabet; + wstring letters = L""; + + std::map transducers; + + // letters + int len = Compression::multibyte_read(bin_file); + while(len > 0) + { + letters.push_back(static_cast(Compression::multibyte_read(bin_file))); + len--; + } + + // symbols + new_alphabet.read(bin_file); + + len = Compression::multibyte_read(bin_file); + + while(len > 0) + { + int len2 = Compression::multibyte_read(bin_file); + wstring name = L""; + while(len2 > 0) + { + name += static_cast(Compression::multibyte_read(bin_file)); + len2--; + } + transducers[name].read(bin_file); + + len--; + } + + std::pair alph_letters; + alph_letters.first = new_alphabet; + alph_letters.second = letters; + return std::pair, std::map > (alph_letters, transducers); +} + +std::pair, std::map > +trim(FILE *file_mono, FILE *file_bi) +{ + std::pair, std::map > alph_trans_mono = read_fst(file_mono); + Alphabet alph_mono = alph_trans_mono.first.first; + std::map trans_mono = alph_trans_mono.second; + std::pair, std::map > alph_trans_bi = read_fst(file_bi); + Alphabet alph_bi = alph_trans_bi.first.first; + std::map trans_bi = alph_trans_bi.second; + + // The prefix transducer is the union of all 
transducers from bidix, + // with a ".*" appended + Transducer union_transducer; + // The "." in ".*" is a set of equal pairs of the output symbols + // from the monodix alphabet (: etc.) + Alphabet alph_prefix = alph_bi; + set loopback_symbols; // ints refer to alph_prefix + alph_prefix.createLoopbackSymbols(loopback_symbols, alph_mono, Alphabet::right); + + for(std::map::iterator it = trans_bi.begin(); it != trans_bi.end(); it++) + { + Transducer union_tmp = it->second; + if(union_transducer.isEmpty()) + { + union_transducer = union_tmp; + } + else + { + union_transducer.unionWith(alph_bi, union_tmp); + } + } + union_transducer.minimize(); + + Transducer prefix_transducer = union_transducer.appendDotStar(loopback_symbols); + // prefix_transducer should _not_ be minimized (both useless and takes forever) + Transducer moved_transducer = prefix_transducer.moveLemqsLast(alph_prefix); + + + for(std::map::iterator it = trans_mono.begin(); it != trans_mono.end(); it++) + { + Transducer trimmed = it->second.intersect(moved_transducer, + alph_mono, + alph_prefix); + + wcout << it->first << " " << it->second.size(); + wcout << " " << it->second.numberOfTransitions() << endl; + if(it->second.numberOfTransitions() == 0) + { + wcerr << L"Warning: empty section! Skipping it ..."<first].clear(); + } + else if(trimmed.hasNoFinals()) { + wcerr << L"Warning: section had no final state after trimming! Skipping it ..."<first].clear(); + } + else { + trimmed.minimize(); + trans_mono[it->first] = trimmed; + } + } + + alph_trans_mono.second = trans_mono; + return alph_trans_mono; +} + + +int main(int argc, char *argv[]) +{ + if(argc != 3) + { + endProgram(argv[0]); + } + + LtLocale::tryToSetLocale(); + + FILE *analyser = fopen(argv[1], "rb"); + if(!analyser) + { + wcerr << "Error: Cannot not open file '" << argv[1] << "'." << endl << endl; + exit(EXIT_FAILURE); + } + FILE *bidix = fopen(argv[2], "rb"); + if(!bidix) + { + wcerr << "Error: Cannot not open file '" << argv[2] << "'." 
<< endl << endl; + exit(EXIT_FAILURE); + } + + std::pair, std::map > trimmed = trim(analyser, bidix); + Alphabet alph_t = trimmed.first.first; + wstring letters = trimmed.first.second; + std::map trans_t = trimmed.second; + + int n_transducers = 0; + for(std::map::iterator it = trans_t.begin(); it != trans_t.end(); it++) + { + if(!(it->second.isEmpty())) + { + n_transducers++; + } + } + + if(n_transducers == 0) + { + wcerr << L"Error: Trimming gave empty transducer!" << endl; + exit(EXIT_FAILURE); + } + + // Write the file: + FILE *output = fopen(argv[3], "wb"); + if(!output) + { + wcerr << "Error: Cannot not open file '" << argv[3] << "'." << endl << endl; + exit(EXIT_FAILURE); + } + + // letters + Compression::wstring_write(letters, output); + + // symbols + alph_t.write(output); + + // transducers + Compression::multibyte_write(n_transducers, output); + for(std::map::iterator it = trans_t.begin(); it != trans_t.end(); it++) + { + if(!(it->second.isEmpty())) + { + Compression::wstring_write(it->first, output); + it->second.write(output); + } + } + + fclose(analyser); + fclose(bidix); + fclose(output); + + return 0; +} diff --git a/lttoolbox/lt_trim.cc b/lttoolbox/lt_trim.cc index 95cd4161..f0d89c96 100644 --- a/lttoolbox/lt_trim.cc +++ b/lttoolbox/lt_trim.cc @@ -155,13 +155,13 @@ int main(int argc, char *argv[]) FILE *analyser = fopen(argv[1], "rb"); if(!analyser) { - wcerr << "Error: Cannot not open file '" << argv[1] << "'." << endl << endl; + wcerr << "Error: Cannot open file '" << argv[1] << "'." << endl << endl; exit(EXIT_FAILURE); } FILE *bidix = fopen(argv[2], "rb"); if(!bidix) { - wcerr << "Error: Cannot not open file '" << argv[2] << "'." << endl << endl; + wcerr << "Error: Cannot open file '" << argv[2] << "'." 
<< endl << endl; exit(EXIT_FAILURE); } From 6552c58bf523adafb68374f4b6c6f0595dc58328 Mon Sep 17 00:00:00 2001 From: Francis Tyers Date: Sun, 5 Jul 2020 21:52:38 +0100 Subject: [PATCH 2/2] deprecate SAO code --- lttoolbox/Makefile.am | 4 ++-- lttoolbox/lt_proc.cc | 18 +++++++++++------- lttoolbox/lt_reweight.cc | 2 +- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index 0424655e..cc24cb18 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -56,8 +56,8 @@ lt_tmxproc_LDADD = liblttoolbox$(VERSION_MAJOR).la lt_tmxproc_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS) lt_reweight_SOURCES = lt_reweight.cc -lt_reweight_LDADD = liblttoolbox$(GENERIC_MAJOR_VERSION).la -lt_reweight_LDFLAGS = -llttoolbox$(GENERIC_MAJOR_VERSION) $(LTTOOLBOX_LIBS) +lt_reweight_LDADD = liblttoolbox$(VERSION_MAJOR).la +lt_reweight_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS) #lt-validate-dictionary: Makefile.am validate-header.sh # @echo "Creating lt-validate-dictionary script" diff --git a/lttoolbox/lt_proc.cc b/lttoolbox/lt_proc.cc index 0d461c94..f54ff6fa 100644 --- a/lttoolbox/lt_proc.cc +++ b/lttoolbox/lt_proc.cc @@ -54,7 +54,8 @@ void endProgram(char *name) cout << " -o, --surf-bilingual: lexical transfer with surface forms" << endl; cout << " -p, --post-generation: post-generation" << endl; cout << " -x, --inter-generation: inter-generation" << endl; - cout << " -s, --sao: SAO annotation system input processing" << endl; +// Deprecated: +// cout << " -s, --sao: SAO annotation system input processing" << endl; cout << " -t, --transliteration: apply transliteration dictionary" << endl; cout << " -v, --version: version" << endl; cout << " -z, --null-flush: flush output on the null character " << endl; @@ -79,7 +80,8 @@ void endProgram(char *name) cout << " -o: lexical transfer with surface forms" << endl; cout << " -p: post-generation" << endl; cout << " -x: inter-generation" << endl; - cout << " -s: 
SAO annotation system input processing" << endl; +// Deprecated: +// cout << " -s: SAO annotation system input processing" << endl; cout << " -t: apply transliteration dictionary" << endl; cout << " -v: version" << endl; cout << " -z: flush output on the null character " << endl; @@ -124,7 +126,8 @@ int main(int argc, char *argv[]) {"tagged-nm-gen", 0, 0, 'm'}, {"post-generation", 0, 0, 'p'}, {"inter-generation", 0, 0, 'x'}, - {"sao", 0, 0, 's'}, +// Deprecated: +// {"sao", 0, 0, 's'}, {"transliteration", 0, 0, 't'}, {"null-flush", 0, 0, 'z'}, {"dictionary-case", 0, 0, 'w'}, @@ -143,9 +146,9 @@ int main(int argc, char *argv[]) { #if HAVE_GETOPT_LONG int option_index; - int c = getopt_long(argc, argv, "abcegi:r:lmndopxstzwvCIWN:L:h", long_options, &option_index); + int c = getopt_long(argc, argv, "abcegi:r:lmndopxtzwvCIWN:L:h", long_options, &option_index); #else - int c = getopt(argc, argv, "abcegi:r:lmndopxstzwvCIWN:L:h"); + int c = getopt(argc, argv, "abcegi:r:lmndopxtzwvCIWN:L:h"); #endif if(c == -1) @@ -210,7 +213,6 @@ int main(int argc, char *argv[]) case 'p': case 'x': case 't': - case 's': case 'C': if(cmd == 0) { @@ -362,12 +364,14 @@ int main(int argc, char *argv[]) fstp.intergeneration(input, output); break; +/** Deprecated: + case 's': fstp.initAnalysis(); checkValidity(fstp); fstp.SAO(input, output); break; - +*/ case 't': fstp.initPostgeneration(); checkValidity(fstp); diff --git a/lttoolbox/lt_reweight.cc b/lttoolbox/lt_reweight.cc index 3afec7db..3a1c234e 100644 --- a/lttoolbox/lt_reweight.cc +++ b/lttoolbox/lt_reweight.cc @@ -16,7 +16,7 @@ */ #include #include -#include +//#include #include #include