Skip to content

Commit

Permalink
lt-reweight: assign weights to a compiled transducer based on a corpus
Browse files Browse the repository at this point in the history
Write the utility to process tagged corpus and the binary lttoolbox
file and return weighted analyses.

Closes apertium#16
  • Loading branch information
Techievena committed Jul 5, 2018
1 parent 1b44b8a commit 444c595
Show file tree
Hide file tree
Showing 4 changed files with 271 additions and 26 deletions.
45 changes: 24 additions & 21 deletions README
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,35 @@ README FILE OF LTTOOLBOX

1. Programs

lt-comp -> compiler, execute without parameters to show the
instructions of use.
lt-comp -> compiler, execute without parameters to show the
instructions of use.

lt-proc -> processor, works with options -a (lexical analyser,
default option), -g (lexical generator) and -p
(lexical post-generator). Using -h will show all
flags.
lt-proc -> processor, works with options -a (lexical analyser,
default option), -g (lexical generator) and -p
(lexical post-generator). Using -h will show all
flags.

lt-expand -> generates all the pairs of transductions of a given
dictionary. Execute without parameters to show the
instructions of use.
lt-expand -> generates all the pairs of transductions of a given
dictionary. Execute without parameters to show the
instructions of use.

lt-trim -> (experimental) trims a compiled analyser to only
contain entries which would pass through a compiled
bidix, creating a new compiled and trimmed analyser.
lt-trim -> (experimental) trims a compiled analyser to only
contain entries which would pass through a compiled
bidix, creating a new compiled and trimmed analyser.

lt-reweight -> (experimental) utility to assign weights to a
compiled transducer based on a corpus.


2. Install

a. Requirements:
- i686, ppc, SPARC, etc.
- g++ >= 2.95
- gnu make
- libxml2 (last version)
a. Requirements:
- i686, ppc, SPARC, etc.
- g++ >= 2.95
- gnu make
- libxml2 (latest version)

b. Building & installing
- ./configure
- make
- make install (o make-install script)
b. Building & installing
- ./configure
- make
- make install (or make-install script)
8 changes: 6 additions & 2 deletions lttoolbox/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.
library_includedir = $(includedir)/$(GENERIC_LIBRARY_NAME)-$(GENERIC_API_VERSION)/$(GENERIC_LIBRARY_NAME)
library_include_HEADERS = $(h_sources)

bin_PROGRAMS = lt-comp lt-proc lt-expand lt-tmxcomp lt-tmxproc lt-print lt-trim
bin_PROGRAMS = lt-comp lt-proc lt-expand lt-tmxcomp lt-tmxproc lt-print lt-trim lt-reweight
instdir = lttoolbox

lib_LTLIBRARIES= liblttoolbox3.la
Expand Down Expand Up @@ -55,6 +55,10 @@ lt_tmxproc_SOURCES = lt_tmxproc.cc
lt_tmxproc_LDADD = liblttoolbox$(GENERIC_MAJOR_VERSION).la
lt_tmxproc_LDFLAGS = -llttoolbox$(GENERIC_MAJOR_VERSION) $(LTTOOLBOX_LIBS)

lt_reweight_SOURCES = lt_reweight.cc
lt_reweight_LDADD = liblttoolbox$(GENERIC_MAJOR_VERSION).la
lt_reweight_LDFLAGS = -llttoolbox$(GENERIC_MAJOR_VERSION) $(LTTOOLBOX_LIBS)

#lt-validate-dictionary: Makefile.am validate-header.sh
# @echo "Creating lt-validate-dictionary script"
# @echo "#!$(BASH)" > $@
Expand All @@ -65,7 +69,7 @@ lt_tmxproc_LDFLAGS = -llttoolbox$(GENERIC_MAJOR_VERSION) $(LTTOOLBOX_LIBS)



man_MANS = lt-comp.1 lt-expand.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1
man_MANS = lt-comp.1 lt-expand.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 lt-reweight.1

INCLUDES = -I$(top_srcdir) $(LTTOOLBOX_CFLAGS)
if WINDOWS
Expand Down
6 changes: 3 additions & 3 deletions lttoolbox/lt_comp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ using namespace std;
void errorFunc(void *ctx, const char *msg, ...)
{
return;
}
}

void endProgram(char *name)
{
Expand Down Expand Up @@ -166,7 +166,7 @@ int main(int argc, char *argv[])
exit(EXIT_FAILURE);
}
initGenericErrorDefaultFunc(NULL);


if(opc == "lr")
{
Expand Down Expand Up @@ -218,7 +218,7 @@ int main(int argc, char *argv[])
{
LtLocale::tryToSetLocale();
c.parse(infile, Compiler::COMPILER_RESTRICTION_RL_VAL);
}
}
}
else
{
Expand Down
238 changes: 238 additions & 0 deletions lttoolbox/lt_reweight.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
/*
* Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <lttoolbox/compiler.h>
#include <lttoolbox/att_compiler.h>
#include <lttoolbox/lttoolbox_config.h>
#include <lttoolbox/lt_locale.h>
#include <lttoolbox/string_to_wostream.h>

#include <cstdlib>
#include <iostream>
#include <libgen.h>
#include <string>
#include <getopt.h>

using namespace std;

/**
 * Error callback that discards every message it receives.
 *
 * main() casts this function to xmlGenericErrorFunc and installs it while
 * probing the input file, so nothing is reported through this callback
 * during the probe.
 *
 * @param ctx  Unused.
 * @param msg  Unused printf-style format string; the variadic arguments
 *             that follow are ignored as well.
 */
void errorFunc(void *ctx, const char *msg, ...)
{
  // Intentionally empty: swallow the message.
  (void)ctx;
  (void)msg;
}

/**
 * Print the usage message (when a program name is supplied) and terminate
 * the process with EXIT_FAILURE.
 *
 * The option summary and the option list cover every flag accepted by the
 * getopt string in main() ("a:v:l:r:hV").
 *
 * @param name  Program path, normally argv[0]. When NULL nothing is printed
 *              and the process simply exits.
 */
void endProgram(char *name)
{
  if(name != NULL)
  {
    // POSIX basename() may modify its argument or return static storage,
    // so resolve the program name once and reuse the result.
    const char *prog = basename(name);

    cout << prog << " v" << PACKAGE_VERSION <<": assign weights to a compiled transducer based on a corpus." << endl;
    cout << "USAGE: " << prog << " [-avlrVh] lr | rl binary_lttoolbox_file tagged_corpus" << endl;
    cout << " -v: set language variant" << endl;
    cout << " -a: set alternative (monodix)" << endl;
    cout << " -l: set left language variant (bidix)" << endl;
    cout << " -r: set right language variant (bidix)" << endl;
    cout << " -V: verbose output" << endl;
    cout << " -h: show this help" << endl;
    cout << "Modes:" << endl;
    cout << " lr: left-to-right compilation" << endl;
    cout << " rl: right-to-left compilation" << endl;
  }
  exit(EXIT_FAILURE);
}


int main(int argc, char *argv[])
{
char ttype = 'x';
Compiler c;
AttCompiler a;
c.setVerbose(false);

#if HAVE_GETOPT_LONG
int option_index=0;
#endif

string vl;
string vr;

while (true) {
#if HAVE_GETOPT_LONG
static struct option long_options[] =
{
{"alt", required_argument, 0, 'a'},
{"var", required_argument, 0, 'v'},
{"var-left", required_argument, 0, 'l'},
{"var-right", required_argument, 0, 'r'},
{"help", no_argument, 0, 'h'},
{"verbose", no_argument, 0, 'V'},
{0, 0, 0, 0}
};

int cnt=getopt_long(argc, argv, "a:v:l:r:hV", long_options, &option_index);
#else
int cnt=getopt(argc, argv, "a:v:l:r:hV");
#endif
if (cnt==-1)
break;

switch (cnt)
{
case 'a':
c.setAltValue(optarg);
break;

case 'v':
c.setVariantValue(optarg);
break;

case 'l':
vl = optarg;
c.setVariantLeftValue(vl);
break;

case 'r':
vr = optarg;
c.setVariantRightValue(vr);
break;

case 'V':
c.setVerbose(true);
break;

case 'h':
default:
endProgram(argv[0]);
break;
}
}

string opc;
string infile;
string outfile;
string acxfile;

switch(argc - optind + 1)
{
case 5:
opc = argv[argc-4];
infile = argv[argc-3];
outfile = argv[argc-2];
acxfile = argv[argc-1];
break;

case 4:
opc = argv[argc-3];
infile = argv[argc-2];
outfile = argv[argc-1];
break;

default:
endProgram(argv[0]);
break;
}

xmlTextReaderPtr reader;
reader = xmlReaderForFile(infile.c_str(), NULL, 0);
xmlGenericErrorFunc handler = (xmlGenericErrorFunc)errorFunc;
initGenericErrorDefaultFunc(&handler);
if(reader != NULL)
{
int ret = xmlTextReaderRead(reader);
if(ret != 1)
{
ttype = 'a';
}
xmlFreeTextReader(reader);
xmlCleanupParser();
}
else
{
wcerr << "Error: Cannot not open file '" << infile << "'." << endl << endl;
exit(EXIT_FAILURE);
}
initGenericErrorDefaultFunc(NULL);


if(opc == "lr")
{
if(vr == "" && vl != "")
{
cout << "Error: -l specified, but mode is lr" << endl;
endProgram(argv[0]);
}
if(ttype == 'a')
{
#if defined __clang__
locale::global(locale(""));;
#elif defined __APPLE__
LtLocale::tryToSetLocale();
#else
locale::global(locale(""));;
#endif
a.parse(infile, Compiler::COMPILER_RESTRICTION_LR_VAL);
}
else
{
LtLocale::tryToSetLocale();
if(acxfile != "")
{
c.parseACX(acxfile, Compiler::COMPILER_RESTRICTION_LR_VAL);
}
c.parse(infile, Compiler::COMPILER_RESTRICTION_LR_VAL);
}
}
else if(opc == "rl")
{
if(vl == "" && vr != "")
{
cout << "Error: -r specified, but mode is rl" << endl;
endProgram(argv[0]);
}
if(ttype == 'a')
{
#if defined __clang__
locale::global(locale(""));;
#elif defined __APPLE__
LtLocale::tryToSetLocale();
#else
locale::global(locale(""));;
#endif
a.parse(infile, Compiler::COMPILER_RESTRICTION_RL_VAL);
}
else
{
LtLocale::tryToSetLocale();
c.parse(infile, Compiler::COMPILER_RESTRICTION_RL_VAL);
}
}
else
{
endProgram(argv[0]);
}

FILE *output = fopen(outfile.c_str(), "wb");
if(!output)
{
wcerr << "Error: Cannot open file '" << outfile << "'." << endl;
exit(EXIT_FAILURE);
}
if(ttype == 'a')
{
a.write(output);
}
else
{
c.write(output);
}
fclose(output);
}

0 comments on commit 444c595

Please sign in to comment.