forked from apertium/lttoolbox
Commit
lt-reweight: assign weights to a compiled transducer based on a corpus
Write the utility to process a tagged corpus and a binary lttoolbox file and return weighted analyses. Closes apertium#16
1 parent 1b44b8a, commit 847e2cc
Showing 6 changed files with 319 additions and 28 deletions.
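
The commit message only states the goal at a high level. As a rough, hypothetical sketch of what assigning weights from a corpus usually involves (none of the names or details below come from this commit), the idea is to count how often each tagged analysis occurs in the corpus and turn relative frequencies into negative-log weights, which can then be attached to the matching paths of the compiled transducer:

// Hypothetical sketch, not part of this commit: turn corpus counts of
// tagged analyses (one per line on stdin, e.g. ^prefer<vblex><inf>$)
// into negative-log-probability weights.
#include <cmath>
#include <iostream>
#include <map>
#include <string>

int main()
{
  std::map<std::string, long> counts;
  long total = 0;
  std::string line;
  while(std::getline(std::cin, line))
  {
    if(line.empty())
    {
      continue;
    }
    counts[line]++;
    total++;
  }
  for(std::map<std::string, long>::const_iterator it = counts.begin(); it != counts.end(); ++it)
  {
    double weight = -std::log(static_cast<double>(it->second) / total);
    std::cout << it->first << "\t" << weight << "\n";
  }
  return 0;
}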
@@ -0,0 +1,66 @@
.TH lt-reweight 1 2014-02-07 "" ""
.SH NAME
lt-reweight \- This application is part of the lexical processing modules
and tools (
.B lttoolbox
)
.PP
This tool is part of the apertium machine translation
architecture: \fBhttp://www.apertium.org\fR.
.SH SYNOPSIS
.B lt-trim
analyser_binary bidix_binary trimmed_analyser_binary
.PP
.SH DESCRIPTION
.BR lt-trim
is the application responsible for trimming compiled dictionaries. The
analyses (right-side when compiling lr) of analyser_binary are trimmed
to the input side of bidix_binary (left-side when compiling lr,
right-side when compiling rl), such that only analyses which would
pass through `lt-proc \-b bidix_binary' are kept.

\fBWarning: this program is experimental!\fR It has been tested, but
not deployed extensively yet.

Both compound tags (`<compound-only-L>', `<compound-R>'), join
elements (`<j/>' in XML, `+' in the stream) and the group element
(`<g/>' in XML, `#' in the stream) should be handled correctly; even
combinations of `+' followed by `#' in the monodix are handled.

Some minor caveats: If you have the capitalised lemma "Foo" in the
monodix, but "foo" in the bidix, an analysis "^Foo<tag>$" would pass
through bidix when doing lt-proc \-b, but would not make it through
trimming. Make sure your lemmas have the same capitalisation in the
different dictionaries. Also, you should not have literal `+' or `#'
in your lemmas. Since lt-comp doesn't escape these, lt-trim cannot
know that they are different from `<j/>' or `<g/>', and you may get
@-marked output this way. You can analyse `+' or `#' by having the
literal symbol in the `<l>' part and some other string (e.g. "plus")
in the `<r>'.

You should not trim a generator unless you have a \fBvery\fR simple
translator pipeline, since the output of bidix seldom goes unchanged
through transfer.
.PP
.SH FILES
.B analyser_binary
The untrimmed analyser dictionary (a finite state transducer).
.PP
.B bidix_binary
The dictionary to use as trimmer (a finite state transducer).
.PP
.B trimmed_analyser_binary
The trimmed analyser dictionary (a finite state transducer).

.SH SEE ALSO
.I lt-comp\fR(1),
.I lt-proc\fR(1),
.I lt-print\fR(1),
.I lt-expand\fR(1),
.I lt-trim\fR(1),
.I apertium-tagger\fR(1),
.I apertium\fR(1).
.SH BUGS
Lots of...lurking in the dark and waiting for you!
.SH AUTHOR
(c) 2013--2014 Universitat d'Alacant / Universidad de Alicante.
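
The DESCRIPTION above boils down to a prefix test: an analyser entry survives only if some bidix input-side entry is a prefix of its analysis side, which is the condition that `lt-proc \-b bidix_binary' would accept. A hypothetical sketch of that criterion on plain strings, ignoring tags, compounds and joins (none of this code is part of lttoolbox), looks like this:

// Hypothetical illustration of the trimming criterion on plain strings:
// an analysis is kept if some bidix entry is a prefix of it.
#include <iostream>
#include <set>
#include <string>

bool survivesTrimming(const std::string &analysis,
                      const std::set<std::string> &bidixEntries)
{
  for(std::set<std::string>::const_iterator it = bidixEntries.begin();
      it != bidixEntries.end(); ++it)
  {
    if(analysis.compare(0, it->size(), *it) == 0)
    {
      return true;
    }
  }
  return false;
}

int main()
{
  std::set<std::string> bidix;
  bidix.insert("house<n>");
  // prints 1 (kept) for "house<n><sg>" and 0 (trimmed) for "horse<n><sg>"
  std::cout << survivesTrimming("house<n><sg>", bidix) << "\n";
  std::cout << survivesTrimming("horse<n><sg>", bidix) << "\n";
  return 0;
}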
@@ -0,0 +1,218 @@
/*
 * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
#include <lttoolbox/transducer.h>
#include <lttoolbox/compression.h>
#include <lttoolbox/lttoolbox_config.h>

#include <lttoolbox/my_stdio.h>
#include <lttoolbox/lt_locale.h>

#include <cstdlib>
#include <iostream>
#include <libgen.h>
#include <map>
#include <set>
#include <string>

using namespace std;

void endProgram(char *name)
{
  if(name != NULL)
  {
    cout << basename(name) << " v" << PACKAGE_VERSION << ": assign weights to a compiled transducer based on a corpus." << endl;
    cout << "USAGE: " << basename(name) << " analyser_bin_file tagged_corpus output_bin_file" << endl;
  }
  exit(EXIT_FAILURE);
}

std::pair<std::pair<Alphabet, wstring>, std::map<wstring, Transducer> >
read_fst(FILE *bin_file)
{
  Alphabet new_alphabet;
  wstring letters = L"";

  std::map<wstring, Transducer> transducers;

  // letters
  int len = Compression::multibyte_read(bin_file);
  while(len > 0)
  {
    letters.push_back(static_cast<wchar_t>(Compression::multibyte_read(bin_file)));
    len--;
  }

  // symbols
  new_alphabet.read(bin_file);

  // transducer sections: a count, then each section's name followed by its transducer
  len = Compression::multibyte_read(bin_file);

  while(len > 0)
  {
    int len2 = Compression::multibyte_read(bin_file);
    wstring name = L"";
    while(len2 > 0)
    {
      name += static_cast<wchar_t>(Compression::multibyte_read(bin_file));
      len2--;
    }
    transducers[name].read(bin_file);

    len--;
  }

  std::pair<Alphabet, wstring> alph_letters;
  alph_letters.first = new_alphabet;
  alph_letters.second = letters;
  return std::pair<std::pair<Alphabet, wstring>, std::map<wstring, Transducer> > (alph_letters, transducers);
}

std::pair<std::pair<Alphabet, wstring>, std::map<wstring, Transducer> >
trim(FILE *file_mono, FILE *file_bi)
{
  std::pair<std::pair<Alphabet, wstring>, std::map<wstring, Transducer> > alph_trans_mono = read_fst(file_mono);
  Alphabet alph_mono = alph_trans_mono.first.first;
  std::map<wstring, Transducer> trans_mono = alph_trans_mono.second;
  std::pair<std::pair<Alphabet, wstring>, std::map<wstring, Transducer> > alph_trans_bi = read_fst(file_bi);
  Alphabet alph_bi = alph_trans_bi.first.first;
  std::map<wstring, Transducer> trans_bi = alph_trans_bi.second;

  // The prefix transducer is the union of all transducers from bidix,
  // with a ".*" appended
  Transducer union_transducer;
  // The "." in ".*" is a set of equal pairs of the output symbols
  // from the monodix alphabet (<n>:<n> etc.)
  Alphabet alph_prefix = alph_bi;
  set<int> loopback_symbols; // ints refer to alph_prefix
  alph_prefix.createLoopbackSymbols(loopback_symbols, alph_mono, Alphabet::right);
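  // Illustration (not from the original source): if the bidix accepts
  // "house<n>", the prefix transducer also accepts "house<n>" followed by
  // any sequence of monodix output symbols, so a monodix analysis such as
  // "house<n><sg>" survives the intersection performed below.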

  for(std::map<wstring, Transducer>::iterator it = trans_bi.begin(); it != trans_bi.end(); it++)
  {
    Transducer union_tmp = it->second;
    if(union_transducer.isEmpty())
    {
      union_transducer = union_tmp;
    }
    else
    {
      union_transducer.unionWith(alph_bi, union_tmp);
    }
  }
  union_transducer.minimize();

  Transducer prefix_transducer = union_transducer.appendDotStar(loopback_symbols);
  // prefix_transducer should _not_ be minimized (both useless and takes forever)
  Transducer moved_transducer = prefix_transducer.moveLemqsLast(alph_prefix);

  for(std::map<wstring, Transducer>::iterator it = trans_mono.begin(); it != trans_mono.end(); it++)
  {
    Transducer trimmed = it->second.intersect(moved_transducer,
                                              alph_mono,
                                              alph_prefix);

    wcout << it->first << " " << it->second.size();
    wcout << " " << it->second.numberOfTransitions() << endl;
    if(it->second.numberOfTransitions() == 0)
    {
      wcerr << L"Warning: empty section! Skipping it ..." << endl;
      trans_mono[it->first].clear();
    }
    else if(trimmed.hasNoFinals()) {
      wcerr << L"Warning: section had no final state after trimming! Skipping it ..." << endl;
      trans_mono[it->first].clear();
    }
    else {
      trimmed.minimize();
      trans_mono[it->first] = trimmed;
    }
  }

  alph_trans_mono.second = trans_mono;
  return alph_trans_mono;
}

int main(int argc, char *argv[])
{
  if(argc != 4)
  {
    endProgram(argv[0]);
  }

  LtLocale::tryToSetLocale();

  FILE *analyser = fopen(argv[1], "rb");
  if(!analyser)
  {
    wcerr << "Error: Cannot open file '" << argv[1] << "'." << endl << endl;
    exit(EXIT_FAILURE);
  }
  FILE *bidix = fopen(argv[2], "rb");
  if(!bidix)
  {
    wcerr << "Error: Cannot open file '" << argv[2] << "'." << endl << endl;
    exit(EXIT_FAILURE);
  }

  std::pair<std::pair<Alphabet, wstring>, std::map<wstring, Transducer> > trimmed = trim(analyser, bidix);
  Alphabet alph_t = trimmed.first.first;
  wstring letters = trimmed.first.second;
  std::map<wstring, Transducer> trans_t = trimmed.second;

  int n_transducers = 0;
  for(std::map<wstring, Transducer>::iterator it = trans_t.begin(); it != trans_t.end(); it++)
  {
    if(!(it->second.isEmpty()))
    {
      n_transducers++;
    }
  }

  if(n_transducers == 0)
  {
    wcerr << L"Error: Trimming gave empty transducer!" << endl;
    exit(EXIT_FAILURE);
  }

  // Write the file:
  FILE *output = fopen(argv[3], "wb");
  if(!output)
  {
    wcerr << "Error: Cannot open file '" << argv[3] << "'." << endl << endl;
    exit(EXIT_FAILURE);
  }

  // letters
  Compression::wstring_write(letters, output);

  // symbols
  alph_t.write(output);

  // transducers
  Compression::multibyte_write(n_transducers, output);
  for(std::map<wstring, Transducer>::iterator it = trans_t.begin(); it != trans_t.end(); it++)
  {
    if(!(it->second.isEmpty()))
    {
      Compression::wstring_write(it->first, output);
      it->second.write(output);
    }
  }

  fclose(analyser);
  fclose(bidix);
  fclose(output);

  return 0;
}
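
For reference, read_fst above fixes the on-disk layout of a compiled lttoolbox binary: the letter list, the alphabet, then a count of named transducer sections. A hypothetical helper (not part of the commit; it assumes it is compiled together with the file above) could load a binary and list its sections like this:

// Hypothetical usage sketch: load a compiled binary with read_fst() and
// print each section name together with its number of states.
int listSections(const char *path)
{
  FILE *in = fopen(path, "rb");
  if(in == NULL)
  {
    wcerr << L"Error: Cannot open file '" << path << "'." << endl;
    return 1;
  }
  std::pair<std::pair<Alphabet, wstring>, std::map<wstring, Transducer> > fst = read_fst(in);
  fclose(in);
  for(std::map<wstring, Transducer>::iterator it = fst.second.begin(); it != fst.second.end(); it++)
  {
    wcout << it->first << L"\t" << it->second.size() << endl;
  }
  return 0;
}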