diff --git a/lttoolbox/compression.cc b/lttoolbox/compression.cc index 961438e4..144f3f78 100644 --- a/lttoolbox/compression.cc +++ b/lttoolbox/compression.cc @@ -307,8 +307,8 @@ Compression::long_multibyte_write(const double& value, FILE *output) { int exp = 0; - unsigned int mantissa = static_cast(0x40000000 * frexp(value, &exp)); - unsigned int exponent = static_cast(exp); + unsigned int mantissa = static_cast(static_cast(0x40000000 * frexp(value, &exp))); + unsigned int exponent = static_cast(static_cast(exp)); if(mantissa < 0x04000000) { @@ -344,8 +344,8 @@ Compression::long_multibyte_write(const double& value, ostream &output) { int exp = 0; - unsigned int mantissa = static_cast(0x40000000 * frexp(value, &exp)); - unsigned int exponent = static_cast(exp); + unsigned int mantissa = static_cast(static_cast(0x40000000 * frexp(value, &exp))); + unsigned int exponent = static_cast(static_cast(exp)); if(mantissa < 0x04000000) { diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 765a4363..fc064613 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -882,14 +882,11 @@ FSTProcessor::lsx(FILE *input, FILE *output) alive_states.push_back(initial_state); - while(!feof(input)) - { - int val = fgetwc_unlocked(input); + int val = -1; - if (val == 0) { - blankqueue.push(blank); - break; - } + while(!feof(input) && val != 0) + { + val = fgetwc_unlocked(input); if(val == L'+' && isEscaped(val) && !outOfWord) { @@ -897,7 +894,7 @@ FSTProcessor::lsx(FILE *input, FILE *output) plus_thing = true; } - if((val == L'^' && isEscaped(val) && outOfWord) || feof(input)) + if((val == L'^' && isEscaped(val) && outOfWord) || feof(input) || val == 0) { blankqueue.push(blank); @@ -940,7 +937,7 @@ FSTProcessor::lsx(FILE *input, FILE *output) continue; } - //wcerr << L"\n[!] " << (wchar_t)val << L" ||| " << outOfWord << endl; + // wcerr << L"\n[!] 
" << (wchar_t)val << L" ||| " << outOfWord << endl; if(outOfWord) { @@ -948,7 +945,7 @@ FSTProcessor::lsx(FILE *input, FILE *output) continue; } - if((feof(input) || val == L'$') && !outOfWord) // && isEscaped(val) + if((val == 0 || feof(input) || val == L'$') && !outOfWord) // && isEscaped(val) { new_states.clear(); for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) @@ -2150,11 +2147,14 @@ FSTProcessor::intergeneration(FILE *input, FILE *output) } else { - if (isEscaped(val)) + if(val != L'\0') { - fputwc_unlocked(L'\\', output); + if (isEscaped(val)) + { + fputwc_unlocked(L'\\', output); + } + fputwc_unlocked(val, output); } - fputwc_unlocked(val, output); } } else @@ -2166,56 +2166,9 @@ FSTProcessor::intergeneration(FILE *input, FILE *output) bool uppercase = source.size() > 1 && firstupper && iswupper(source[2]); target = current_state.filterFinals(all_finals, alphabet, empty_escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper, 0); - // case of the beggining of the next word - - wstring mybuf = L""; - for (size_t i = source.size(); i > 0; --i) - { - if (!isalpha(source[i - 1])) - { - break; - } - else - { - mybuf = source[i - 1] + mybuf; - } - } - - if (mybuf.size() > 0) - { - bool myfirstupper = iswupper(mybuf[0]); - bool myuppercase = mybuf.size() > 1 && iswupper(mybuf[1]); - - for (size_t i = target.size(); i > 0; --i) - { - if (!isalpha(target[i - 1])) - { - if (myfirstupper && i != target.size()) - { - target[i] = towupper(target[i]); - } - else - { - target[i] = towlower(target[i]); - } - break; - } - else - { - if (myuppercase) - { - target[i - 1] = towupper(target[i - 1]); - } - else - { - target[i - 1] = towlower(target[i - 1]); - } - } - } - } - last = input_buffer.getPos(); } diff --git a/lttoolbox/lt-comp.1 b/lttoolbox/lt-comp.1 index 8a2be8e6..df8b9f30 100644 --- a/lttoolbox/lt-comp.1 +++ b/lttoolbox/lt-comp.1 @@ -1,97 +1,87 @@ -.TH lt-comp 1 2006-03-08 "" "" -.SH NAME 
-lt-comp \- This application is part of the lexical processing modules -and tools ( -.B lttoolbox -) -.PP -This tool is part of the apertium machine translation -architecture: \fBhttp://www.apertium.org\fR. -.SH SYNOPSIS -.B lt-comp -[ -.B \-a \fR| -.B \-v \fR| -.B \-l \fR| -.B \-r \fR| -.B \-h -] -[ -.B lr \fR| -.B rl -] dictionary_file output_file [acx_file] -.PP -.B lt-comp -[ -.B \-\-alt \fR| -.B \-\-var \fR| -.B \-\-var\-left \fR| -.B \-\-var\-right \fR| -.B \-\-help -] -[ -.B lr \fR| -.B rl -] dictionary_file output_file [acx_file] -.PP -.SH DESCRIPTION -.BR lt-comp -Is the application responsible of compiling dictionaries used by -\fBlt-proc\fR in \fIApertium\fR into a compact and efficient -representation (a class of finite-state transducers called augmented -letter transducers). -.PP -.SH OPTIONS -.TP -.B \-a, \-\-alt -Sets the value of the \fIalt\fR attribute to use in compilation. - +.Dd March 8, 2006 +.Dt LT-COMP 1 +.Os Apertium +.Sh NAME +.Nm lt-comp +.Nd augmented letter transducer compiler for Apertium +.Sh SYNOPSIS +.Nm lt-comp +.Op Fl a | v | l | r | h +.Cm lr | rl +.Ar dictionary_file +.Ar output_file +.Op Ar acx_file +.Sh DESCRIPTION +.Nm lt-comp +is the application responsible for compiling dictionaries used by +.Xr lt-proc 1 +in Apertium into a compact and efficient representation +(a class of finite-state transducers called augmented letter transducers). +.Sh OPTIONS +.Bl -tag -width Ds +.It Fl a , Fl Fl alt +Sets the value of the +.Sy alt +attribute to use in compilation. +.Pp Note that if no value is set, all entries containing an \fIalt\fR attribute are omitted. -.TP -.B \-v, \-\-var -Sets the value of the \fIv\fR attribute to use in compilation. -This should only be used with monodixes; for bidixes, see \-l and \-r. - -Note that if no value is set, all entries containing a \fIv\fR -attribute are considered to be \fIleft-to-right\fR. -.TP -.B \-l, \-\-var\-left -Sets the value of the \fIvl\fR attribute for use in compilation of bidixes. 
-"Left" here refers to the side of the dictionary, so this option is only valid -in \fIrl\fR mode. -.TP -.B \-r, \-\-var\-right -Sets the value of the \fIvr\fR attribute for use in compilation of bidixes. -"Right" here refers to the side of the dictionary, so this option is only valid -in \fIlr\fR mode. -.TP -.B \-h, \-\-help -Prints a short help message -.TP -.B lr +.It Fl v , Fl Fl var +Sets the value of the +.Sy v +attribute to use in compilation. +This should only be used with monodixes; for bidixes, see +.Fl l +and +.Fl r . +.Pp +Note that if no value is set, all entries containing a +.Sy v +attribute are considered to be +.Em left-to-right . +.It Fl l , Fl Fl var-left +Sets the value of the +.Sy vl +attribute for use in compilation of bidixes. +.Dq Left +here refers to the side of the dictionary, so this option is only valid in +.Cm rl +mode. +.It Fl r , Fl Fl var-right +Sets the value of the +.Sy vr +attribute for use in compilation of bidixes. +.Dq Right +here refers to the side of the dictionary, so this option is only valid in +.Cm lr +mode. +.It Fl h , Fl Fl help +Prints a short help message. +.It Cm lr The resulting transducer will process dictionary entries -\fIleft-to-right\fR. -.TP -.B rl +.Em left-to-right . +.It Cm rl The resulting transducer will process dictionary entries -\fIright-to-left\fR. -.SH FILES -.B dictionary_file +.Em right-to-left . +.El +.Sh FILES +.Bl -tag -width Ds +.It Ar dictionary_file The input dictionary. -.PP -.B output_file +.It Ar output_file The compiled dictionary (a finite state transducer). -.PP -.B acx_file +.It Ar acx_file Optional XML file of equivalent characters in monodices. - -.SH SEE ALSO -.I lt-proc\fR(1), -.I lt-expand\fR(1), -.I apertium-tagger\fR(1), -.I apertium\fR(1). -.SH BUGS -Lots of...lurking in the dark and waiting for you! -.SH AUTHOR -(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. 
+.El +.Sh SEE ALSO +.Xr apertium 1 , +.Xr apertium-tagger 1 , +.Xr lt-expand 1 , +.Xr lt-proc 1 +.Sh COPYRIGHT +Copyright \(co 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. +You may redistribute copies of it under the terms of +.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . +.Sh BUGS +Many... lurking in the dark and waiting for you! diff --git a/lttoolbox/lt-expand.1 b/lttoolbox/lt-expand.1 index 35ee440f..6d32e5b3 100644 --- a/lttoolbox/lt-expand.1 +++ b/lttoolbox/lt-expand.1 @@ -1,71 +1,60 @@ -.TH lt-expand 1 2006-03-08 "" "" -.SH NAME -lt-expand \- This application is part of the lexical processing modules -and tools ( -.B lttoolbox -) -.PP -This tool is part of the apertium machine translation -architecture: \fBhttp://www.apertium.org\fR. -.SH SYNOPSIS -.B lt-expand -[ -.B \-a \fR| -.B \-v \fR| -.B \-l \fR| -.B \-r \fR| -.B \-h -] -dictionary_file [output_file] -.PP -.B lt-expand -[ -.B \-\-alt \fR| -.B \-\-var \fR| -.B \-\-var\-left \fR| -.B \-\-var\-right \fR| -.B \-\-help -] -dictionary_file [output_file] -.PP -.SH DESCRIPTION -.BR lt-expand -Is the application responsible for expanding a dictionary into a -simple list of input string-output string pairs by eliminating -paradigms through substitution and unfolding. -.PP -The output goes to \fIoutput_file\fR if it is present or to standard -output if it is missing. 
-.PP -.SH OPTIONS -.TP -.B \-a, \-\-alt -Sets the value of the \fIalt\fR attribute to use in expansion -.TP -.B \-v, \-\-var -Sets the value of the \fIv\fR attribute to use in expansion of monodixes -.TP -.B \-l, \-\-var\-left -Sets the value of the \fIvl\fR attribute to use in expansion of bidixes -.TP -.B \-r, \-\-var\-right -Sets the value of the \fIvr\fR attribute to use in expansion of bidixes -.TP -.B \-h, \-\-help +.Dd March 8, 2006 +.Dt LT-EXPAND 1 +.Os Apertium +.Sh NAME +.Nm lt-expand +.Nd dictionary expander for Apertium +.Sh SYNOPSIS +.Nm lt-expand +.Op Fl a | v | l | r | h +.Ar dictionary_file +.Op Ar output_file +.Sh DESCRIPTION +.Nm lt-expand +is the application responsible for expanding a dictionary +into a simple list of input string-output string pairs +by eliminating paradigms through substitution and unfolding. +.Pp +The output goes to +.Ar output_file +if it is present or to standard output if it is missing. +.Sh OPTIONS +.Bl -tag -width Ds +.It Fl a , Fl Fl alt +Sets the value of the +.Sy alt +attribute to use in expansion +.It Fl v , Fl Fl var +Sets the value of the +.Sy v +attribute to use in expansion of monodixes +.It Fl l , Fl Fl var-left +Sets the value of the +.Sy vl +attribute to use in expansion of bidixes +.It Fl r , Fl Fl var-right +Sets the value of the +.Sy vr +attribute to use in expansion of bidixes +.It Fl h , Fl Fl help Prints a short help message -.PP -.SH FILES -.B dictionary_file +.El +.Sh FILES +.Bl -tag -width Ds +.It Ar dictionary_file The input dictionary to expand. -.PP -.B output_file +.It Ar output_file Text containing the expanded dictionary information. -.SH SEE ALSO -.I lt-proc\fR(1), -.I lt-comp\fR(1), -.I apertium-tagger\fR(1), -.I apertium\fR(1). -.SH BUGS -Lots of...lurking in the dark and waiting for you! -.SH AUTHOR -(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. 
+.El +.Sh SEE ALSO +.Xr apertium 1 , +.Xr apertium-tagger 1 , +.Xr lt-comp 1 , +.Xr lt-proc 1 +.Sh COPYRIGHT +Copyright \(co 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. +You may redistribute copies of it under the terms of +.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . +.Sh BUGS +Many... lurking in the dark and waiting for you! diff --git a/lttoolbox/lt-print.1 b/lttoolbox/lt-print.1 index 0486b8a5..fc34c944 100644 --- a/lttoolbox/lt-print.1 +++ b/lttoolbox/lt-print.1 @@ -1,34 +1,32 @@ -.TH lt-print 1 2006-03-08 "" "" -.SH NAME -lt-print \- This application is part of the lexical processing modules -and tools ( -.B lttoolbox -) -.PP -This tool is part of the apertium machine translation -architecture: \fBhttp://www.apertium.org\fR. -.SH SYNOPSIS -.B lt-print - bin_file -.PP -.SH DESCRIPTION -.BR lt-print -Is the application responsible for printing compiled dictionaries in -ATT format. -.PP -.B bin_file -The compiled input file . -.PP -.B output_file -The transducer in ATT format . - -.SH SEE ALSO -.I lt-comp\fR(1), -.I lt-proc\fR(1), -.I lt-expand\fR(1), -.I apertium-tagger\fR(1), -.I apertium\fR(1). -.SH BUGS +.Dd March 8, 2006 +.Dt LT-PRINT 1 +.Os Apertium +.Sh NAME +.Nm lt-print +.Nd compiled dictionary printer for Apertium +.Sh SYNOPSIS +.Nm lt-print +.Ar bin_file +.Op Ar output_file +.Sh DESCRIPTION +.Nm lt-print +is the application responsible for printing compiled dictionaries in ATT format. +.Bl -tag -width Ds +.It Ar bin_file +The compiled input file. +.It Ar output_file +The transducer in ATT format. +.El +.Sh SEE ALSO +.Xr apertium 1 , +.Xr apertium-tagger 1 , +.Xr lt-comp 1 , +.Xr lt-expand 1 , +.Xr lt-proc 1 +.Sh COPYRIGHT +Copyright \(co 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. +You may redistribute copies of it under the terms of +.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . 
+.Sh BUGS Currently requires a UTF-8 locale (and doesn't crash if it doesn't have one). -.SH AUTHOR -(c) 2005--2016 Universitat d'Alacant / Universidad de Alicante. diff --git a/lttoolbox/lt-proc.1 b/lttoolbox/lt-proc.1 index 2d847e06..8972f2e2 100644 --- a/lttoolbox/lt-proc.1 +++ b/lttoolbox/lt-proc.1 @@ -1,178 +1,180 @@ -.TH lt-proc 1 2006-03-23 "" "" -.SH NAME -lt-proc \- This application is part of the lexical processing modules -and tools ( -.B lttoolbox -) -.PP -This tool is part of the apertium machine translation -architecture: \fBhttp://www.apertium.org\fR. -.SH SYNOPSIS -.B lt-proc -[ -.B \-a \fR| -.B \-b \fR| -.B \-o \fR| -.B \-c \fR| -.B \-d \fR| -.B \-e \fR| -.B \-g \fR| -.B \-n \fR| -.B \-p \fR| -.B \-s \fR| -.B \-t \fR| -.B \-v \fR| -.B \-h -.B \-z -.B \-w -] [ -.B \-i \fR icx_file -] fst_file [input_file [output_file]] -.PP -.B lt-proc -[ -.B \-\-analysis \fR| -.B \-\-bilingual \fR| -.B \-\-surf\-bilingual \fR| -.B \-\-case\-sensitive \fR| -.B \-\-debugged\-gen \fR| -.B \-\-decompose\-nouns \fR| -.B \-\-generation \fR| -.B \-\-non\-marked\-gen \fR| -.B \-\-tagged\-gen \fR| -.B \-\-post\-generation \fR| -.B \-\-sao \fR| -.B \-\-transliteration \fR| -.B \-\-null\-flush -.B \-\-dictionary\-case -.B \-\-decompose\-compounds \fR| -.B \-\-version \fR| -.B \-\-help -] [ -.B \-\-ignored\-chars \fR icx_file -] fst_file [input_file [output_file]] -.SH DESCRIPTION -.BR lt-proc +.Dd March 23, 2006 +.Dt LT-PROC 1 +.Os Apertium +.Sh NAME +.Nm lt-proc +.Nd lexical processor for Apertium +.Sh SYNOPSIS +.Nm lt-proc +.Op Fl a | b | o | c | d | e | g | n | p | s | t | v | h | z | w +.Op Fl W +.Op Fl N N +.Op Fl L N +.Op Fl i Ar icx_file +.Ar fst_file +.Op Ar input_file Op Ar output_file +.Sh DESCRIPTION +.Nm lt-proc is the application responsible for providing the four lexical -processing functionalities - -.RS -\(bu \fImorphological analyser\fR ( option \fB\-a\fR ) -.PP -\(bu \fIlexical transfer\fR ( option \fB\-n\fR ) -.PP -\(bu \fImorphological generator\fR ( 
option \fB\-g\fR ) -.PP -\(bu \fIpost-generator\fR ( option \fB\-p\fR ) -.RE -\fR -.PP +processing functionalities: +.Bl -bullet +.It +morphological analyser +.Pq option Fl a +.It +lexical transfer +.Pq option Fl n +.It +morphological generator +.Pq option Fl g +.It +post-generator +.Pq option Fl p +.El +.Pp It accomplishes these tasks by reading binary files containing a compact and efficient representation of dictionaries (a class of -finite-state transducers called augmented letter transducers). These -files are generated by \fBlt\-comp(1)\fR. -.PP -It is worth to mention that some characters -(`\fB[\fR', `\fB]\fR', `\fB$\fR', `\fB^\fR', `\fB/\fR', `\fB+\fR') are -\fIspecial\fR chars used for format and encapsulation. They should be -escaped if they have to be used literally, for -instance: `\fB[\fR'...`\fB]\fR' are ignored and the format of a -\fIlinefeed\fR is `\fB^\fR...\fB$\fR'. -.SH OPTIONS -.TP -.B \-a, \-\-analysis -Tokenizes the text in surface forms (lexical units as they appear in -texts) and delivers, for each surface form, one or more lexical forms -consisting of lemma, lexical category and morphological inflection -information. Tokenization is not straightforward due to the existence, -on the one hand, of contractions, and, on the other hand, of -multi-word lexical units. For contractions, the system reads in a -single surface form and delivers the corresponding sequence of lexical -forms. Multi-word surface forms are analysed in a left-to-right, -longest-match fashion. Multi-word surface forms may be invariable +finite-state transducers called augmented letter transducers). +These files are generated by +.Xr lt-comp 1 . +.Pp +It is worth mentioning that some characters +.Po +.Ql \&[ , +.Ql \&] , +.Ql $ , +.Ql \(a^ , +.Ql / , +.Ql + +.Pc +are +.Em special +chars used for format and encapsulation. +They should be escaped if they have to be used literally, for instance: +.So \&[ Sc Ns Ar ... 
Ns So \&] Sc +are ignored and the format of a +.Em linefeed +is +.So \(a^ Ns Ar ... Ns $ Sc . +.Sh OPTIONS +.Bl -tag -width Ds +.It Fl a , Fl Fl analysis +Tokenizes the text in surface forms (lexical units as they appear in texts) +and delivers, for each surface form, one or more lexical forms +consisting of lemma, lexical category and morphological inflection information. +Tokenization is not straightforward due to the existence, on the one hand, +of contractions, and, on the other hand, of multi-word lexical units. +For contractions, the system reads in a single surface form +and delivers the corresponding sequence of lexical forms. +Multi-word surface forms are analysed in a left-to-right, +longest-match fashion. +Multi-word surface forms may be invariable (such as a multi-word preposition or conjunction) or inflected (for -example, in es, \fI"echaban de menos"\fR, \(dqthey missed\(dq, is a -form of the imperfect indicative tense of the verb \fI"echar de -menos"\fR, \(dqto miss\(dq). Limited support for some kinds of -discontinuous multi-word units is also available. Single-word surface -forms analysis produces output like the one in these examples: -\ \fI"cantar"\fR \-> `\fI^cantar/cantar$\fR' or -\ `\fI"daba"\fR \-> -\ `\fI^daba/dar/dar$\fR'. -.TP -.B \-b, \-\-bilingual +example, in es, +.Dq echaban de menos , +.Dq they missed , +is a form of the imperfect indicative tense of the verb +.Dq echar de menos , +.Dq to miss ) . +Limited support for some kinds of discontinuous multi-word units +is also available. +Single-word surface forms analysis produces output +like the one in these examples: +.Pp +.Dq cantar +\(-> +.Dq \(a^cantar/cantar$ +or +.Dq daba +\(-> +.Dq \(a^daba/dar/dar$ . +.It Fl b , Fl Fl bilingual Does lexical transference, attaching queues of morphological symbols -not specified in the dictionaries. As the analysis mode, supports -multiple lexical forms in the target language for a given lexical -form in the source language. 
Works typically with the output of -apertium-pretransfer. -.TP -.B \-o, \-\-surf\-bilingual -As with \-b, but takes input from apertium\-tagger \-p , with -surface forms, and if the lexical form is not found in the bilingual +not specified in the dictionaries. +As the analysis mode, supports multiple lexical forms in the target language +for a given lexical form in the source language. +Works typically with the output of +.Xr apertium-pretransfer 1 . +.It Fl o , Fl Fl surf-bilingual +As with +.Fl b , +but takes input from +.Xr apertium-tagger 1 +.Fl p , +with surface forms, and if the lexical form is not found in the bilingual dictionary, it outputs the surface form of the word. -.TP - -.B \-c, \-\-case\-sensitive +.It Fl c , Fl Fl case-sensitive Use the literal case of the incoming characters -.TP -.B \-d, \-\-debugged\-gen -Morph. generation with all the stuff -.TP -.B \-e, \-\-decompose\-compounds +.It Fl d , Fl Fl debugged-gen +Morphological generation with all the stuff +.It Fl e , Fl Fl decompose-compounds Try to treat unknown words as compounds, and decompose them. -.TP -.B \-w, \-\-dictionary\-case +.It Fl w , Fl Fl dictionary-case Use the case information contained in the lexicon, instead of the surface case (only applied in analysis mode). -.TP -.B \-g, \-\-generation -Delivers a target-language surface form for each target-language -lexical form, by suitably inflecting it. -.TP -.B \-n, \-\-non\-marked\-gen -Morphological generation (like \fB\-g\fR) but without unknown word -marks (asterisk `*'). -.TP -.B \-b, \-\-tagged\-gen -Morphological generation (like \fB\-g\fR) but retaining part-of-speech -tags. -.TP -.B \-p, \-\-post\-generation -Performs orthographical operations such as contractions and -apostrophations. 
The post-generator is usually \fIdormant\fR (just -copies the input to the output) until a special \fIalarm\fR symbol -contained in some target-language surface forms \fIwakes\fR it up to -perform a particular string transformation if necessary; then it goes -back to sleep. -.TP -.B \-s, \-\-sao -Input processing is in \fIorthoepikon\fR (previously `\fIsao\fR') -annotation system format: \fBhttp://orthoepikon.sf.net\fR. -.TP -.B \-t, \-\-transliteration +.It Fl g , Fl Fl generation +Delivers a target-language surface form for each target-language lexical form, +by suitably inflecting it. +.It Fl n , Fl Fl non-marked-gen +Morphological generation (like +.Fl g ) +but without unknown word marks (asterisk +.Ql * ) . +.It Fl b , Fl Fl tagged-gen +Morphological generation (like +.Fl g ) +but retaining part-of-speech tags. +.It Fl p , Fl Fl post-generation +Performs orthographical operations such as contractions and apostrophations. +The post-generator is usually +.Em dormant +(just copies the input to the output) until a special +.Em alarm +symbol contained in some target-language surface forms +.Em wakes +it up to perform a particular string transformation if necessary; +then it goes back to sleep. +.It Fl s , Fl Fl sao +Input processing is in +.Em orthoepikon +(previously +.Em sao ) +annotation system format: +.Lk http://orthoepikon.sf.net . 
+.It Fl t , Fl Fl transliteration Apply a transliteration dictionary -.TP -.B \-i, \-\-ignored\-chars icx_file -Ignores characters specified in the file icx_file -.TP -.B \-z, \-\-null\-flush +.It Fl i Ar icx_file , Fl Fl ignored-chars Ar icx_file +Ignores characters specified in the file +.Ar icx_file +.It Fl z , Fl Fl null-flush Flush output on the null character -.TP -.B \-v, \-\-version +.It Fl C , Fl Fl careful-case +Use dictionary case if present, else surface +.It Fl N , Fl Fl analyses +Output no more than N analyses (if the transducer is weighted, the N best analyses) +.It Fl L , Fl Fl weight-classes +Output no more than N best weight classes (where analyses with equal weight constitute a class) +.It Fl W , Fl Fl show-weights +Print final analysis weights (if any) +.It Fl v , Fl Fl version Display the version number. -.TP -.B \-h, \-\-help +.It Fl h , Fl Fl help Display this help. -.SH FILES -.B input_file +.El +.Sh FILES +.Bl -tag -width Ds +.It Ar input_file The input compiled dictionary. -.SH SEE ALSO -.I lt-expand\fR(1), -.I lt-comp\fR(1), -.I apertium-tagger\fR(1), -.I apertium\fR(1). -.SH BUGS -Lots of...lurking in the dark and waiting for you! -.SH AUTHOR -(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. +.El +.Sh SEE ALSO +.Xr apertium 1 , +.Xr apertium-tagger 1 , +.Xr lt-comp 1 , +.Xr lt-expand 1 +.Sh COPYRIGHT +Copyright \(co 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. +You may redistribute copies of it under the terms of +.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . +.Sh BUGS +Many... lurking in the dark and waiting for you! 
diff --git a/lttoolbox/lt-tmxcomp.1 b/lttoolbox/lt-tmxcomp.1 index c6049a53..056dbdc6 100644 --- a/lttoolbox/lt-tmxcomp.1 +++ b/lttoolbox/lt-tmxcomp.1 @@ -1,41 +1,42 @@ -.TH lt-comp 1 2006-03-08 "" "" -.SH NAME -lt-tmxcomp \- This application is part of the lexical processing modules -and tools ( -.B lttoolbox -) -.PP -This tool is part of the apertium machine translation -architecture: \fBhttp://apertium.org\fR. -.SH SYNOPSIS -.B lt-tmxcomp -lang1-lang2 tmx_file output_file -.PP -.SH DESCRIPTION -.BR lt-comp -Is the application responsible for compiling translation memories in -the TMX format used by \fBlt-tmxproc\fR in \fIApertium\fR into a compact -and efficient representation (a class of finite-state transducers called -augmented letter transducers). -.PP -.SH OPTIONS -.TP -.B lang1 +.Dd March 8, 2006 +.Dt LT-TMXCOMP 1 +.Os Apertium +.Sh NAME +.Nm lt-tmxcomp +.Nd translation memories compiler for Apertium +.Sh SYNOPSIS +.Nm lt-tmxcomp +.Ar lang1 Ns - Ns Ar lang2 +.Ar tmx_file +.Ar output_file +.Sh DESCRIPTION +.Nm lt-tmxcomp +is the application responsible for compiling translation memories in +the TMX format used by +.Xr lt-tmxproc 1 +in Apertium into a compact and efficient representation +(a class of finite-state transducers called augmented letter transducers). +.Sh OPTIONS +.Bl -tag -width Ds +.It Ar lang1 Input language -.TP -.B lang2 +.It Ar lang2 Output language -.SH FILES -.B tmx_file +.El +.Sh FILES +.Bl -tag -width Ds +.It Ar tmx_file The input translation memory, in TMX format. -.PP -.B output_file +.It Ar output_file The compiled translation memory (a finite state transducer). - -.SH SEE ALSO -.I lt-tmxproc\fR(1), -.I apertium\fR(1). -.SH BUGS -Lots of...lurking in the dark and waiting for you! -.SH AUTHOR -(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. +.El +.Sh SEE ALSO +.Xr apertium 1 , +.Xr lt-tmxproc 1 +.Sh AUTHOR +Copyright \(co 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. 
+You may redistribute copies of it under the terms of +.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . +.Sh BUGS +Many... lurking in the dark and waiting for you! diff --git a/lttoolbox/lt-tmxproc.1 b/lttoolbox/lt-tmxproc.1 index 5cf1aa5a..6f21cd15 100644 --- a/lttoolbox/lt-tmxproc.1 +++ b/lttoolbox/lt-tmxproc.1 @@ -1,33 +1,35 @@ -.TH lt-proc 1 2006-03-23 "" "" -.SH NAME -lt-tmxproc \- This application is part of the lexical processing modules -and tools ( -.B lttoolbox -) -.PP -This tool is part of the apertium machine translation -architecture: \fBhttp://apertium.org\fR. -.SH SYNOPSIS -.B lt-tmxproc -fst_file [input_file [output_file]] -.PP -.SH DESCRIPTION -.BR lt-tmxproc +.Dd March 23, 2006 +.Dt LT-TMXPROC 1 +.Os Apertium +.Sh NAME +.Nm lt-tmxproc +.Nd translation stream processor for Apertium +.Sh SYNOPSIS +.Nm lt-tmxproc +.Ar fst_file +.Op Ar input_file Op Ar output_file +.Sh DESCRIPTION +.Nm lt-tmxproc is the application responsible for preprocessing the translation stream in Apertium using a compiled translation memory. - -.PP +.Pp It accomplishes these tasks by reading binary files containing a compact and efficient representation of dictionaries (a class of -finite-state transducers called augmented letter transducers). These -files are generated by \fBlt\-tmxcomp(1)\fR. -.SH FILES -.B input_file +finite-state transducers called augmented letter transducers). +These files are generated by +.Xr lt-tmxcomp 1 . +.Sh FILES +.Bl -tag -width Ds +.It Ar input_file The input compiled dictionary. -.SH SEE ALSO -.I lt-tmxcomp\fR(1), -.I apertium\fR(1). -.SH BUGS -Lots of...lurking in the dark and waiting for you! -.SH AUTHOR -(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. +.El +.Sh SEE ALSO +.Xr apertium 1 , +.Xr lt-tmxcomp 1 +.Sh AUTHOR +Copyright \(co 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. 
+You may redistribute copies of it under the terms of +.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . +.Sh BUGS +Many... lurking in the dark and waiting for you! diff --git a/lttoolbox/lt-trim.1 b/lttoolbox/lt-trim.1 index dc84bf8b..45ad3d48 100644 --- a/lttoolbox/lt-trim.1 +++ b/lttoolbox/lt-trim.1 @@ -1,65 +1,113 @@ -.TH lt-trim 1 2014-02-07 "" "" -.SH NAME -lt-trim \- This application is part of the lexical processing modules -and tools ( -.B lttoolbox -) -.PP -This tool is part of the apertium machine translation -architecture: \fBhttp://www.apertium.org\fR. -.SH SYNOPSIS -.B lt-trim -analyser_binary bidix_binary trimmed_analyser_binary -.PP -.SH DESCRIPTION -.BR lt-trim -is the application responsible for trimming compiled dictionaries. The -analyses (right-side when compiling lr) of analyser_binary are trimmed +.Dd February 7, 2014 +.Dt LT-TRIM 1 +.Os Apertium +.Sh NAME +.Nm lt-trim +.Nd compiled dictionary trimmer for Apertium +.Sh SYNOPSIS +.Nm lt-trim +.Ar analyser_binary +.Ar bidix_binary +.Ar trimmed_analyser_binary +.Sh DESCRIPTION +.Nm lt-trim +is the application responsible for trimming compiled dictionaries. +The analyses (right-side when compiling lr) of analyser_binary are trimmed to the input side of bidix_binary (left-side when compiling lr, right-side when compiling rl), such that only analyses which would -pass through `lt-proc \-b bidix_binary' are kept. - -\fBWarning: this program is experimental!\fR It has been tested, but -not deployed extensively yet. - -Both compund tags (`', `') and join -elements (`' in XML, `+' in the stream) and the group element -(`' in XML, `#' in the stream) should be handled correctly, even -combinations of + followed by # in monodix are handled. - -Some minor caveats: If you have the capitalised lemma "Foo" in the -monodix, but "foo" in the bidix, an analysis "^Foo$" would pass -through bidix when doing lt-proc \-b, but will not make it through -trimming. 
Make sure your lemmas have the same capitalisation in the -different dictionaries. Also, you should not have literal `+' or `#' -in your lemmas. Since lt-comp doesn't escape these, lt-trim cannot -know that they are different from `' or `', and you may get -@-marked output this way. You can analyse `+' or `#' by having the -literal symbol in the `' part and some other string (e.g. "plus") -in the `'. - -You should not trim a generator unless you have a \fBvery\fR simple -translator pipeline, since the output of bidix seldom goes unchanged -through transfer. -.PP -.SH FILES -.B analyser_binary +pass through +.So +.Xr lt-proc 1 +.Fl b Cm bidix_binary +.Sc +are kept. +.Pp +.Sy Warning: this program is experimental! +It has been tested, but not deployed extensively yet. +.Pp +Both compound tags +.Po +.Dq , +.Dq +.Pc +and join elements +.Po +.Dq +in XML, +.Dq + +in the stream +.Pc +and the group element +.Po +.Dq +in XML, +.Dq # +in the stream +.Pc +should be handled correctly, +even combinations of + followed by # in monodix are handled. +.Pp +Some minor caveats: If you have the capitalised lemma +.Dq Foo +in the monodix, but +.Dq foo +in the bidix, an analysis +.Dq \(a^Foo$ +would pass through bidix when doing +.Xr lt-proc 1 +.Fl b , +but will not make it through trimming. +Make sure your lemmas have the same capitalisation in the +different dictionaries. +Also, you should not have literal +.Ql + +or +.Ql # +in your lemmas. +Since +.Xr lt-comp 1 +doesn't escape these, +.Nm +cannot know that they are different from +.Dq +or +.Dq , +and you may get @-marked output this way. +You can analyse +.Ql + +or +.Ql # +by having the literal symbol in the +.Dq +part and some other string (e.g., +.Dq plus ) +in the +.Dq . +.Pp +You should not trim a generator unless you have a +.Em very +simple translator pipeline, +since the output of bidix seldom goes unchanged through transfer. 
+.Sh FILES +.Bl -tag -width Ds +.It Ar analyser_binary The untrimmed analyser dictionary (a finite state transducer). -.PP -.B bidix_binary +.It Ar bidix_binary The dictionary to use as trimmer (a finite state transducer). -.PP -.B trimmed_analyser_binary +.It Ar trimmed_analyser_binary The trimmed analyser dictionary (a finite state transducer). - -.SH SEE ALSO -.I lt-comp\fR(1), -.I lt-proc\fR(1), -.I lt-print\fR(1), -.I lt-expand\fR(1), -.I apertium-tagger\fR(1), -.I apertium\fR(1). -.SH BUGS -Lots of...lurking in the dark and waiting for you! -.SH AUTHOR -(c) 2013--2014 Universitat d'Alacant / Universidad de Alicante. +.El +.Sh SEE ALSO +.Xr apertium 1 , +.Xr apertium-tagger 1 , +.Xr lt-comp 1 , +.Xr lt-expand 1 , +.Xr lt-print 1 , +.Xr lt-proc 1 +.Sh AUTHOR +Copyright \(co 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. +You may redistribute copies of it under the terms of +.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . +.Sh BUGS +Many... lurking in the dark and waiting for you! 
diff --git a/lttoolbox/lt_proc.cc b/lttoolbox/lt_proc.cc index 26dd92f1..ba912c97 100644 --- a/lttoolbox/lt_proc.cc +++ b/lttoolbox/lt_proc.cc @@ -38,7 +38,7 @@ using namespace std; void endProgram(char *name) { cout << basename(name) << ": process a stream with a letter transducer" << endl; - cout << "USAGE: " << basename(name) << " [ -a | -b | -c | -d | -e | -g | -n | -p | -x | -s | -t | -v | -h -z -w ] [-W] [-N N] [-L N] [ -i icx_file ] [ -r rcx_file ] fst_file [input_file [output_file]]" << endl; + cout << "USAGE: " << basename(name) << " [ -a | -b | -c | -d | -e | -g | -n | -p | -x | -s | -t | -v | -h | -z | -w ] [-W] [-N N] [-L N] [ -i icx_file ] [ -r rcx_file ] fst_file [input_file [output_file]]" << endl; cout << "Options:" << endl; #if HAVE_GETOPT_LONG cout << " -a, --analysis: morphological analysis (default behavior)" << endl; @@ -249,21 +249,21 @@ int main(int argc, char *argv[]) FILE *in = fopen(argv[optind], "rb"); if(in == NULL || ferror(in)) { - wcerr << "Error: Cannot not open file '" << argv[optind] << "'." << endl << endl; + wcerr << "Error: Cannot open file '" << argv[optind] << "'." << endl << endl; exit(EXIT_FAILURE); } input = fopen(argv[optind+1], "rb"); if(input == NULL || ferror(input)) { - wcerr << "Error: Cannot not open file '" << argv[optind+1] << "'." << endl << endl; + wcerr << "Error: Cannot open file '" << argv[optind+1] << "'." << endl << endl; exit(EXIT_FAILURE); } output= fopen(argv[optind+2], "wb"); if(output == NULL || ferror(output)) { - wcerr << "Error: Cannot not open file '" << argv[optind+2] << "'." << endl << endl; + wcerr << "Error: Cannot open file '" << argv[optind+2] << "'." << endl << endl; exit(EXIT_FAILURE); } @@ -275,14 +275,14 @@ int main(int argc, char *argv[]) FILE *in = fopen(argv[optind], "rb"); if(in == NULL || ferror(in)) { - wcerr << "Error: Cannot not open file '" << argv[optind] << "'." << endl << endl; + wcerr << "Error: Cannot open file '" << argv[optind] << "'." 
<< endl << endl; exit(EXIT_FAILURE); } input = fopen(argv[optind+1], "rb"); if(input == NULL || ferror(input)) { - wcerr << "Error: Cannot not open file '" << argv[optind+1] << "'." << endl << endl; + wcerr << "Error: Cannot open file '" << argv[optind+1] << "'." << endl << endl; exit(EXIT_FAILURE); } @@ -294,7 +294,7 @@ int main(int argc, char *argv[]) FILE *in = fopen(argv[optind], "rb"); if(in == NULL || ferror(in)) { - wcerr << "Error: Cannot not open file '" << argv[optind] << "'." << endl << endl; + wcerr << "Error: Cannot open file '" << argv[optind] << "'." << endl << endl; exit(EXIT_FAILURE); } fstp.load(in); diff --git a/lttoolbox/lt_trim.cc b/lttoolbox/lt_trim.cc index 90c25b39..38fa5516 100644 --- a/lttoolbox/lt_trim.cc +++ b/lttoolbox/lt_trim.cc @@ -171,13 +171,13 @@ int main(int argc, char *argv[]) FILE *analyser = fopen(argv[1], "rb"); if(!analyser) { - wcerr << "Error: Cannot not open file '" << argv[1] << "'." << endl << endl; + wcerr << "Error: Cannot open file '" << argv[1] << "'." << endl << endl; exit(EXIT_FAILURE); } FILE *bidix = fopen(argv[2], "rb"); if(!bidix) { - wcerr << "Error: Cannot not open file '" << argv[2] << "'." << endl << endl; + wcerr << "Error: Cannot open file '" << argv[2] << "'." << endl << endl; exit(EXIT_FAILURE); } @@ -205,7 +205,7 @@ int main(int argc, char *argv[]) FILE *output = fopen(argv[3], "wb"); if(!output) { - wcerr << "Error: Cannot not open file '" << argv[3] << "'." << endl << endl; + wcerr << "Error: Cannot open file '" << argv[3] << "'." << endl << endl; exit(EXIT_FAILURE); } diff --git a/lttoolbox/xsd/dix.xsd b/lttoolbox/xsd/dix.xsd index b7e5fc97..66896207 100644 --- a/lttoolbox/xsd/dix.xsd +++ b/lttoolbox/xsd/dix.xsd @@ -120,6 +120,7 @@ + diff --git a/tests/data/intergen.dix b/tests/data/intergen.dix new file mode 100644 index 00000000..d571ebfb --- /dev/null +++ b/tests/data/intergen.dix @@ -0,0 +1,10 @@ + + + + + + +
+

dónadona

+
+
diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py index 4ebfc77b..3f325f82 100644 --- a/tests/lt_proc/__init__.py +++ b/tests/lt_proc/__init__.py @@ -85,5 +85,12 @@ class AllEntryWeights(unittest.TestCase, ProcTest): procflags = ["-W"] inputs = ["nanow"] expectedOutputs = ["^nanow/nan/nan/nan/nan$"] + +class Intergeneration(unittest.TestCase, ProcTest): + procdix = "data/intergen.dix" + procflags = ["-x"] + inputs = ["la dona ~dóna tot"] + expectedOutputs = ["la dona dona tot"] + # These fail on some systems: #from null_flush_invalid_stream_format import *