diff --git a/.gitignore b/.gitignore index 92eb308d..c935d8c3 100644 --- a/.gitignore +++ b/.gitignore @@ -74,6 +74,9 @@ /lttoolbox/lt-append /lttoolbox/lsx-comp /lttoolbox/lt-paradigm +/lttoolbox/lt-invert +/lttoolbox/lt-restrict +/lttoolbox/lt-apply-acx /python/Makefile /python/Makefile.in /python/lttoolbox.i diff --git a/configure.ac b/configure.ac index 1a329446..9a1da36b 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ AC_PREREQ(2.52) m4_define([PKG_VERSION_MAJOR], [3]) m4_define([PKG_VERSION_MINOR], [6]) -m4_define([PKG_VERSION_PATCH], [9]) +m4_define([PKG_VERSION_PATCH], [10]) AC_INIT([lttoolbox], [PKG_VERSION_MAJOR.PKG_VERSION_MINOR.PKG_VERSION_PATCH], [apertium-stuff@lists.sourceforge.net], [lttoolbox], [https://wiki.apertium.org/wiki/Lttoolbox]) diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index 4f031cd0..ea4c7289 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -1,11 +1,11 @@ -h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ +h_sources = acx.h alphabet.h att_compiler.h buffer.h cli.h compiler.h compression.h \ deserialiser.h entry_token.h expander.h file_utils.h fst_processor.h input_file.h lt_locale.h \ match_exe.h match_node.h match_state.h my_stdio.h node.h \ pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h \ transducer.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ ustring.h sorted_vector.hpp -cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ +cc_sources = acx.cc alphabet.cc att_compiler.cc cli.cc compiler.cc compression.cc entry_token.cc \ expander.cc file_utils.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \ match_node.cc match_state.cc node.cc pattern_list.cc \ regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc transducer.cc \ @@ -14,7 +14,7 @@ cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token. 
library_includedir = $(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME) library_include_HEADERS = $(h_sources) -bin_PROGRAMS = lt-comp lt-proc lt-expand lt-paradigm lt-tmxcomp lt-tmxproc lt-print lt-trim lt-append lsx-comp +bin_PROGRAMS = lt-comp lt-proc lt-expand lt-paradigm lt-tmxcomp lt-tmxproc lt-print lt-trim lt-append lsx-comp lt-invert lt-restrict lt-apply-acx instdir = lttoolbox lib_LTLIBRARIES= liblttoolbox3.la @@ -41,6 +41,9 @@ lt_paradigm_SOURCES = lt_paradigm.cc lt_tmxcomp_SOURCES = lt_tmxcomp.cc lt_tmxproc_SOURCES = lt_tmxproc.cc lsx_comp_SOURCES = lt_comp.cc +lt_invert_SOURCES = lt_invert.cc +lt_restrict_SOURCES = lt_restrict.cc +lt_apply_acx_SOURCES = lt_apply_acx.cc #lt-validate-dictionary: Makefile.am validate-header.sh # @echo "Creating lt-validate-dictionary script"
diff --git a/lttoolbox/acx.cc b/lttoolbox/acx.cc
new file mode 100644
index 00000000..8de7a7ee
--- /dev/null
+++ b/lttoolbox/acx.cc
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2022 Apertium
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses/>.
+ */
+#include <lttoolbox/acx.h>
+#include <lttoolbox/xml_walk_util.h>
+
+const xmlChar* CHAR_NODE = (const xmlChar*)"char";
+const xmlChar* EQUIV_NODE = (const xmlChar*)"equiv-char";
+const char* VALUE_ATTR = "value";
+
+/**
+ * Return the single code point held in the "value" attribute of node.
+ * Reports through error_and_die() when the attribute is missing or empty,
+ * or when it does not decode to exactly one character.
+ */
+int32_t get_val(xmlNode* node)
+{
+  UString s = getattr(node, VALUE_ATTR);
+  if (s.empty()) {
+    error_and_die(node, "Missing value attribute.");
+  }
+  std::vector<int32_t> v;
+  ustring_to_vec32(s, v);
+  // != 1 rather than > 1, so that v[0] below can never index an empty vector
+  if (v.size() != 1) {
+    // cast: the message is formatted printf-style and a size_t does not match %d
+    error_and_die(node, "Expected a single character in value attribute, but found %d.",
+                  static_cast<int>(v.size()));
+  }
+  return v[0];
+}
+
+/**
+ * Read an ACX file: every <char value="x"> child of the node returned by
+ * load_xml() maps x to the set of values of its <equiv-char> children.
+ * Characters without equivalents are left out of the result; a character
+ * that is listed more than once has its equivalents merged.
+ */
+std::map<int32_t, sorted_vector<int32_t>> readACX(const char* file)
+{
+  std::map<int32_t, sorted_vector<int32_t>> acx;
+  xmlNode* top_node = load_xml(file);
+  for (auto char_node : children(top_node)) {
+    if (!xmlStrEqual(char_node->name, CHAR_NODE)) {
+      error_and_die(char_node, "Expected <char> but found <%s>.",
+                    (const char*)char_node->name);
+    }
+    int32_t key = get_val(char_node);
+    sorted_vector<int32_t> vec;
+    for (auto equiv_node : children(char_node)) {
+      if (!xmlStrEqual(equiv_node->name, EQUIV_NODE)) {
+        // report against the offending child, not its parent <char>
+        error_and_die(equiv_node, "Expected <equiv-char> but found <%s>.",
+                      (const char*)equiv_node->name);
+      }
+      vec.insert(get_val(equiv_node));
+    }
+    if (!vec.empty()) {
+      // merge instead of map::insert(), which would silently drop the
+      // equivalents of a repeated <char>
+      auto& dest = acx[key];
+      for (auto& c : vec) {
+        dest.insert(c);
+      }
+    }
+  }
+  return acx;
+}
diff --git a/lttoolbox/acx.h b/lttoolbox/acx.h new file mode 100644 index 00000000..3f7223d7 --- /dev/null +++ b/lttoolbox/acx.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2022 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#ifndef _ACXPARSEUTIL_ +#define _ACXPARSEUTIL_ + +#include +#include + +std::map> readACX(const char* file); + +#endif
diff --git a/lttoolbox/cli.cc b/lttoolbox/cli.cc
new file mode 100644
index 00000000..ce35a355
--- /dev/null
+++ b/lttoolbox/cli.cc
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2022 Apertium
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <lttoolbox/cli.h>
+
+#include <cstdlib>
+#include <getopt.h>
+#include <iostream>
+#include <libgen.h>
+#include <unistd.h>
+
+// Create a parser whose usage banner shows both description and version.
+CLI::CLI(std::string desc, std::string ver)
+  : description(desc), version(ver)
+{
+}
+
+// Create a parser without a version string; the banner then omits " v...".
+CLI::CLI(std::string desc)
+  : description(desc)
+{
+}
+
+CLI::~CLI()
+{
+}
+
+// Register an option taking a value; `arg` is the placeholder name shown
+// in the USAGE line. Values are collected under `long_flag` in get_strs().
+void CLI::add_str_arg(char short_flag, std::string long_flag,
+                      std::string desc, std::string arg)
+{
+  // Positional aggregate initialisation, in CLIOption member order:
+  // designated initialisers are only standard C++ from C++20 on.
+  options.push_back({short_flag, long_flag, desc, false, arg});
+}
+
+// Register a flag taking no value; it is recorded under `long_flag` in
+// get_bools() when seen on the command line.
+void CLI::add_bool_arg(char short_flag, std::string long_flag,
+                       std::string desc)
+{
+  options.push_back({short_flag, long_flag, desc, true, ""});
+}
+
+// Declare the next positional argument. Non-optional arguments raise the
+// minimum number of positionals that parse_args() accepts.
+void CLI::add_file_arg(std::string name, bool optional)
+{
+  file_args.push_back(std::make_pair(name, optional));
+  if (!optional) min_file_args++;
+}
+
+// Set text printed after the option list in the usage message.
+void CLI::set_epilog(std::string e)
+{
+  epilog = e;
+}
+
+// Print the usage message (skipped while the program name is still unset,
+// i.e. before parse_args()) and terminate with EXIT_FAILURE.
+void CLI::print_usage()
+{
+  if (!prog_name.empty()) {
+    std::cout << prog_name;
+    if (!version.empty()) {
+      std::cout << " v" << version;
+    }
+    std::cout << ": " << description << std::endl;
+    std::cout << "USAGE: " << prog_name;
+    std::string bargs;
+    std::string sargs;
+    for (auto& it : options) {
+      if (it.is_bool) {
+        bargs += it.short_opt;
+      } else {
+        sargs += " [-";
+        sargs += it.short_opt;
+        sargs += ' ';
+        sargs += it.var;
+        sargs += ']';
+      }
+    }
+    if (!bargs.empty()) {
+      std::cout << " [-" << bargs << "]";
+    }
+    std::cout << sargs;
+    // optional positionals nest ("a [b [c]]"), so all brackets close at the end
+    int depth = 0;
+    for (auto& it : file_args) {
+      std::cout << ' ';
+      if (it.second) {
+        std::cout << '[';
+        depth += 1;
+      }
+      std::cout << it.first;
+    }
+    while (depth-- > 0) std::cout << "]";
+    std::cout << std::endl;
+    for (auto& it : options) {
+      std::cout << " -" << it.short_opt;
+#if HAVE_GETOPT_LONG
+      std::cout << ", --" << it.long_opt << ':';
+      for (size_t i = it.long_opt.size(); i < 20; i++) {
+        std::cout << ' ';
+      }
+#else
+      std::cout << ": ";
+#endif
+      std::cout << it.desc << std::endl;
+    }
+    if (!epilog.empty()) {
+      std::cout << epilog << std::endl;
+    }
+  }
+  exit(EXIT_FAILURE);
+}
+
+// Parse argv with getopt()/getopt_long(), filling the containers returned
+// by get_strs(), get_bools() and get_files(). Prints usage and exits on an
+// unknown option, on -h, or on a wrong number of positional arguments.
+// Omitted optional positionals are padded with "" so that get_files()
+// always has one entry per declared file argument.
+void CLI::parse_args(int argc, char* argv[])
+{
+  prog_name = basename(argv[0]);
+  std::string arg_str;
+#if HAVE_GETOPT_LONG
+  // getopt_long() requires the array to end with an all-zero element;
+  // the vector value-initialises every entry, so the extra last one is
+  // that terminator (this also avoids a non-standard VLA).
+  std::vector<struct option> long_options(options.size() + 1);
+  int option_index = 0;
+#endif
+  for (size_t i = 0; i < options.size(); i++) {
+    arg_str += options[i].short_opt;
+    if (!options[i].is_bool) arg_str += ':';
+#if HAVE_GETOPT_LONG
+    long_options[i].name = options[i].long_opt.c_str();
+    long_options[i].has_arg = (options[i].is_bool ? no_argument : required_argument);
+    long_options[i].flag = 0;
+    long_options[i].val = options[i].short_opt;
+#endif
+  }
+
+  while (true) {
+#if HAVE_GETOPT_LONG
+    int cnt = getopt_long(argc, argv, arg_str.c_str(), long_options.data(), &option_index);
+#else
+    int cnt = getopt(argc, argv, arg_str.c_str());
+#endif
+    if (cnt == -1) break;
+
+    bool found = false;
+    for (auto& it : options) {
+      if (it.short_opt == cnt) {
+        found = true;
+        if (it.short_opt == 'v' && it.long_opt == "version") {
+          std::cout << prog_name << " version " << version << std::endl;
+          exit(EXIT_SUCCESS);
+        }
+        if (it.is_bool) {
+          bools[it.long_opt] = true;
+        } else {
+          strs[it.long_opt].push_back(optarg);
+        }
+        break;
+      }
+    }
+    if (!found || cnt == 'h') {
+      print_usage();
+    }
+  }
+  while (optind < argc) {
+    files.push_back(argv[optind++]);
+  }
+  if (files.size() < min_file_args || files.size() > file_args.size()) {
+    print_usage();
+  }
+  while (files.size() < file_args.size()) {
+    files.push_back("");
+  }
+}
+
+std::map<std::string, std::vector<std::string>>& CLI::get_strs()
+{
+  return strs;
+}
+
+std::map<std::string, bool>& CLI::get_bools()
+{
+  return bools;
+}
+
+std::vector<std::string>& CLI::get_files()
+{
+  return files;
+}
diff --git a/lttoolbox/cli.h b/lttoolbox/cli.h
new file mode 100644
index 00000000..df6acfb2
--- /dev/null
+++ b/lttoolbox/cli.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2022 Apertium
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef _LT_CLI_H_
+#define _LT_CLI_H_
+
+#include <map>
+#include <string>
+#include <vector>
+
+/**
+ * Declarative command-line parser shared by the lt-* tools: register
+ * options and positional file arguments, call parse_args(), then read
+ * the results through get_strs(), get_bools() and get_files().
+ */
+class CLI {
+private:
+  struct CLIOption {
+    char short_opt;        // single-letter flag, e.g. 'v'
+    std::string long_opt;  // long name; also the key in strs/bools
+    std::string desc;      // help text for the usage message
+    bool is_bool;          // true: plain flag, false: takes a value
+    std::string var;       // placeholder for the value in the usage line
+  };
+
+  std::string description;
+  std::string version;
+  std::string epilog;
+
+  std::vector<CLIOption> options;
+  // (name, optional) for each positional argument, in declaration order
+  std::vector<std::pair<std::string, bool>> file_args;
+  size_t min_file_args = 0;
+
+  // parse results, keyed by long option name
+  std::map<std::string, std::vector<std::string>> strs;
+  std::map<std::string, bool> bools;
+  std::vector<std::string> files;
+
+  std::string prog_name;
+
+public:
+  CLI(std::string desc, std::string version);
+  CLI(std::string desc);
+  ~CLI();
+  void add_str_arg(char short_flag, std::string long_flag, std::string desc,
+                   std::string arg);
+  void add_bool_arg(char short_flag, std::string long_flag, std::string desc);
+  void add_file_arg(std::string name, bool optional = true);
+  void set_epilog(std::string e);
+  void print_usage();
+  void parse_args(int argc, char* argv[]);
+  std::map<std::string, std::vector<std::string>>& get_strs();
+  std::map<std::string, bool>& get_bools();
+  std::vector<std::string>& get_files();
+};
+
+#endif
diff --git a/lttoolbox/compiler.cc b/lttoolbox/compiler.cc index 73d3ad17..c1cc912a 100644 --- a/lttoolbox/compiler.cc +++ b/lttoolbox/compiler.cc @@ -15,17 +15,14 @@ * along with this program; if not, see . 
*/ #include -#include -#include -#include #include #include #include +#include +#include -#include -#include #include -#include +#include UString const Compiler::COMPILER_DICTIONARY_ELEM = "dictionary"_u; UString const Compiler::COMPILER_ALPHABET_ELEM = "alphabet"_u; @@ -39,6 +36,7 @@ UString const Compiler::COMPILER_ENTRY_ELEM = "e"_u; UString const Compiler::COMPILER_RESTRICTION_ATTR = "r"_u; UString const Compiler::COMPILER_RESTRICTION_LR_VAL = "LR"_u; UString const Compiler::COMPILER_RESTRICTION_RL_VAL = "RL"_u; +UString const Compiler::COMPILER_RESTRICTION_U_VAL = "U"_u; UString const Compiler::COMPILER_PAIR_ELEM = "p"_u; UString const Compiler::COMPILER_LEFT_ELEM = "l"_u; UString const Compiler::COMPILER_RIGHT_ELEM = "r"_u; @@ -90,31 +88,20 @@ Compiler::parseACX(std::string const &file, UString const &dir) { if(dir == COMPILER_RESTRICTION_LR_VAL) { - reader = xmlReaderForFile(file.c_str(), NULL, 0); - if(reader == NULL) - { - std::cerr << "Error: cannot open '" << file << "'." << std::endl; - exit(EXIT_FAILURE); - } - int ret = xmlTextReaderRead(reader); - while(ret == 1) - { - procNodeACX(); - ret = xmlTextReaderRead(reader); - } + acx_map = readACX(file.c_str()); } } void Compiler::parse(std::string const &file, UString const &dir) { - direction = dir; - reader = xmlReaderForFile(file.c_str(), NULL, 0); - if(reader == NULL) - { - std::cerr << "Error: Cannot open '" << file << "'." 
<< std::endl; - exit(EXIT_FAILURE); + if (dir == COMPILER_RESTRICTION_U_VAL) { + direction = COMPILER_RESTRICTION_LR_VAL; + unified_compilation = true; + } else { + direction = dir; } + reader = XMLParseUtil::open_or_exit(file.c_str()); int ret = xmlTextReaderRead(reader); while(ret == 1) @@ -284,7 +271,7 @@ Compiler::matchTransduction(std::vector const &pi, } else { - std::map >::iterator acx_map_ptr; + std::map >::iterator acx_map_ptr; int rsymbol = 0; while(true) @@ -350,7 +337,7 @@ Compiler::matchTransduction(std::vector const &pi, { for(auto& it : acx_map_ptr->second) { - t.linkStates(state, new_state, alphabet(it ,rsymbol), weight_value); + t.linkStates(state, new_state, alphabet(it, rsymbol), weight_value); } } state = new_state; @@ -375,15 +362,7 @@ Compiler::requireEmptyError(UString const &name) bool Compiler::allBlanks() { - bool flag = true; - UString text = XMLParseUtil::readValue(reader); - - for(auto c : text) - { - flag = flag && u_isspace(c); - } - - return flag; + return XMLParseUtil::allBlanks(reader); } void @@ -789,6 +768,39 @@ Compiler::procSection() } } +bool +Compiler::filterEntry(const UString& value, const UString& filter, + bool keep_on_empty_filter) +{ + if (value.empty()) return true; + else if (keep_on_empty_filter && filter.empty()) return true; + auto ops = StringUtils::split(value, " "_u); + for (auto& it : ops) { + if (it == filter) return true; + } + return false; +} + +void +Compiler::symbolFilters(const UString& value, const UString& prefix, + std::vector>& symbols) +{ + if (value.empty()) return; + std::vector syms; + for (auto& it : StringUtils::split(value, " "_u)) { + if (it.empty()) continue; + UString tag; + tag += '<'; + tag += prefix; + tag += ':'; + tag += it; + tag += '>'; + alphabet.includeSymbol(tag); + syms.push_back(alphabet(tag)); + } + if (!syms.empty()) symbols.push_back(syms); +} + void Compiler::procEntry() { @@ -800,13 +812,74 @@ Compiler::procEntry() UString varr = this->attrib(COMPILER_VR_ATTR); UString 
wsweight = this->attrib(COMPILER_WEIGHT_ATTR); + std::vector elements; + // if entry is masked by a restriction of direction or an ignore mark - if((!attribute.empty() && attribute != direction) + if (unified_compilation && ignore != COMPILER_IGNORE_YES_VAL) { + std::vector> symbols; + symbolFilters(attribute, "r"_u, symbols); + symbolFilters(altval, "alt"_u, symbols); + symbolFilters(varval, "v"_u, symbols); + symbolFilters(varl, "vl"_u, symbols); + symbolFilters(varr, "vr"_u, symbols); + if (!symbols.empty()) { + bool multi = false; + for (auto& it : symbols) { + if (it.size() > 1) { + multi = true; + break; + } + } + if (multi) { + UString parname = "--"_u; + parname += attribute; + parname += '-'; + parname += altval; + parname += '-'; + parname += varval; + parname += '-'; + parname += varl; + parname += '-'; + parname += varr; + if (paradigms.find(parname) == paradigms.end()) { + std::vector re; + for (auto& it : symbols) { + if (it.size() == 1) { + re.push_back(it[0]); + } else { + re.push_back(static_cast('[')); + re.insert(re.end(), it.begin(), it.end()); + re.push_back(static_cast(']')); + } + } + EntryToken e; + e.setRegexp(re); + std::vector vec(1, e); + parname.swap(current_paradigm); + insertEntryTokens(vec); + parname.swap(current_paradigm); + } + EntryToken e; + e.setParadigm(parname); + elements.push_back(e); + } + else { + std::vector syms; + for (auto& it : symbols) { + syms.push_back(it[0]); + } + EntryToken e; + e.setSingleTransduction(syms, syms); + elements.push_back(e); + } + } + } + else if((!attribute.empty() && attribute != direction) || ignore == COMPILER_IGNORE_YES_VAL - || (!altval.empty() && altval != alt) - || (!varval.empty() && !variant.empty() && varval != variant) - || (direction == COMPILER_RESTRICTION_RL_VAL && !varl.empty() && varl != variant_left) - || (direction == COMPILER_RESTRICTION_LR_VAL && !varr.empty() && varr != variant_right)) + || !filterEntry(altval, alt, false) + || !filterEntry(varval, variant, true) + || 
(direction == COMPILER_RESTRICTION_RL_VAL && !filterEntry(varl, variant_left, false)) + || (direction == COMPILER_RESTRICTION_LR_VAL && !filterEntry(varr, variant_right, false))) { // parse to the end of the entry UString name; @@ -826,8 +899,6 @@ Compiler::procEntry() weight = StringUtils::stod(wsweight); } - std::vector elements; - if (entry_debugging && current_paradigm.empty()) { UString ln = "Line near "_u; ln += StringUtils::itoa(xmlTextReaderGetParserLineNumber(reader)); @@ -932,38 +1003,6 @@ Compiler::procEntry() } } -void -Compiler::procNodeACX() -{ - UString name = XMLParseUtil::readName(reader); - if(name == COMPILER_TEXT_NODE) - { - /* ignore */ - } - else if(name == COMPILER_ACX_ANALYSIS_ELEM) - { - /* ignore */ - } - else if(name == COMPILER_ACX_CHAR_ELEM) - { - acx_current_char = static_cast(attrib(COMPILER_ACX_VALUE_ATTR)[0]); - } - else if(name == COMPILER_ACX_EQUIV_CHAR_ELEM) - { - acx_map[acx_current_char].insert(static_cast(attrib(COMPILER_ACX_VALUE_ATTR)[0])); - } - else if(name == COMPILER_COMMENT_NODE) - { - /* ignore */ - } - else - { - std::cerr << "Error in ACX file (" << xmlTextReaderGetParserLineNumber(reader); - std::cerr << "): Invalid node '<" << name << ">'." << std::endl; - exit(EXIT_FAILURE); - } -} - void Compiler::procNode() { diff --git a/lttoolbox/compiler.h b/lttoolbox/compiler.h index 8acb9ab3..3c1dffb4 100644 --- a/lttoolbox/compiler.h +++ b/lttoolbox/compiler.h @@ -18,14 +18,12 @@ #define _MYCOMPILER_ #include -#include #include #include #include +#include -#include #include -#include #include #include @@ -97,6 +95,13 @@ class Compiler */ UString direction; + /** + * If this is set to true, attributes v, vl, vr, r, and alt + * insert special symbols to be filtered by lt-restrict rather than + * ignoring entries. 
+ */ + bool unified_compilation = false; + /** * List of characters to be considered alphabetic */ @@ -171,12 +176,7 @@ class Compiler /** * Mapping of aliases of characters specified in ACX files */ - std::map > acx_map; - - /** - * Original char being mapped - */ - int acx_current_char = 0; + std::map > acx_map; /** * LSX symbols @@ -187,22 +187,11 @@ class Compiler int32_t word_boundary_s = 0; int32_t word_boundary_ns = 0; - /* - static std::string range(char const a, char const b); - std::string readAlphabet(); - */ - /** * Method to parse an XML Node */ void procNode(); - /** - * Method to parse an XML Node in ACX files - */ - void procNodeACX(); - - /** * Parse the <alphabet> element */ @@ -223,6 +212,15 @@ class Compiler */ void procEntry(); + /** + * Return true if the filter (command line) is consistent with + * the value (attribute) and false otherwise + */ + bool filterEntry(const UString& value, const UString& filter, + bool keep_on_empty_filter); + void symbolFilters(const UString& value, const UString& prefix, + std::vector>& symbols); + /** * Parse the <re> element * @return a list of tokens from the dictionary's entry @@ -341,6 +339,7 @@ class Compiler LTTOOLBOX_IMPORTS static UString const COMPILER_RESTRICTION_ATTR; LTTOOLBOX_IMPORTS static UString const COMPILER_RESTRICTION_LR_VAL; LTTOOLBOX_IMPORTS static UString const COMPILER_RESTRICTION_RL_VAL; + LTTOOLBOX_IMPORTS static UString const COMPILER_RESTRICTION_U_VAL; LTTOOLBOX_IMPORTS static UString const COMPILER_PAIR_ELEM; LTTOOLBOX_IMPORTS static UString const COMPILER_LEFT_ELEM; LTTOOLBOX_IMPORTS static UString const COMPILER_RIGHT_ELEM; diff --git a/lttoolbox/entry_token.cc b/lttoolbox/entry_token.cc index 06da6203..0452e524 100644 --- a/lttoolbox/entry_token.cc +++ b/lttoolbox/entry_token.cc @@ -84,6 +84,13 @@ EntryToken::setRegexp(UString const &r) type = regexp; } +void +EntryToken::setRegexp(const std::vector& r) +{ + myregexp = r; + type = regexp; +} + void 
EntryToken::readRegexp(xmlTextReaderPtr reader) { diff --git a/lttoolbox/entry_token.h b/lttoolbox/entry_token.h index 62268474..ff3f7cf4 100644 --- a/lttoolbox/entry_token.h +++ b/lttoolbox/entry_token.h @@ -114,6 +114,7 @@ class EntryToken * @param r the regular expression specification. */ void setRegexp(UString const &r); + void setRegexp(const std::vector& r); /** * More efficient version of setRegexp() diff --git a/lttoolbox/expander.cc b/lttoolbox/expander.cc index a7629a9c..31c5ca22 100644 --- a/lttoolbox/expander.cc +++ b/lttoolbox/expander.cc @@ -17,19 +17,15 @@ #include #include -#include #include -#include #include #include #include -Expander::Expander() : -reader(0) +Expander::Expander() { - LtLocale::tryToSetLocale(); } Expander::~Expander() @@ -39,12 +35,7 @@ Expander::~Expander() void Expander::expand(std::string const &file, UFILE* output) { - reader = xmlReaderForFile(file.c_str(), NULL, 0); - if(reader == NULL) - { - std::cerr << "Error: Cannot open '" << file << "'." << std::endl; - exit(EXIT_FAILURE); - } + reader = XMLParseUtil::open_or_exit(file.c_str()); int ret = xmlTextReaderRead(reader); while(ret == 1) @@ -91,15 +82,7 @@ Expander::requireEmptyError(UString const &name) bool Expander::allBlanks() { - bool flag = true; - UString text = XMLParseUtil::readValue(reader); - - for(auto c : text) - { - flag = flag && isspace(c); - } - - return flag; + return XMLParseUtil::allBlanks(reader); } void diff --git a/lttoolbox/expander.h b/lttoolbox/expander.h index a87bd0ee..0bf27bc9 100644 --- a/lttoolbox/expander.h +++ b/lttoolbox/expander.h @@ -19,12 +19,11 @@ #include -#include #include #include #include -typedef std::list > EntList; +typedef std::vector > EntList; /** * An expander of dictionaries @@ -35,7 +34,7 @@ class Expander /** * The libxml2's XML reader */ - xmlTextReaderPtr reader; + xmlTextReaderPtr reader = nullptr; /** * The alt value @@ -178,8 +177,8 @@ class Expander * this method, the result of concatenations. 
* @param endings the endings to be appended. */ - static void append(std::list > &result, - std::list > const &endings); + static void append(EntList &result, + EntList const &endings); /** * Append a list of endings to a list of current transductions. @@ -187,7 +186,7 @@ class Expander * this method, the result of concatenations. * @param endings the endings to be appended. */ - static void append(std::list > &result, + static void append(EntList &result, UString const &endings); /** @@ -196,7 +195,7 @@ class Expander * this method, the result of concatenations. * @param endings the endings to be appended. */ - static void append(std::list > &result, + static void append(EntList &result, std::pair const &endings); public: diff --git a/lttoolbox/file_utils.cc b/lttoolbox/file_utils.cc index 98c0a006..d449fbf8 100644 --- a/lttoolbox/file_utils.cc +++ b/lttoolbox/file_utils.cc @@ -85,6 +85,14 @@ writeTransducerSet(FILE* output, const UString& letters, } } +void +writeTransducerSet(FILE* output, const std::set& letters, + Alphabet& alpha, + std::map& trans) +{ + writeTransducerSet(output, UString(letters.begin(), letters.end()), alpha, trans); +} + void readShared(FILE* input, std::set& letters, Alphabet& alpha) { diff --git a/lttoolbox/file_utils.h b/lttoolbox/file_utils.h index a2c81285..f32869b7 100644 --- a/lttoolbox/file_utils.h +++ b/lttoolbox/file_utils.h @@ -30,6 +30,9 @@ FILE* openInBinFile(const std::string& fname); void writeTransducerSet(FILE* output, const UString& letters, Alphabet& alpha, std::map& trans); +void writeTransducerSet(FILE* output, const std::set& letters, + Alphabet& alpha, + std::map& trans); void readTransducerSet(FILE* input, std::set& letters, Alphabet& alpha, std::map& trans); diff --git a/lttoolbox/lt_append.cc b/lttoolbox/lt_append.cc index df0be9a4..a8358fbe 100644 --- a/lttoolbox/lt_append.cc +++ b/lttoolbox/lt_append.cc @@ -15,106 +15,29 @@ * along with this program; if not, see . 
*/ #include -#include #include - -#include +#include #include - -#include #include -#include -#include -#include -#include - -void endProgram(char *name) -{ - if(name != NULL) - { - std::cout << basename(name) << " v" << PACKAGE_VERSION <<": add sections to a compiled transducer" << std::endl; - std::cout << "USAGE: " << basename(name) << " [-ksh] bin_file1 bin_file2 output_file" << std::endl; - std::cout << " -k, --keep: in case of section name conflicts, keep the one from the first transducer" << std::endl; - std::cout << " -s, --single: treat input transducers as one-sided" << std::endl; - std::cout << " -h, --help: print this message and exit" << std::endl; - } - exit(EXIT_FAILURE); -} - int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); - - bool pairs = true; - bool keep = false; - -#if HAVE_GETOPT_LONG - int option_index=0; -#endif - - while (true) { -#if HAVE_GETOPT_LONG - static struct option long_options[] = - { - {"keep", no_argument, 0, 'k'}, - {"single", no_argument, 0, 's'}, - {"help", no_argument, 0, 'h'}, - {0, 0, 0, 0} - }; - - int cnt=getopt_long(argc, argv, "ksh", long_options, &option_index); -#else - int cnt=getopt(argc, argv, "ksh"); -#endif - if (cnt==-1) - break; - - switch (cnt) - { - case 'k': - keep = true; - break; - - case 's': - pairs = false; - break; - - case 'h': - default: - endProgram(argv[0]); - break; - } - } - - std::string infile1; - std::string infile2; - std::string outfile; - switch(argc - optind) - { - case 1: - infile1 = argv[argc-1]; - break; - - case 2: - infile1 = argv[argc-2]; - infile2 = argv[argc-1]; - break; - - case 3: - infile1 = argv[argc-3]; - infile2 = argv[argc-2]; - outfile = argv[argc-1]; - break; - - default: - endProgram(argv[0]); - break; - } - - FILE* input1 = openInBinFile(infile1); - FILE* input2 = openInBinFile(infile2); - FILE* output = openOutBinFile(outfile); + CLI cli("add sections to a compiled transducer", PACKAGE_VERSION); + cli.add_bool_arg('k', "keep", "in case of section name 
conflicts, keep the one from the first transducer"); + cli.add_bool_arg('s', "single", "treat input transducers as one-sided"); + cli.add_bool_arg('h', "help", "print this message and exit"); + cli.add_file_arg("bin_file1", false); + cli.add_file_arg("bin_file2"); + cli.add_file_arg("output_file"); + cli.parse_args(argc, argv); + + bool pairs = !cli.get_bools()["single"]; + bool keep = cli.get_bools()["keep"]; + + FILE* input1 = openInBinFile(cli.get_files()[0]); + FILE* input2 = openInBinFile(cli.get_files()[1]); + FILE* output = openOutBinFile(cli.get_files()[2]); Alphabet alpha1, alpha2; std::set chars1, chars2; diff --git a/lttoolbox/lt_apply_acx.cc b/lttoolbox/lt_apply_acx.cc new file mode 100644 index 00000000..fa5d079b --- /dev/null +++ b/lttoolbox/lt_apply_acx.cc @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2022 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + LtLocale::tryToSetLocale(); + CLI cli("apply an ACX file to a compiled transducer", PACKAGE_VERSION); + cli.add_file_arg("input_file", false); + cli.add_file_arg("acx_file"); + cli.add_file_arg("output_file"); + cli.parse_args(argc, argv); + + FILE* input = openInBinFile(cli.get_files()[0]); + auto acx = readACX(cli.get_files()[1].c_str()); + FILE* output = openOutBinFile(cli.get_files()[2]); + + Alphabet alpha; + std::set letters; + std::map trans; + readTransducerSet(input, letters, alpha, trans); + + for (auto& it : trans) { + it.second.applyACX(alpha, acx); + } + + writeTransducerSet(output, letters, alpha, trans); + + fclose(input); + fclose(output); + return 0; +} diff --git a/lttoolbox/lt_comp.cc b/lttoolbox/lt_comp.cc index dad2cf98..145db6c2 100644 --- a/lttoolbox/lt_comp.cc +++ b/lttoolbox/lt_comp.cc @@ -17,12 +17,10 @@ #include #include #include +#include +#include -#include #include -#include -#include -#include /* * Error function that does nothing so that when we fallback from @@ -34,138 +32,58 @@ void errorFunc(void *ctx, const char *msg, ...) 
return; } -void endProgram(char *name) -{ - if(name != NULL) - { - std::cout << basename(name) << " v" << PACKAGE_VERSION <<": build a letter transducer from a dictionary" << std::endl; - std::cout << "USAGE: " << basename(name) << " [-hmvalrHSj] lr | rl dictionary_file output_file [acx_file]" << std::endl; -#if HAVE_GETOPT_LONG - std::cout << " -d, --debug: insert line numbers before each entry" << std::endl; - std::cout << " -m, --keep-boundaries: keep morpheme boundaries" << std::endl; - std::cout << " -v, --var: set language variant" << std::endl; - std::cout << " -a, --alt: set alternative (monodix)" << std::endl; - std::cout << " -l, --var-left: set left language variant (bidix)" << std::endl; - std::cout << " -r, --var-right: set right language variant (bidix)" << std::endl; - std::cout << " -H, --hfst: expect HFST symbols" << std::endl; - std::cout << " -S, --no-split: don't attempt to split into word and punctuation transducers" << std::endl; - std::cout << " -j, --jobs: use one cpu core per section when minimising, new section after 50k entries" << std::endl; -#else - std::cout << " -d: insert line numbers before each entry" << std::endl; - std::cout << " -m: keep morpheme boundaries" << std::endl; - std::cout << " -v: set language variant" << std::endl; - std::cout << " -a: set alternative (monodix)" << std::endl; - std::cout << " -l: set left language variant (bidix)" << std::endl; - std::cout << " -r: set right language variant (bidix)" << std::endl; - std::cout << " -H: expect HFST symbols" << std::endl; - std::cout << " -S: don't attempt to split into word and punctuation transducers" << std::endl; - std::cout << " -j: use one cpu core per section when minimising, new section after 50k entries" << std::endl; -#endif - std::cout << "Modes:" << std::endl; - std::cout << " lr: left-to-right compilation" << std::endl; - std::cout << " rl: right-to-left compilation" << std::endl; - } - exit(EXIT_FAILURE); -} - - int main(int argc, char *argv[]) { 
LtLocale::tryToSetLocale(); + CLI cli("build a letter transducer from a dictionary", PACKAGE_VERSION); + cli.add_bool_arg('d', "debug", "insert line numbers before each entry"); + cli.add_bool_arg('m', "keep-boundaries", "keep morpheme boundaries"); + cli.add_str_arg('v', "var", "set language variant", "VAR"); + cli.add_str_arg('a', "alt", "set alternative (monodix)", "ALT"); + cli.add_str_arg('l', "var-left", "set left language variant (bidix)", "VAR"); + cli.add_str_arg('r', "var-right", "set right language variant (bidix)", "VAR"); + cli.add_bool_arg('H', "hfst", "expect HFST symbols"); + cli.add_bool_arg('S', "no-split", "don't attempt to split into word and punctuation sections"); + cli.add_bool_arg('j', "jobs", "use one cpu core per section when minimising, new section after 50k entries"); + cli.add_bool_arg('V', "verbose", "compile verbosely"); + cli.add_bool_arg('h', "help", "print this message and exit"); + cli.add_file_arg("lr | rl | u", false); + cli.add_file_arg("dictionary_file", false); + cli.add_file_arg("output_file", false); + cli.add_file_arg("acx_file", true); + cli.parse_args(argc, argv); char ttype = 'x'; Compiler c; AttCompiler a; - c.setKeepBoundaries(false); - c.setVerbose(false); - c.setEntryDebugging(false); - -#if HAVE_GETOPT_LONG - int option_index=0; -#endif - - std::string vl; - std::string vr; - - while (true) { -#if HAVE_GETOPT_LONG - static struct option long_options[] = - { - {"alt", required_argument, 0, 'a'}, - {"var", required_argument, 0, 'v'}, - {"var-left", required_argument, 0, 'l'}, - {"var-right", required_argument, 0, 'r'}, - {"debug", no_argument, 0, 'd'}, - {"keep-boundaries", no_argument, 0, 'm'}, - {"hfst", no_argument, 0, 'H'}, - {"no-split", no_argument, 0, 'S'}, - {"help", no_argument, 0, 'h'}, - {"verbose", no_argument, 0, 'V'}, - {"jobs", no_argument, 0, 'j'}, - {0, 0, 0, 0} - }; - - int cnt=getopt_long(argc, argv, "a:v:l:r:dmHShVj", long_options, &option_index); -#else - int cnt=getopt(argc, argv, 
"a:v:l:r:dmHShV"); -#endif - if (cnt==-1) - break; - - switch (cnt) - { - case 'a': - c.setAltValue(to_ustring(optarg)); - break; - - case 'v': - c.setVariantValue(to_ustring(optarg)); - break; - - case 'l': - vl = optarg; - c.setVariantLeftValue(to_ustring(optarg)); - break; - - case 'r': - vr = optarg; - c.setVariantRightValue(to_ustring(optarg)); - break; - - case 'd': - c.setEntryDebugging(true); - break; - - case 'm': - c.setKeepBoundaries(true); - break; - - case 'H': - a.setHfstSymbols(true); - break; - - case 'S': - a.setSplitting(false); - break; - case 'j': - c.setJobs(true); - c.setMaxSectionEntries(50000); - break; + bool have_vl = false; + bool have_vr = false; + auto args = cli.get_strs(); + if (args.find("var") != args.end()) { + c.setVariantValue(to_ustring(args["var"][0].c_str())); + } + if (args.find("alt") != args.end()) { + c.setAltValue(to_ustring(args["alt"][0].c_str())); + } + if (args.find("var-left") != args.end()) { + have_vl = true; + c.setVariantLeftValue(to_ustring(args["var-left"][0].c_str())); + } + if (args.find("var-right") != args.end()) { + have_vr = true; + c.setVariantRightValue(to_ustring(args["var-right"][0].c_str())); + } - case 'V': - c.setVerbose(true); - break; + c.setEntryDebugging(cli.get_bools()["debug"]); + c.setKeepBoundaries(cli.get_bools()["keep-boundaries"]); + c.setVerbose(cli.get_bools()["verbose"]); - case 'h': - default: - endProgram(argv[0]); - break; - } - } + a.setHfstSymbols(cli.get_bools()["hfst"]); + a.setSplitting(!cli.get_bools()["no-split"]); auto LT_JOBS = std::getenv("LT_JOBS"); - if(LT_JOBS != NULL && LT_JOBS[0] != 'n') { + if(cli.get_bools()["jobs"] || (LT_JOBS != NULL && LT_JOBS[0] != 'n')) { c.setJobs(true); c.setMaxSectionEntries(50000); } @@ -177,30 +95,10 @@ int main(int argc, char *argv[]) c.setMaxSectionEntries(std::stol(max_section_entries)); } - std::string opc; - std::string infile; - std::string outfile; - std::string acxfile; - - switch(argc - optind + 1) - { - case 5: - opc = 
argv[argc-4]; - infile = argv[argc-3]; - outfile = argv[argc-2]; - acxfile = argv[argc-1]; - break; - - case 4: - opc = argv[argc-3]; - infile = argv[argc-2]; - outfile = argv[argc-1]; - break; - - default: - endProgram(argv[0]); - break; - } + std::string opc = cli.get_files()[0]; + std::string infile = cli.get_files()[1]; + std::string outfile = cli.get_files()[2]; + std::string acxfile = cli.get_files()[3]; xmlTextReaderPtr reader; reader = xmlReaderForFile(infile.c_str(), NULL, 0); @@ -226,10 +124,9 @@ int main(int argc, char *argv[]) if(opc == "lr") { - if(vr == "" && vl != "") - { + if (have_vr && !have_vl) { std::cout << "Error: -l specified, but mode is lr" << std::endl; - endProgram(argv[0]); + cli.print_usage(); } if(ttype == 'a') { @@ -237,7 +134,7 @@ int main(int argc, char *argv[]) } else { - if(acxfile != "") + if(!acxfile.empty()) { c.parseACX(acxfile, Compiler::COMPILER_RESTRICTION_LR_VAL); } @@ -246,10 +143,9 @@ int main(int argc, char *argv[]) } else if(opc == "rl") { - if(vl == "" && vr != "") - { + if (have_vl && !have_vr) { std::cout << "Error: -r specified, but mode is rl" << std::endl; - endProgram(argv[0]); + cli.print_usage(); } if(ttype == 'a') { @@ -260,17 +156,19 @@ int main(int argc, char *argv[]) c.parse(infile, Compiler::COMPILER_RESTRICTION_RL_VAL); } } + else if (opc == "u") { + if (ttype == 'a') { + a.parse(infile, false); + } else { + c.parse(infile, Compiler::COMPILER_RESTRICTION_U_VAL); + } + } else { - endProgram(argv[0]); + cli.print_usage(); } - FILE *output = fopen(outfile.c_str(), "wb"); - if(!output) - { - std::cerr << "Error: Cannot open file '" << outfile << "'." 
<< std::endl; - exit(EXIT_FAILURE); - } + FILE* output = openOutBinFile(outfile); if(ttype == 'a') { a.write(output); diff --git a/lttoolbox/lt_expand.cc b/lttoolbox/lt_expand.cc index be3c4f5c..c472b77c 100644 --- a/lttoolbox/lt_expand.cc +++ b/lttoolbox/lt_expand.cc @@ -18,122 +18,40 @@ #include #include #include - -#include -#include -#include -#include -#include - -void endProgram(char *name) -{ - if(name != NULL) - { - std::cout << basename(name) << " v" << PACKAGE_VERSION <<": expand the contents of a dictionary file" << std::endl; - std::cout << "USAGE: " << basename(name) << " [-mavlrh] dictionary_file [output_file]" << std::endl; -#if HAVE_GETOPT_LONG - std::cout << " -m, --keep-boundaries: keep morpheme boundaries" << std::endl; - std::cout << " -v, --var: set language variant" << std::endl; - std::cout << " -a, --alt: set alternative (monodix)" << std::endl; - std::cout << " -l, --var-left: set left language variant (bidix)" << std::endl; - std::cout << " -r, --var-right: set right language variant (bidix)" << std::endl; -#else - std::cout << " -m: keep morpheme boundaries" << std::endl; - std::cout << " -v: set language variant" << std::endl; - std::cout << " -a: set alternative (monodix)" << std::endl; - std::cout << " -l: set left language variant (bidix)" << std::endl; - std::cout << " -r: set right language variant (bidix)" << std::endl; -#endif - } - exit(EXIT_FAILURE); -} +#include int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); + CLI cli("expand the contents of a dictionary file", PACKAGE_VERSION); + cli.add_bool_arg('m', "keep-boundaries", "keep morpheme boundaries"); + cli.add_str_arg('v', "var", "set language variant", "VAR"); + cli.add_str_arg('a', "alt", "set alternative (monodix)", "ALT"); + cli.add_str_arg('l', "var-left", "set left language variant (bidix)", "VAR"); + cli.add_str_arg('r', "var-right", "set right language variant (bidix)", "VAR"); + cli.add_file_arg("dictionary_file", false); + 
cli.add_file_arg("output_file"); + cli.parse_args(argc, argv); - FILE* input = NULL; - UFILE* output = NULL; Expander e; - e.setKeepBoundaries(false); - -#if HAVE_GETOPT_LONG - int option_index=0; -#endif - - while (true) { -#if HAVE_GETOPT_LONG - static struct option long_options[] = - { - {"keep-boundaries", no_argument, 0, 'm'}, - {"alt", required_argument, 0, 'a'}, - {"var", required_argument, 0, 'v'}, - {"var-left", required_argument, 0, 'l'}, - {"var-right", required_argument, 0, 'r'}, - {"help", no_argument, 0, 'h'}, - {0, 0, 0, 0} - }; - - int cnt=getopt_long(argc, argv, "a:v:l:r:mh", long_options, &option_index); -#else - int cnt=getopt(argc, argv, "a:v:l:r:mh"); -#endif - if (cnt==-1) - break; - - switch (cnt) - { - case 'a': - e.setAltValue(to_ustring(optarg)); - break; - - case 'v': - e.setVariantValue(to_ustring(optarg)); - break; - - case 'l': - e.setVariantLeftValue(to_ustring(optarg)); - break; - - case 'm': - e.setKeepBoundaries(true); - break; - - case 'r': - e.setVariantRightValue(to_ustring(optarg)); - break; - - case 'h': - default: - endProgram(argv[0]); - break; - } + e.setKeepBoundaries(cli.get_bools()["keep-boundaries"]); + auto args = cli.get_strs(); + if (args.find("var") != args.end()) { + e.setVariantValue(to_ustring(args["var"][0].c_str())); } - - std::string infile; - std::string outfile; - - switch(argc - optind + 1) - { - case 2: - infile = argv[argc-1]; - break; - - case 3: - infile = argv[argc-2]; - outfile = argv[argc-1]; - break; - - default: - endProgram(argv[0]); - break; + if (args.find("alt") != args.end()) { + e.setAltValue(to_ustring(args["alt"][0].c_str())); + } + if (args.find("var-left") != args.end()) { + e.setVariantLeftValue(to_ustring(args["var-left"][0].c_str())); + } + if (args.find("var-right") != args.end()) { + e.setVariantRightValue(to_ustring(args["var-right"][0].c_str())); } - input = openInBinFile(infile); - fclose(input); - output = openOutTextFile(outfile); + UFILE* output = 
openOutTextFile(cli.get_files()[1]); - e.expand(infile, output); + e.expand(cli.get_files()[0], output); u_fclose(output); return EXIT_SUCCESS; diff --git a/lttoolbox/lt_invert.cc b/lttoolbox/lt_invert.cc new file mode 100644 index 00000000..bbe55848 --- /dev/null +++ b/lttoolbox/lt_invert.cc @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2022 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + LtLocale::tryToSetLocale(); + + CLI cli("reverse the direction of a compiled transducer", PACKAGE_VERSION); + cli.add_bool_arg('h', "help", "print this message and exit"); + cli.add_file_arg("in_bin"); + cli.add_file_arg("out_bin"); + cli.parse_args(argc, argv); + + FILE* input = openInBinFile(cli.get_files()[0]); + FILE* output = openOutBinFile(cli.get_files()[1]); + + Alphabet alphabet; + std::set alphabetic_chars; + std::map transducers; + readTransducerSet(input, alphabetic_chars, alphabet, transducers); + + for (auto& it : transducers) { + it.second.invert(alphabet); + } + + writeTransducerSet(output, alphabetic_chars, alphabet, transducers); + + fclose(input); + fclose(output); + return EXIT_SUCCESS; +} diff --git a/lttoolbox/lt_paradigm.cc b/lttoolbox/lt_paradigm.cc index 0c276ab9..165c44db 100644 --- a/lttoolbox/lt_paradigm.cc +++ b/lttoolbox/lt_paradigm.cc @@ -20,21 +20,10 @@ #include #include #include +#include 
-#include -#include -#include #include -void endProgram(char* name) -{ - std::cout << basename(name) << ": generate listings from a compiled transducer" << std::endl; - std::cout << "Usage: " << basename(name) << " [ -a ] FST [ input [ output ] ]" << std::endl; - std::cout << " -a, --analyser: FST is an analyser (tags on the right)" << std::endl; - std::cout << " -h, --help: Print this help and exit" << std::endl; - exit(EXIT_FAILURE); -} - void expand(Transducer& inter, int state, const std::set& past_states, const std::vector& syms, const Alphabet& alpha, UFILE* out, std::set>& outset) @@ -112,59 +101,26 @@ void process(const UString& pattern, std::map& trans, int main(int argc, char* argv[]) { LtLocale::tryToSetLocale(); - - bool should_invert = true; - bool sort = false; + CLI cli("generate listings from a compiled transducer", PACKAGE_VERSION); + cli.add_bool_arg('a', "analyser", "FST is an analyser (tags on the right)"); + cli.add_str_arg('e', "exclude", "disregard paths containing TAG", "TAG"); + cli.add_bool_arg('s', "sort", "alphabetize the paths for each pattern"); + cli.add_bool_arg('z', "null-flush", "flush output on \\0"); + cli.add_bool_arg('h', "help", "show this help and exit"); + cli.add_file_arg("FST", false); + cli.add_file_arg("input"); + cli.add_file_arg("output"); + cli.parse_args(argc, argv); + + bool should_invert = !cli.get_bools()["analyser"]; + bool sort = cli.get_bools()["sort"]; std::set skip_tags; - -#if HAVE_GETOPT_LONG - static struct option long_options[] = - { - {"analyser", 0, 0, 'a'}, - {"exclude", 1, 0, 'e'}, - {"sort", 0, 0, 's'}, - {"null-flush", 0, 0, 'z'}, - {"help", 0, 0, 'h'}, - {0,0,0,0} - }; -#endif - - while (true) { -#if HAVE_GETOPT_LONG - int c = getopt_long(argc, argv, "ae:szh", long_options, &optind); -#else - int c = getopt(argc, argv, "ae:szh"); -#endif - if (c == -1) break; - - switch (c) { - case 'a': - should_invert = false; - break; - - case 'e': - skip_tags.insert(to_ustring(optarg)); - break; - - case 's': - 
sort = true; - break; - - case 'z': // no-op - break; - - case 'h': - default: - endProgram(argv[0]); - break; - } + for (auto& it : cli.get_strs()["exclude"]) { + skip_tags.insert(to_ustring(it.c_str())); } - if (optind == argc) { - std::cerr << "Transducer file is required." << std::endl; - exit(EXIT_FAILURE); - } - FILE* fst = openInBinFile(argv[optind++]); + FILE* fst = openInBinFile(cli.get_files()[0]); + std::set letters; Alphabet alpha; std::map trans; @@ -189,13 +145,10 @@ int main(int argc, char* argv[]) } InputFile input; - UFILE* output = u_finit(stdout, NULL, NULL); - if (optind < argc) { - input.open_or_exit(argv[optind++]); - } - if (optind < argc) { - output = openOutTextFile(argv[optind++]); + if (!cli.get_files()[1].empty()) { + input.open_or_exit(cli.get_files()[1].c_str()); } + UFILE* output = openOutTextFile(cli.get_files()[2]); UString cur; do { diff --git a/lttoolbox/lt_print.cc b/lttoolbox/lt_print.cc index f83288d8..6619530b 100644 --- a/lttoolbox/lt_print.cc +++ b/lttoolbox/lt_print.cc @@ -15,98 +15,26 @@ * along with this program; if not, see . 
*/ #include -#include #include - -#include +#include #include -#include -#include -#include -#include -#include -#include - -void endProgram(char *name) -{ - if(name != NULL) - { - std::cout << basename(name) << " v" << PACKAGE_VERSION <<": dump a transducer to text in ATT format" << std::endl; - std::cout << "USAGE: " << basename(name) << " [-aHh] bin_file [output_file] " << std::endl; - std::cout << " -a, --alpha: print transducer alphabet" << std::endl; - std::cout << " -H, --hfst: use HFST-compatible character escapes" << std::endl; - std::cout << " -h, --help: print this message and exit" << std::endl; - } - exit(EXIT_FAILURE); -} - - int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); - - bool alpha = false; - bool hfst = false; - -#if HAVE_GETOPT_LONG - int option_index=0; -#endif - - while (true) { -#if HAVE_GETOPT_LONG - static struct option long_options[] = - { - {"alpha", no_argument, 0, 'a'}, - {"hfst", no_argument, 0, 'H'}, - {"help", no_argument, 0, 'h'}, - {0, 0, 0, 0} - }; - - int cnt=getopt_long(argc, argv, "aHh", long_options, &option_index); -#else - int cnt=getopt(argc, argv, "aHh"); -#endif - if (cnt==-1) - break; - - switch (cnt) - { - case 'a': - alpha = true; - break; - - case 'H': - hfst = true; - break; - - case 'h': - default: - endProgram(argv[0]); - break; - } - } - - std::string infile; - std::string outfile; - switch(argc - optind) - { - case 1: - infile = argv[argc-1]; - break; - - case 2: - infile = argv[argc-2]; - outfile = argv[argc-1]; - break; - - default: - endProgram(argv[0]); - break; - } - - FILE* input = openInBinFile(infile); - UFILE* output = openOutTextFile(outfile); + CLI cli("dump a transducer to text in ATT format", PACKAGE_VERSION); + cli.add_bool_arg('a', "alpha", "print transducer alphabet"); + cli.add_bool_arg('H', "hfst", "use HFST-compatible character escapes"); + cli.add_bool_arg('h', "help", "print this message and exit"); + cli.add_file_arg("bin_file"); + cli.add_file_arg("output_file"); + 
cli.parse_args(argc, argv); + + bool alpha = cli.get_bools()["alpha"]; + bool hfst = cli.get_bools()["hfst"]; + + FILE* input = openInBinFile(cli.get_files()[0]); + UFILE* output = openOutTextFile(cli.get_files()[1]); Alphabet alphabet; std::set alphabetic_chars; diff --git a/lttoolbox/lt_proc.cc b/lttoolbox/lt_proc.cc index a227a64c..9b2af639 100644 --- a/lttoolbox/lt_proc.cc +++ b/lttoolbox/lt_proc.cc @@ -16,74 +16,9 @@ */ #include #include -#include +#include #include -#include -#include -#include -#include - -void endProgram(char *name) -{ - std::cout << basename(name) << ": process a stream with a letter transducer" << std::endl; - std::cout << "USAGE: " << basename(name) << " [ -a | -b | -c | -d | -e | -g | -n | -p | -x | -s | -t | -v | -h | -z | -w ] [-W] [-N N] [-L N] [ -i icx_file ] [ -r rcx_file ] fst_file [input_file [output_file]]" << std::endl; - std::cout << "Options:" << std::endl; -#if HAVE_GETOPT_LONG - std::cout << " -a, --analysis: morphological analysis (default behavior)" << std::endl; - std::cout << " -b, --bilingual: lexical transfer" << std::endl; - std::cout << " -c, --case-sensitive: use the literal case of the incoming characters" << std::endl; - std::cout << " -d, --debugged-gen morph. generation with all the stuff" << std::endl; - std::cout << " -e, --decompose-nouns: Try to decompound unknown words" << std::endl; - std::cout << " -g, --generation: morphological generation" << std::endl; - std::cout << " -i, --ignored-chars: specify file with characters to ignore" << std::endl; - std::cout << " -r, --restore-chars: specify file with characters to diacritic restoration" << std::endl; - std::cout << " -l, --tagged-gen: morphological generation keeping lexical forms" << std::endl; - std::cout << " -m, --tagged-nm-gen: same as -l but without unknown word marks" << std::endl; - std::cout << " -n, --non-marked-gen morph. 
generation without unknown word marks" << std::endl; - std::cout << " -o, --surf-bilingual: lexical transfer with surface forms" << std::endl; - std::cout << " -p, --post-generation: post-generation" << std::endl; - std::cout << " -x, --inter-generation: inter-generation" << std::endl; - std::cout << " -s, --sao: SAO annotation system input processing" << std::endl; - std::cout << " -t, --transliteration: apply transliteration dictionary" << std::endl; - std::cout << " -v, --version: version" << std::endl; - std::cout << " -z, --null-flush: flush output on the null character " << std::endl; - std::cout << " -w, --dictionary-case: use dictionary case instead of surface case" << std::endl; - std::cout << " -C, --careful-case: use dictionary case if present, else surface" << std::endl; - std::cout << " -I, --no-default-ignore: skips loading the default ignore characters" << std::endl; - std::cout << " -W, --show-weights: Print final analysis weights (if any)" << std::endl; - std::cout << " -N, --analyses: Output no more than N analyses (if the transducer is weighted, the N best analyses)" << std::endl; - std::cout << " -L, --weight-classes: Output no more than N best weight classes (where analyses with equal weight constitute a class)" << std::endl; - std::cout << " -h, --help: show this help" << std::endl; -#else - std::cout << " -a: morphological analysis (default behavior)" << std::endl; - std::cout << " -b: lexical transfer" << std::endl; - std::cout << " -c: use the literal case of the incoming characters" << std::endl; - std::cout << " -d: morph. 
generation with all the stuff" << std::endl; - std::cout << " -e: try to decompose unknown words as compounds" << std::endl; - std::cout << " -g: morphological generation" << std::endl; - std::cout << " -i: specify file with characters to ignore" << std::endl; - std::cout << " -r: specify file with characters to diacritic restoration" << std::endl; - std::cout << " -l: morphological generation keeping lexical forms" << std::endl; - std::cout << " -n: morph. generation without unknown word marks" << std::endl; - std::cout << " -o: lexical transfer with surface forms" << std::endl; - std::cout << " -p: post-generation" << std::endl; - std::cout << " -x: inter-generation" << std::endl; - std::cout << " -s: SAO annotation system input processing" << std::endl; - std::cout << " -t: apply transliteration dictionary" << std::endl; - std::cout << " -v: version" << std::endl; - std::cout << " -z: flush output on the null character " << std::endl; - std::cout << " -C: use dictionary case if present, else surface" << std::endl; - std::cout << " -W: Print final analysis weights (if any)" << std::endl; - std::cout << " -N: Output no more than N analyses" << std::endl; - std::cout << " -L: Output no more than N best weight classes" << std::endl; - std::cout << " -I: skips loading the default ignore characters" << std::endl; - std::cout << " -w: use dictionary case instead of surface case" << std::endl; - std::cout << " -h: show this help" << std::endl; -#endif - exit(EXIT_FAILURE); -} - void checkValidity(FSTProcessor const &fstp) { if(!fstp.valid()) @@ -96,200 +31,142 @@ int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); - int cmd = 0; - int maxAnalyses; - int maxWeightClasses; - FSTProcessor fstp; - -#if HAVE_GETOPT_LONG - static struct option long_options[]= - { - {"analysis", 0, 0, 'a'}, - {"bilingual", 0, 0, 'b'}, - {"surf-bilingual", 0, 0, 'o'}, - {"generation", 0, 0, 'g'}, - {"ignored-chars", 1, 0, 'i'}, - {"restore-chars", 1, 0, 'r'}, - {"non-marked-gen", 0, 
0, 'n'}, - {"debugged-gen", 0, 0, 'd'}, - {"tagged-gen", 0, 0, 'l'}, - {"tagged-nm-gen", 0, 0, 'm'}, - {"post-generation", 0, 0, 'p'}, - {"inter-generation", 0, 0, 'x'}, - {"sao", 0, 0, 's'}, - {"transliteration", 0, 0, 't'}, - {"null-flush", 0, 0, 'z'}, - {"dictionary-case", 0, 0, 'w'}, - {"version", 0, 0, 'v'}, - {"case-sensitive", 0, 0, 'c'}, - {"careful-case", 0, 0, 'C'}, - {"no-default-ignore", 0, 0, 'I'}, - {"show-weights", 0, 0, 'W'}, - {"analyses", 1, 0, 'N'}, - {"weight-classes", 1, 0, 'L'}, - {"help", 0, 0, 'h'} - }; -#endif + CLI cli("process a stream with a letter transducer", PACKAGE_VERSION); + cli.add_file_arg("fst_file", false); + cli.add_file_arg("input_file"); + cli.add_file_arg("output_file"); + cli.add_bool_arg('a', "analysis", "morphological analysis (default behavior)"); + cli.add_bool_arg('b', "bilingual", "lexical transfer"); + cli.add_bool_arg('c', "case-sensitive", "use the literal case of the incoming characters"); + cli.add_bool_arg('d', "debugged-gen", "morph. generation with all the stuff"); + cli.add_bool_arg('e', "decompose-nouns", "Try to decompound unknown words"); + cli.add_bool_arg('g', "generation", "morphological generation"); + cli.add_str_arg('i', "ignored-chars", "specify file with characters to ignore", "icx_file"); + cli.add_str_arg('r', "restore-chars", "specify file with characters to diacritic restoration", "rcx_file"); + cli.add_bool_arg('l', "tagged-gen", "morphological generation keeping lexical forms"); + cli.add_bool_arg('m', "tagged-nm-gen", "same as -l but without unknown word marks"); + cli.add_bool_arg('n', "non-marked-gen", "morph. 
generation without unknown word marks"); + cli.add_bool_arg('o', "surf-bilingual", "lexical transfer with surface forms"); + cli.add_bool_arg('p', "post-generation", "post-generation"); + cli.add_bool_arg('x', "inter-generation", "inter-generation"); + cli.add_bool_arg('s', "sao", "SAO annotation system input processing"); + cli.add_bool_arg('t', "transliteration", "apply transliteration dictionary"); + cli.add_bool_arg('v', "version", "version"); + cli.add_bool_arg('z', "null-flush", "flush output on the null character"); + cli.add_bool_arg('w', "dictionary-case", "use dictionary case instead of surface"); + cli.add_bool_arg('C', "careful-case", "use dictionary case if present, else surface"); + cli.add_bool_arg('I', "no-default-ignore", "skips loading the default ignore characters"); + cli.add_bool_arg('W', "show-weights", "Print final analysis weights (if any)"); + cli.add_str_arg('N', "analyses", "Output no more than N analyses (if the transducer is weighted, the N best analyses)", "N"); + cli.add_str_arg('L', "weight-classes", "Output no more than N best weight classes (where analyses with equal weight constitute a class)", "N"); + cli.add_bool_arg('h', "help", "show this help"); + cli.parse_args(argc, argv); + FSTProcessor fstp; GenerationMode bilmode = gm_unknown; - // more than one option sets generation mode, but -gb also sets gm_unknown - bool really_g = false; - while(true) - { -#if HAVE_GETOPT_LONG - int option_index; - int c = getopt_long(argc, argv, "abcegi:r:lmndopxstzwvCIWN:L:h", long_options, &option_index); -#else - int c = getopt(argc, argv, "abcegi:r:lmndopxstzwvCIWN:L:h"); -#endif - - if(c == -1) - { - break; - } - - switch(c) - { - case 'c': - fstp.setCaseSensitiveMode(true); - break; - - case 'i': - fstp.setIgnoredChars(true); - fstp.parseICX(optarg); - break; - - case 'r': - fstp.setRestoreChars(true); - fstp.parseRCX(optarg); - fstp.setUseDefaultIgnoredChars(false); - break; + char cmd = 0; - case 'I': - 
fstp.setUseDefaultIgnoredChars(false); - break; - - case 'W': - fstp.setDisplayWeightsMode(true); - break; - - case 'N': - maxAnalyses = atoi(optarg); - if (maxAnalyses < 1) - { - std::cerr << "Invalid or no argument for analyses count" << std::endl; - exit(EXIT_FAILURE); - } - fstp.setMaxAnalysesValue(maxAnalyses); - break; - - case 'L': - maxWeightClasses = atoi(optarg); - if (maxWeightClasses < 1) - { - std::cerr << "Invalid or no argument for weight class count" << std::endl; - exit(EXIT_FAILURE); - } - fstp.setMaxWeightClassesValue(maxWeightClasses); - break; - - case 'e': - case 'a': - case 'b': - case 'o': - case 'g': - case 'p': - case 'x': - case 't': - case 's': - if(cmd == 0) - { - cmd = c; - if (cmd == 'g') really_g = true; - } - else if(cmd == 'g' && c == 'b') { - // "lt-proc -g -b generador.bin" should run biltrans, keeping unknown-marks - if (really_g) bilmode = gm_unknown; - cmd = 'b'; - } - else - { - endProgram(argv[0]); - } - break; - - case 'd': - if (cmd == 0) cmd = 'g'; - bilmode = gm_all; - break; - - case 'l': - if (cmd == 0) cmd = 'g'; - bilmode = gm_tagged; - break; - - case 'm': - if (cmd == 0) cmd = 'g'; - bilmode = gm_tagged_nm; - break; - - case 'n': - if (cmd == 0) cmd = 'g'; - bilmode = gm_clean; - break; - - case 'C': - if (cmd == 0) cmd = 'g'; - bilmode = gm_carefulcase; - break; - - case 'z': - fstp.setNullFlush(true); - break; + auto args = cli.get_bools(); + if (args["analysis"]) { + cmd = 'a'; + } + if (args["bilingual"]) { + if (cmd) cli.print_usage(); + cmd = 'b'; + } + if (args["surf-bilingual"]) { + if (cmd && cmd != 'b') cli.print_usage(); + cmd = 'b'; + fstp.setBiltransSurfaceForms(true); + } + if (args["generation"]) { + if (cmd && cmd != 'b') cli.print_usage(); + cmd = 'g'; + } + if (args["decompose-nouns"]) { + if (cmd) cli.print_usage(); + cmd = 'e'; + } + if (args["post-generation"]) { + if (cmd) cli.print_usage(); + cmd = 'p'; + } + if (args["inter-generation"] || args["transliteration"]) { + if (cmd) 
cli.print_usage(); + cmd = 't'; + } + if (args["sao"]) { + if (cmd) cli.print_usage(); + cmd = 's'; + } - case 'w': - fstp.setDictionaryCaseMode(true); - break; + if (args["debugged-gen"]) { + if (!cmd) cmd = 'g'; + bilmode = gm_all; + } + if (args["tagged-gen"]) { + if (!cmd) cmd = 'g'; + bilmode = gm_tagged; + } + if (args["tagged-nm-gen"]) { + if (!cmd) cmd = 'g'; + bilmode = gm_tagged_nm; + } + if (args["non-marked-gen"]) { + if (!cmd) cmd = 'g'; + bilmode = gm_clean; + } + if (args["careful-case"]) { + if (!cmd) cmd = 'g'; + bilmode = gm_carefulcase; + } - case 'v': - std::cout << basename(argv[0]) << " version " << PACKAGE_VERSION << std::endl; - exit(EXIT_SUCCESS); - break; - case 'h': - default: - endProgram(argv[0]); - break; + fstp.setCaseSensitiveMode(cli.get_bools()["case-sensitive"]); + fstp.setUseDefaultIgnoredChars(!cli.get_bools()["no-default-ignore"]); + fstp.setDisplayWeightsMode(cli.get_bools()["show-weights"]); + fstp.setNullFlush(cli.get_bools()["null-flush"]); + fstp.setDictionaryCaseMode(cli.get_bools()["dictionary-case"]); + + auto strs = cli.get_strs(); + if (strs.find("ignored-chars") != strs.end()) { + fstp.setIgnoredChars(true); + for (auto& it : strs["ignored-chars"]) { + fstp.parseICX(it); } } - - InputFile input; - UFILE* output = u_finit(stdout, NULL, NULL); - - if(optind == (argc - 3)) - { - FILE* in = openInBinFile(argv[optind]); - input.open_or_exit(argv[optind+1]); - output = openOutTextFile(argv[optind+2]); - fstp.load(in); - fclose(in); + if (strs.find("restore-chars") != strs.end()) { + fstp.setRestoreChars(true); + fstp.setUseDefaultIgnoredChars(false); + for (auto& it : strs["restore-chars"]) { + fstp.parseRCX(it); + } } - else if(optind == (argc -2)) - { - FILE* in = openInBinFile(argv[optind]); - input.open_or_exit(argv[optind+1]); - fstp.load(in); - fclose(in); + if (strs.find("analyses") != strs.end()) { + int n = atoi(strs["analyses"].back().c_str()); + if (n < 1) { + std::cerr << "Invalid or no argument for analyses 
count" << std::endl; + exit(EXIT_FAILURE); + } + fstp.setMaxAnalysesValue(n); } - else if(optind == (argc - 1)) - { - FILE* in = openInBinFile(argv[optind]); - fstp.load(in); - fclose(in); + if (strs.find("weight-classes") != strs.end()) { + int n = atoi(strs["weight-classes"].back().c_str()); + if (n < 1) { + std::cerr << "Invalid or no argument for weight class count" << std::endl; + exit(EXIT_FAILURE); + } + fstp.setMaxWeightClassesValue(n); } - else - { - endProgram(argv[0]); + + FILE* in = openInBinFile(cli.get_files()[0]); + fstp.load(in); + fclose(in); + + InputFile input; + if (!cli.get_files()[1].empty()) { + input.open_or_exit(cli.get_files()[1].c_str()); } + UFILE* output = openOutTextFile(cli.get_files()[2]); try { @@ -307,12 +184,6 @@ int main(int argc, char *argv[]) fstp.postgeneration(input, output); break; - case 'x': - fstp.initPostgeneration(); - checkValidity(fstp); - fstp.intergeneration(input, output); - break; - case 's': fstp.initAnalysis(); checkValidity(fstp); @@ -325,13 +196,6 @@ int main(int argc, char *argv[]) fstp.transliteration(input, output); break; - case 'o': - fstp.initBiltrans(); - checkValidity(fstp); - fstp.setBiltransSurfaceForms(true); - fstp.bilingual(input, output, bilmode); - break; - case 'b': fstp.initBiltrans(); checkValidity(fstp); diff --git a/lttoolbox/lt_restrict.cc b/lttoolbox/lt_restrict.cc new file mode 100644 index 00000000..987afa09 --- /dev/null +++ b/lttoolbox/lt_restrict.cc @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2022 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include + +void get_symbol(const std::string& s, Alphabet& alpha, const char* prefix, + sorted_vector& vec) +{ + UString t; + t += '<'; + t += to_ustring(prefix); + t += ':'; + t += to_ustring(s.c_str()); + t += '>'; + if (alpha.isSymbolDefined(t)) { + vec.insert(alpha(alpha(t), alpha(t))); + } +} + +int main(int argc, char* argv[]) +{ + LtLocale::tryToSetLocale(); + CLI cli("remove paths from a transducer", PACKAGE_VERSION); + cli.add_bool_arg('m', "minimise", "minimise transducers after deleting paths"); + cli.add_str_arg('v', "var", "set language variant", "VAR"); + cli.add_str_arg('a', "alt", "set alternative (monodix)", "ALT"); + cli.add_str_arg('l', "var-left", "set left language variant (bidix)", "VAR"); + cli.add_str_arg('r', "var-right", "set right language variant (bidix)", "VAR"); + cli.add_file_arg("lr | rl", false); + cli.add_file_arg("input_file"); + cli.add_file_arg("output_file"); + cli.parse_args(argc, argv); + + std::string dir = cli.get_files()[0]; + if (dir == "lr") dir = "LR"; + else if (dir == "rl") dir = "RL"; + FILE* input = openInBinFile(cli.get_files()[1]); + FILE* output = openOutBinFile(cli.get_files()[2]); + + Alphabet alpha; + std::set letters; + std::map trans; + readTransducerSet(input, letters, alpha, trans); + + sorted_vector keep; + sorted_vector drop; + bool has_var = false; + get_symbol(dir, alpha, "r", keep); + for (auto& it : cli.get_strs()["var"]) { + get_symbol(it, alpha, "v", keep); + has_var = true; + } + for (auto& it : cli.get_strs()["alt"]) { + get_symbol(it, alpha, "alt", keep); + } + for (auto& it : cli.get_strs()["var-left"]) { + get_symbol(it, alpha, "vl", keep); + } + for (auto& it : cli.get_strs()["var-right"]) { + get_symbol(it, alpha, "vr", keep); + } + + for (int32_t i = 1; i <= alpha.size(); i++) { + 
UString t; + alpha.getSymbol(t, -i); + if (StringUtils::startswith(t, ". */ #include -#include #include - -#include -#include -#include -#include - - -[[noreturn]] -void endProgram(char *name) -{ - std::cout << basename(name) << ": process a stream with a letter transducer" << std::endl; - std::cout << "USAGE: " << basename(name) << " fst_file [input_file [output_file]]" << std::endl; - exit(EXIT_FAILURE); -} - -void checkValidity(FSTProcessor const &fstp) -{ - if(!fstp.valid()) - { - exit(EXIT_FAILURE); - } -} +#include +#include int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); + CLI cli("process a stream with a letter transducer"); + cli.add_file_arg("fst_file", false); + cli.add_file_arg("input_file"); + cli.add_file_arg("output_file"); + cli.parse_args(argc, argv); - InputFile input; - UFILE* output = u_finit(stdout, NULL, NULL); FSTProcessor fstp; - FILE *aux; + FILE* aux = openInBinFile(cli.get_files()[0]); + fstp.load(aux); + fclose(aux); + fstp.initTMAnalysis(); + if (!fstp.valid()) { + return EXIT_FAILURE; + } - switch(argc) - { - case 4: - output = u_fopen(argv[3], "wb", NULL, NULL); - if(!output) - { - endProgram(argv[0]); - } - // follow - case 3: - if (!input.open(argv[2])) { - endProgram(argv[0]); - } - // follow - case 2: - aux = fopen(argv[1], "rb"); - if(!aux) - { - endProgram(argv[0]); - } - fstp.load(aux); - fclose(aux); - break; - default: - endProgram(argv[0]); - break; + InputFile input; + if (!cli.get_files()[1].empty()) { + input.open_or_exit(cli.get_files()[1].c_str()); } + UFILE* output = openOutTextFile(cli.get_files()[2].c_str()); - fstp.initTMAnalysis(); - checkValidity(fstp); fstp.tm_analysis(input, output); u_fclose(output); diff --git a/lttoolbox/lt_trim.cc b/lttoolbox/lt_trim.cc index 6052f873..17271f39 100644 --- a/lttoolbox/lt_trim.cc +++ b/lttoolbox/lt_trim.cc @@ -16,22 +16,9 @@ */ #include #include - +#include #include - -#include #include -#include - -void endProgram(char *name) -{ - if(name != NULL) - { - 
std::cout << basename(name) << " v" << PACKAGE_VERSION <<": trim a transducer to another transducer" << std::endl; - std::cout << "USAGE: " << basename(name) << " analyser_bin_file bidix_bin_file trimmed_bin_file " << std::endl; - } - exit(EXIT_FAILURE); -} void trim(FILE* file_mono, FILE* file_bi, FILE* file_out) @@ -92,23 +79,22 @@ trim(FILE* file_mono, FILE* file_bi, FILE* file_out) exit(EXIT_FAILURE); } - writeTransducerSet(file_out, UString(letters_mono.begin(), letters_mono.end()), - alph_mono, trans_trim); + writeTransducerSet(file_out, letters_mono, alph_mono, trans_trim); } int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); - - if(argc != 4) - { - endProgram(argv[0]); - } - - FILE* analyser = openInBinFile(argv[1]); - FILE* bidix = openInBinFile(argv[2]); - FILE* output = openOutBinFile(argv[3]); + CLI cli("trim a transducer to another transducer", PACKAGE_VERSION); + cli.add_file_arg("analyser_bin_file", false); + cli.add_file_arg("bidix_bin_file"); + cli.add_file_arg("trimmed_bin_file"); + cli.parse_args(argc, argv); + + FILE* analyser = openInBinFile(cli.get_files()[0]); + FILE* bidix = openInBinFile(cli.get_files()[1]); + FILE* output = openOutBinFile(cli.get_files()[2]); trim(analyser, bidix, output); diff --git a/lttoolbox/regexp_compiler.cc b/lttoolbox/regexp_compiler.cc index 629acba6..18c7beec 100644 --- a/lttoolbox/regexp_compiler.cc +++ b/lttoolbox/regexp_compiler.cc @@ -18,21 +18,16 @@ #include #include +#include -RegexpCompiler::RegexpCompiler() : -token(0), -index(0), -alphabet(0), -state(0), -letter(0), -postop(0), -default_weight(0.0000) +#define FIN_FICHERO INT_MAX + +RegexpCompiler::RegexpCompiler() { } RegexpCompiler::~RegexpCompiler() { - destroy(); } RegexpCompiler::RegexpCompiler(RegexpCompiler const &rec) @@ -45,7 +40,6 @@ RegexpCompiler::operator =(RegexpCompiler const &rec) { if(this != &rec) { - destroy(); copy(rec); } @@ -66,11 +60,6 @@ RegexpCompiler::copy(RegexpCompiler const &rec) default_weight = 
rec.default_weight; } -void -RegexpCompiler::destroy() -{ -} - bool RegexpCompiler::isReserved(int const t) { diff --git a/lttoolbox/regexp_compiler.h b/lttoolbox/regexp_compiler.h index dbe79e3d..e61925c4 100644 --- a/lttoolbox/regexp_compiler.h +++ b/lttoolbox/regexp_compiler.h @@ -17,7 +17,6 @@ #ifndef _REGEXP_COMPILER_ #define _REGEXP_COMPILER_ -#include #include #include @@ -25,8 +24,6 @@ #include #include -#define FIN_FICHERO - 1 - /** * Compiler that builds a transducer to identify regular expressions. This * compiler is a recursive descendent parser (RDP). @@ -37,7 +34,7 @@ class RegexpCompiler /** * Last token */ - int token; + int token = 0; /** * Input string @@ -47,12 +44,12 @@ class RegexpCompiler /** * Location in the input string */ - size_t index; + size_t index = 0; /** * Alphabet to encode symbols */ - Alphabet *alphabet; + Alphabet *alphabet = nullptr; /** * Transducer to store analysis @@ -62,22 +59,22 @@ class RegexpCompiler /** * Current state */ - int state; + int state = 0; /** * Current letter */ - int letter; + int letter = 0; /** * Post-operator: '+', '?', '*' */ - UChar32 postop; + UChar32 postop = '\0'; /** * Default value of weight */ - double default_weight; + double default_weight = 0.0000; /** * @@ -90,11 +87,6 @@ class RegexpCompiler */ void copy(RegexpCompiler const &rec); - /** - * Destroy method - */ - void destroy(); - /** * RDP top function */ diff --git a/lttoolbox/string_utils.cc b/lttoolbox/string_utils.cc index 59b30a0e..34d0974a 100644 --- a/lttoolbox/string_utils.cc +++ b/lttoolbox/string_utils.cc @@ -257,6 +257,13 @@ StringUtils::caseequal(const UString& a, const UString& b) return (cmp == 0); } +bool +StringUtils::startswith(const UString& str, const UString& prefix) +{ + return (prefix.size() <= str.size() && + str.substr(0, prefix.size()) == prefix); +} + bool StringUtils::endswith(const UString& str, const UString& suffix) { diff --git a/lttoolbox/string_utils.h b/lttoolbox/string_utils.h index adf2d29d..7c2771fb 
100644 --- a/lttoolbox/string_utils.h +++ b/lttoolbox/string_utils.h @@ -34,6 +34,7 @@ class StringUtils { static bool caseequal(const UString& a, const UString& b); + static bool startswith(const UString& str, const UString& prefix); static bool endswith(const UString& str, const UString& suffix); static UString merge_wblanks(const UString& w1, const UString& w2); diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index 3dd0abcd..dbb2c31e 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -1345,3 +1345,48 @@ Transducer::invert(Alphabet& alpha) } transitions.swap(tmp_trans); } + +void +Transducer::deleteSymbols(const sorted_vector& syms) +{ + for (auto& state : transitions) { + for (auto& sym : syms) { + state.second.erase(sym); + } + } +} + +void +Transducer::epsilonizeSymbols(const sorted_vector& syms) +{ + for (auto& state: transitions) { + for (auto& sym : syms) { + auto pr = state.second.equal_range(sym); + for (auto it = pr.first; it != pr.second; it++) { + state.second.insert(std::make_pair(0, it->second)); + } + state.second.erase(sym); + } + } +} + +void +Transducer::applyACX(Alphabet& alpha, + const std::map>& acx) +{ + for (auto& state : transitions) { + std::vector>> to_insert; + for (auto& it : state.second) { + auto pr = alpha.decode(it.first); + auto loc = acx.find(pr.first); + if (loc != acx.end()) { + for (auto& sym : loc->second) { + to_insert.push_back(std::make_pair(alpha(sym, pr.second), it.second)); + } + } + } + for (auto& it : to_insert) { + state.second.insert(it); + } + } +} diff --git a/lttoolbox/transducer.h b/lttoolbox/transducer.h index 8c1aeded..2ad56b37 100644 --- a/lttoolbox/transducer.h +++ b/lttoolbox/transducer.h @@ -424,6 +424,23 @@ class Transducer * Invert all transitions so x:y becomes y:x (this will update alpha). 
*/ void invert(Alphabet& alpha); + + /** + * Deletes all transitions with a symbol pair in syms + */ + void deleteSymbols(const sorted_vector& syms); + + /** + * For every transition with a symbol pair in syms, + * change the symbol pair to epsilon + */ + void epsilonizeSymbols(const sorted_vector& syms); + + /** + * Given a map of symbols, x:[a,b,c], + * expand all x:y transitions to x:y, a:y, b:y, c:y + */ + void applyACX(Alphabet& alpha, const std::map>& acx); }; #endif diff --git a/lttoolbox/xml_parse_util.cc b/lttoolbox/xml_parse_util.cc index e5d132ff..a59d3f12 100644 --- a/lttoolbox/xml_parse_util.cc +++ b/lttoolbox/xml_parse_util.cc @@ -20,6 +20,16 @@ #include #include +xmlTextReaderPtr +XMLParseUtil::open_or_exit(const char* fname) +{ + xmlTextReaderPtr reader = xmlReaderForFile(fname, NULL, 0); + if (reader == NULL) { + std::cerr << "Error: cannot open '" << fname << "' for reading." << std::endl; + exit(EXIT_FAILURE); + } + return reader; +} UString XMLParseUtil::attrib(xmlTextReaderPtr reader, UString const &name) @@ -88,3 +98,12 @@ XMLParseUtil::readValueInto32(xmlTextReaderPtr reader, std::vector& vec vec.reserve(vec.size() + sz); utf8::utf8to32(val, val+sz, std::back_inserter(vec)); } + +bool +XMLParseUtil::allBlanks(xmlTextReaderPtr reader) +{ + for (auto& c : readValue(reader)) { + if (!u_isspace(c)) return false; + } + return true; +} diff --git a/lttoolbox/xml_parse_util.h b/lttoolbox/xml_parse_util.h index b558d1c7..2b9dc823 100644 --- a/lttoolbox/xml_parse_util.h +++ b/lttoolbox/xml_parse_util.h @@ -29,6 +29,8 @@ class XMLParseUtil { public: + static xmlTextReaderPtr open_or_exit(const char* fname); + /* If attrib does not exist (or other error), returns an empty string: */ static UString attrib(xmlTextReaderPtr reader, UString const &name); @@ -40,6 +42,8 @@ class XMLParseUtil static UString readName(xmlTextReaderPtr reader); static UString readValue(xmlTextReaderPtr reader); static void readValueInto32(xmlTextReaderPtr reader, std::vector& 
vec); + + static bool allBlanks(xmlTextReaderPtr reader); }; #endif diff --git a/tests/data/basic.acx b/tests/data/basic.acx new file mode 100644 index 00000000..c5c1ae13 --- /dev/null +++ b/tests/data/basic.acx @@ -0,0 +1,7 @@ + + + + + + + diff --git a/tests/data/variants.dix b/tests/data/variants.dix new file mode 100644 index 00000000..5c191de9 --- /dev/null +++ b/tests/data/variants.dix @@ -0,0 +1,30 @@ + + + ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- + + + + + + + + + + + + +
+

abcab

+

abab

+

ababbb

+

yy

+

nn

+
+ +
+

jgjg

+

jhjh

+

kgkg

+
+ +
diff --git a/tests/lt_apply_acx/__init__.py b/tests/lt_apply_acx/__init__.py new file mode 100644 index 00000000..4e4aab38 --- /dev/null +++ b/tests/lt_apply_acx/__init__.py @@ -0,0 +1,19 @@ +import unittest +from basictest import ProcTest + +class AcxTest(unittest.TestCase, ProcTest): + dix = 'data/minimal-mono.dix' + acx = 'data/basic.acx' + procdir = 'lr' + inputs = ['abc', 'ábc', 'äbc'] + expectedOutputs = ['^abc/ab$', + '^ábc/ab$', + '^äbc/ab$'] + + def compileTest(self, tmpd): + ret = self.compileDix(self.procdir, self.dix, + binName=tmpd+'/plain.bin') + if not ret: return ret + self.callProc('lt-apply-acx', + [tmpd+'/plain.bin', self.acx, tmpd+'/compiled.bin']) + return True diff --git a/tests/lt_comp/__init__.py b/tests/lt_comp/__init__.py index e51faca4..1e7fe4aa 100644 --- a/tests/lt_comp/__init__.py +++ b/tests/lt_comp/__init__.py @@ -70,3 +70,29 @@ class CompLSX(unittest.TestCase, PrintTest): 13 14 ε ε 0.000000\t 14 0.000000 ''' + +class RestrictTest(unittest.TestCase, ProcTest): + procdix = 'data/variants.dix' + procdir = 'lr' + restrictflags = [] + inputs = ['abc', 'ab'] + expectedOutputs = ['^abc/ab$', '^ab/*ab$'] + + def compileTest(self, tmpd): + ret = self.compileDix('u', self.procdix, binName=tmpd+'/uni.bin') + if not ret: return ret + self.callProc('lt-restrict', + [self.procdir, tmpd+'/uni.bin', tmpd+'/compiled.bin'], + self.restrictflags) + +class RestrictRL1(RestrictTest): + procdir = 'rl' + restrictflags = ['-v', 'gascon'] + inputs = ['abc', 'ab'] + expectedOutputs = ['^abc/*abc$', '^ab/ab$'] + +class RestrictRL2(RestrictTest): + procdir = 'rl' + restrictflags = ['-v', 'oci'] + inputs = ['abc', 'ab'] + expectedOutputs = ['^abc/*abc$', '^ab/abbb$'] diff --git a/tests/run_tests.py b/tests/run_tests.py index 8754b3ed..adb432b1 100755 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -11,7 +11,7 @@ os.environ['LTTOOLBOX_PATH'] = sys.argv[1] modules = ['lt_proc', 'lt_trim', 'lt_print', 'lt_comp', 'lt_append', - 'lt_paradigm', 'lt_expand'] + 
'lt_paradigm', 'lt_expand', 'lt_apply_acx'] if __name__ == "__main__": os.chdir(os.path.dirname(__file__))