diff --git a/configure.ac b/configure.ac index 1a329446..5e4d2b17 100644 --- a/configure.ac +++ b/configure.ac @@ -46,7 +46,12 @@ AC_CHECK_LIB(xml2, xmlReaderForFile) # Checks for header files. AC_HEADER_STDC -AC_CHECK_HEADERS([stdlib.h string.h unistd.h stddef.h]) +AC_CHECK_HEADERS([stdlib.h string.h unistd.h stddef.h string_view]) + +have_sv="" +AC_CHECK_HEADERS([string_view], [have_sv="-DHAVE_STRING_VIEW"], [have_sv=""]) +AC_SUBST([have_sv]) + AC_CHECK_HEADER([utf8cpp/utf8.h], [CPPFLAGS="-I/usr/include/utf8cpp/ $CPPFLAGS"], [ AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])]) ]) diff --git a/lttoolbox.pc.in b/lttoolbox.pc.in index 9ecf8f51..7b435232 100644 --- a/lttoolbox.pc.in +++ b/lttoolbox.pc.in @@ -7,4 +7,4 @@ Name: lttoolbox Description: Augmented letter transducer tools for natural language processing Version: @VERSION@ Libs: -L${libdir} -llttoolbox@VERSION_MAJOR@ -Cflags: -I${includedir}/lttoolbox-@VERSION_API@ +Cflags: -I${includedir}/lttoolbox-@VERSION_API@ @have_sv@ diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index 4f031cd0..38e92fec 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -1,14 +1,14 @@ -h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ - deserialiser.h entry_token.h expander.h file_utils.h fst_processor.h input_file.h lt_locale.h \ - match_exe.h match_node.h match_state.h my_stdio.h node.h \ - pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h \ - transducer.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ +h_sources = alphabet.h alphabet_exe.h att_compiler.h binary_headers.h buffer.h compiler.h compression.h \ + deserialiser.h endian_util.h entry_token.h expander.h file_utils.h fst_processor.h input_file.h lt_locale.h \ + match_exe.h match_node.h match_state.h match_state2.h mmap.h my_stdio.h node.h old_binary.h \ + pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h symbol_iter.h \ + transducer.h transducer_exe.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ ustring.h sorted_vector.hpp -cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ +cc_sources = alphabet.cc alphabet_exe.cc att_compiler.cc compiler.cc binary_headers.cc compression.cc entry_token.cc \ expander.cc file_utils.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \ - match_node.cc match_state.cc node.cc pattern_list.cc \ - regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc transducer.cc \ + match_node.cc match_state.cc match_state2.cc node.cc old_binary.cc pattern_list.cc \ + regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc symbol_iter.cc transducer.cc transducer_exe.cc \ trans_exe.cc xml_parse_util.cc xml_walk_util.cc tmx_compiler.cc ustring.cc library_includedir = $(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME) diff --git a/lttoolbox/alphabet.cc b/lttoolbox/alphabet.cc index 334213b8..7f48ff83 100644 --- a/lttoolbox/alphabet.cc +++ b/lttoolbox/alphabet.cc @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -110,12 +112,6 @@ Alphabet::operator()(UString const &s) const return it->second; } -bool -Alphabet::isSymbolDefined(UString const &s) -{ - return slexic.find(s) != slexic.end(); -} - bool Alphabet::isSymbolDefined(const UString& s) const { @@ -129,23 +125,20 @@ Alphabet::size() const } void -Alphabet::write(FILE *output) +Alphabet::write(FILE *output) const { // First, we write the taglist Compression::multibyte_write(slexicinv.size(), output); // taglist size - for(size_t i = 0, limit = slexicinv.size(); i < limit; i++) - { - Compression::string_write(slexicinv[i].substr(1, slexicinv[i].size()-2), output); + for (auto& it : slexicinv) { + Compression::string_write(it.substr(1, it.size()-2), output); } - // Then we write the list of pairs // All numbers are biased + slexicinv.size() to be positive or zero size_t bias = slexicinv.size(); Compression::multibyte_write(spairinv.size(), output); - for(size_t i = 0, limit = spairinv.size(); i != limit; i++) - { - Compression::multibyte_write(spairinv[i].first + bias, output); - Compression::multibyte_write(spairinv[i].second + bias, output); + for (auto& it : spairinv) { + Compression::multibyte_write(it.first + bias, output); + Compression::multibyte_write(it.second + bias, output); } } @@ -157,26 +150,20 @@ Alphabet::read(FILE *input) a_new.spair.clear(); // Reading of taglist - int32_t tam = Compression::multibyte_read(input); - std::map tmp; - while(tam > 0) - { - tam--; - UString mytag = "<"_u; - mytag += Compression::string_read(input); - mytag += ">"_u; + for (uint64_t tam = OldBinary::read_int(input, true); tam > 0; tam--) { + UString mytag; + mytag += '<'; + OldBinary::read_ustr(input, mytag, true); + mytag += '>'; a_new.slexicinv.push_back(mytag); a_new.slexic[mytag]= -a_new.slexicinv.size(); // ToDo: This does not turn the result negative due to unsigned semantics } // Reading of pairlist size_t bias = a_new.slexicinv.size(); - tam = Compression::multibyte_read(input); - while(tam > 0) - { - tam--; - int32_t first = Compression::multibyte_read(input); - int32_t second = Compression::multibyte_read(input); + for (uint64_t tam = OldBinary::read_int(input, true); tam > 0; tam--) { + int32_t first = OldBinary::read_int(input, true); + int32_t second = OldBinary::read_int(input, true); std::pair tmp(first - bias, second - bias); int32_t spair_size = a_new.spair.size(); a_new.spair[tmp] = spair_size; @@ -186,6 +173,30 @@ Alphabet::read(FILE *input) *this = a_new; } +void +Alphabet::write_mmap(FILE* output, StringWriter& sw) const +{ + write_le_64(output, slexicinv.size()); + for (auto& it : slexicinv) { + StringRef r = sw.add(it); + write_le_32(output, r.start); + write_le_32(output, r.count); + } +} + +void +Alphabet::read_mmap(FILE* input, StringWriter& sw) +{ + int64_t count = read_le_64(input); + for (int64_t i = 0; i < count; i++) { + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + UString t = UString{sw.get(s, c)}; + slexicinv.push_back(t); + slexic[t] = -i-1; + } +} + void Alphabet::serialise(std::ostream &serialised) const { @@ -210,6 +221,30 @@ Alphabet::deserialise(std::istream &serialised) } } +void +Alphabet::read_serialised(FILE* in) +{ + slexicinv.clear(); + slexic.clear(); + spairinv.clear(); + spair.clear(); + uint64_t len = OldBinary::read_int(in, false); + for (uint64_t i = 0; i < len; i++) { + UString t; + OldBinary::read_ustr(in, t, false); + slexicinv.push_back(t); + slexic[t] = -(int)i - 1; + } + len = OldBinary::read_int(in, false); + for (uint64_t i = 0; i < len; i++) { + int32_t a = OldBinary::read_int(in, false); + int32_t b = OldBinary::read_int(in, false); + auto p = make_pair(a, b); + spairinv.push_back(p); + spair[p] = i; + } +} + void Alphabet::writeSymbol(int32_t const symbol, UFILE *output) const { @@ -307,6 +342,12 @@ Alphabet::createLoopbackSymbols(std::set &symbols, Alphabet &basis, Sid } } +std::vector& +Alphabet::getTags() +{ + return slexicinv; +} + std::vector Alphabet::tokenize(const UString& str) const { diff --git a/lttoolbox/alphabet.h b/lttoolbox/alphabet.h index 373dece8..62b8db06 100644 --- a/lttoolbox/alphabet.h +++ b/lttoolbox/alphabet.h @@ -23,6 +23,7 @@ #include #include #include +#include #include using namespace icu; @@ -114,9 +115,6 @@ class Alphabet * @param s symbol * @return true if defined */ - bool isSymbolDefined(UString const &s); - // TODO: This should always be const. - // But binary compatibility, so have 2 copies for now. bool isSymbolDefined(UString const &s) const; /** @@ -129,7 +127,7 @@ class Alphabet * Write method. * @param output output stream. */ - void write(FILE *output); + void write(FILE *output) const; /** * Read method. @@ -137,9 +135,14 @@ class Alphabet */ void read(FILE *input); + void write_mmap(FILE* output, StringWriter& sw) const; + void read_mmap(FILE* input, StringWriter& sw); + void serialise(std::ostream &serialised) const; void deserialise(std::istream &serialised); + void read_serialised(FILE* in); + /** * Write a symbol enclosed by angle brackets in the output stream. * @param symbol symbol code. @@ -200,6 +203,11 @@ class Alphabet */ void createLoopbackSymbols(std::set &symbols, Alphabet &basis, Side s = right, bool nonTagsToo = false); + /** + * Return a reference to the array of tags + */ + std::vector& getTags(); + std::vector tokenize(const UString& str) const; bool sameSymbol(const int32_t tsym, const Alphabet& other, const int32_t osym, diff --git a/lttoolbox/alphabet_exe.cc b/lttoolbox/alphabet_exe.cc new file mode 100644 index 00000000..7e575a0f --- /dev/null +++ b/lttoolbox/alphabet_exe.cc @@ -0,0 +1,200 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include + +#include +#include + +#include + +AlphabetExe::AlphabetExe(StringWriter* sw_) + : sw(sw_), tag_count(0), tags(nullptr) +{} + +AlphabetExe::~AlphabetExe() +{ + if (!mmapping) { + delete[] tags; + } +} + +void +AlphabetExe::read(FILE* input, bool mmap, bool compressed) +{ + if (mmap) { + tag_count = read_le_64(input); + tags = new StringRef[tag_count]; + for (uint64_t i = 0; i < tag_count; i++) { + tags[i].start = read_le_32(input); + tags[i].count = read_le_32(input); + symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; + } + } else { + tag_count = OldBinary::read_int(input, compressed); + tags = new StringRef[tag_count]; + for (uint32_t i = 0; i < tag_count; i++) { + UString tg; + if (compressed) { + tg += '<'; + OldBinary::read_ustr(input, tg, compressed); + tg += '>'; + } else { + OldBinary::read_ustr(input, tg, compressed); + } + tags[i] = sw->add(tg); + } + // has to be a separate loop, otherwise the string_views get + // invalidated when the StringWriter buffer expands + reindex(); + int pairs = OldBinary::read_int(input); + for (int i = 0; i < pairs; i++) { + OldBinary::read_int(input, compressed); + OldBinary::read_int(input, compressed); + } + } +} + +void* +AlphabetExe::init(void* ptr) +{ + mmapping = true; + // TODO: why is from_le_64 segfaulting here? + //tag_count = from_le_64(reinterpret_cast(ptr)[0]); + tag_count = reinterpret_cast(ptr)[0]; + tags = reinterpret_cast(ptr + sizeof(uint64_t)); + for (uint64_t i = 0; i < tag_count; i++) { + symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; + } + return ptr + sizeof(uint64_t) + tag_count*sizeof(StringRef); +} + +int32_t +AlphabetExe::operator()(UString_view sv) const +{ + auto it = symbol_map.find(sv); + if (it != symbol_map.end()) { + return it->second; + } else { + return 0; + } +} + +void +AlphabetExe::getSymbol(UString& result, int32_t symbol, bool uppercase) const +{ + if (symbol == 0) { + return; + } else if (symbol < 0) { + if (clearedSymbols.find(symbol) != clearedSymbols.end()) { + return; + } + int idx = -symbol-1; + if (idx < tag_count) { + result.append(sw->get(tags[idx])); + } else { + result.append(dynamic_symbols[idx-tag_count]); + } + } else if (uppercase) { + result += u_toupper(static_cast(symbol)); + } else { + result += static_cast(symbol); + } +} + +bool +AlphabetExe::isTag(const int32_t symbol) const +{ + return symbol < 0; +} + +void +AlphabetExe::clearSymbol(const int32_t symbol) +{ + if (symbol < 0) { + if (mmapping) { + clearedSymbols.insert(symbol); + } else { + tags[-symbol-1].start = 0; + tags[-symbol-1].count = 0; + } + } +} + +void +AlphabetExe::reindex() +{ + symbol_map.clear(); + for (uint64_t i = 0; i < tag_count; i++) { + symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; + } + int32_t n = -tag_count-1; + for (auto& ds : dynamic_symbols) { + symbol_map[ds] = n--; + } +} + +int32_t +AlphabetExe::lookupDynamic(const UString& symbol) +{ + int32_t ret; + auto it = symbol_map.find(symbol); + if (it == symbol_map.end()) { + if (dynamic_symbols.empty()) { + // should be able to usually avoid reindexing with this + dynamic_symbols.reserve(32); + } + ret = -tag_count -dynamic_symbols.size() -1; + bool rebuild = (dynamic_symbols.size() == dynamic_symbols.capacity()); + dynamic_symbols.push_back(symbol); + symbol_map[dynamic_symbols.back()] = ret; + if (rebuild) { + // moderately horrible, but that's what we get for invalidating + // all the views when dynamic_symbols gets reallocated + reindex(); + } + } else { + ret = it->second; + } + return ret; +} + +std::vector +AlphabetExe::tokenize(const UString& str) const +{ + std::vector ret; + size_t end = str.size(); + size_t i = 0; + UChar32 c; + while (i < end) { + U16_NEXT(str.c_str(), i, end, c); + if (c == '\\') { + } else if (c == '<') { + size_t j = i; + while (c != '>' && j < end) { + U16_NEXT(str.c_str(), j, end, c); + } + if (c == '>') { + ret.push_back(operator()(str.substr(i-1, j-i+1))); + i = j; + } + } else { + ret.push_back(static_cast(c)); + } + } + return ret; +} diff --git a/lttoolbox/alphabet_exe.h b/lttoolbox/alphabet_exe.h new file mode 100644 index 00000000..8a693b93 --- /dev/null +++ b/lttoolbox/alphabet_exe.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _LT_ALPHABET_EXE_ +#define _LT_ALPHABET_EXE_ + +#include +#include +#include +#include + +class AlphabetExe { +private: + StringWriter* sw; + uint64_t tag_count; + StringRef* tags; + std::map symbol_map; + bool mmapping = false; + // tags added at runtime - used by apertium-separable + std::vector dynamic_symbols; + // tags that should not be printed, such as + // used by clearSymbol() if we're mmapping since we can't edit the data + std::set clearedSymbols; +public: + AlphabetExe(StringWriter* sw_); + ~AlphabetExe(); + void read(FILE* in, bool mmap, bool compressed=true); + void* init(void* ptr); + int32_t operator()(UString_view sv) const; + void getSymbol(UString& result, int32_t symbol, bool uppercase = false) const; + bool isTag(const int32_t symbol) const; + void clearSymbol(const int32_t symbol); + // call this after StringWriter buffer gets updated + void reindex(); + // like operator() but add symbol to dynamic_symbols if not found + int32_t lookupDynamic(const UString& symbol); + std::vector tokenize(const UString& str) const; +}; + +#endif diff --git a/lttoolbox/binary_headers.cc b/lttoolbox/binary_headers.cc new file mode 100644 index 00000000..2eab19af --- /dev/null +++ b/lttoolbox/binary_headers.cc @@ -0,0 +1,22 @@ +#include + +#include +#include + +bool +readHeader(FILE* input, const char* expect_header, uint64_t& feats) +{ + feats = 0; + fpos_t pos; + if (fgetpos(input, &pos) == 0) { + char header[4]{}; + auto r = fread_unlocked(header, 1, 4, input); + if (r == 4 && strncmp(header, expect_header, 4) == 0) { + feats = read_le_64(input); + return true; + } else { + fsetpos(input, &pos); + } + } + return false; +} diff --git a/lttoolbox/binary_headers.h b/lttoolbox/binary_headers.h new file mode 100644 index 00000000..da48a6d3 --- /dev/null +++ b/lttoolbox/binary_headers.h @@ -0,0 +1,26 @@ +#ifndef _LT_BINARY_HEADERS_ +#define _LT_BINARY_HEADERS_ + +#include +#include + +// Global lttoolbox features +constexpr char HEADER_LTTOOLBOX[4]{'L', 'T', 'T', 'B'}; +enum LT_FEATURES : uint64_t { + LTF_MMAP = (1ull << 0), // using mmap-compatible format rather than compressed format + LTF_UNKNOWN = (1ull << 1), // Features >= this are unknown, so throw an error; Inc this if more features are added + LTF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits +}; + +// Invididual transducer features +constexpr char HEADER_TRANSDUCER[4]{'L', 'T', 'T', 'D'}; +enum TD_FEATURES : uint64_t { + TDF_WEIGHTS = (1ull << 0), + TDF_MMAP = (1ull << 1), + TDF_UNKNOWN = (1ull << 2), // Features >= this are unknown, so throw an error; Inc this if more features are added + TDF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits +}; + +bool readHeader(FILE* input, const char* expect_header, uint64_t& feats); + +#endif diff --git a/lttoolbox/compiler.cc b/lttoolbox/compiler.cc index 73d3ad17..cf0ff5bc 100644 --- a/lttoolbox/compiler.cc +++ b/lttoolbox/compiler.cc @@ -16,10 +16,12 @@ */ #include #include +#include #include #include #include #include +#include #include #include @@ -208,7 +210,7 @@ Compiler::procAlphabet() bool space = true; for(unsigned int i = 0; i < letters.length(); i++) { - if(!u_isspace(letters.at(i))) + if(!u_isspace(letters[i])) { space = false; break; diff --git a/lttoolbox/compression.h b/lttoolbox/compression.h index 237a5f9e..e9ffe1f7 100644 --- a/lttoolbox/compression.h +++ b/lttoolbox/compression.h @@ -23,22 +23,9 @@ #include #include #include +#include -// Global lttoolbox features -constexpr char HEADER_LTTOOLBOX[4]{'L', 'T', 'T', 'B'}; -enum LT_FEATURES : uint64_t { - LTF_UNKNOWN = (1ull << 0), // Features >= this are unknown, so throw an error; Inc this if more features are added - LTF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits -}; - -// Invididual transducer features -constexpr char HEADER_TRANSDUCER[4]{'L', 'T', 'T', 'D'}; -enum TD_FEATURES : uint64_t { - TDF_WEIGHTS = (1ull << 0), - TDF_UNKNOWN = (1ull << 1), // Features >= this are unknown, so throw an error; Inc this if more features are added - TDF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits -}; - +using namespace std; inline auto write_u64(FILE *out, uint64_t value) { auto rv = fwrite_unlocked(reinterpret_cast(&value), 1, sizeof(value), out); diff --git a/lttoolbox/endian_util.h b/lttoolbox/endian_util.h new file mode 100644 index 00000000..fadbbacc --- /dev/null +++ b/lttoolbox/endian_util.h @@ -0,0 +1,122 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _LT_ENDIAN_UTIL_ +#define _LT_ENDIAN_UTIL_ + +#include +#include +#include +#include + +inline uint32_t to_le_32(uint32_t& v) { + uint8_t* bytes = reinterpret_cast(&v); + bytes[3] = (v >> 24) & 0xFF; + bytes[2] = (v >> 16) & 0xFF; + bytes[1] = (v >> 8) & 0xFF; + bytes[0] = v & 0xFF; + return v; +} + +inline uint32_t from_le_32(uint32_t& v) { + uint8_t* bytes = reinterpret_cast(&v); + v = ((bytes[3] << 24) | + (bytes[2] << 16) | + (bytes[1] << 8) | + bytes[0]); + return v; +} + +inline uint64_t to_le_64(uint64_t& v) { + uint8_t* bytes = reinterpret_cast(&v); + bytes[7] = (v >> 56) & 0xFF; + bytes[6] = (v >> 48) & 0xFF; + bytes[5] = (v >> 40) & 0xFF; + bytes[4] = (v >> 32) & 0xFF; + bytes[3] = (v >> 24) & 0xFF; + bytes[2] = (v >> 16) & 0xFF; + bytes[1] = (v >> 8) & 0xFF; + bytes[0] = v & 0xFF; + return v; +} + +inline uint64_t from_le_64(uint64_t& v) { + uint8_t* bytes = reinterpret_cast(&v); + v = ((static_cast(bytes[7]) << 56ull) | + (static_cast(bytes[6]) << 48ull) | + (static_cast(bytes[5]) << 40ull) | + (static_cast(bytes[4]) << 32ull) | + (static_cast(bytes[3]) << 24ull) | + (static_cast(bytes[2]) << 16ull) | + (static_cast(bytes[1]) << 8ull) | + (static_cast(bytes[0]))); + return v; +} + +inline auto write_le_32(FILE* out, uint32_t value) { + uint32_t v = to_le_32(value); + auto rv = fwrite_unlocked(reinterpret_cast(&v), 1, sizeof(value), out); + if (rv != sizeof(value)) { + throw std::runtime_error("Failed to write uint32_t"); + } + return rv; +} + +inline auto write_le_64(FILE* out, uint64_t value) { + uint64_t v = to_le_64(value); + auto rv = fwrite_unlocked(reinterpret_cast(&v), 1, sizeof(value), out); + if (rv != sizeof(value)) { + throw std::runtime_error("Failed to write uint64_t"); + } + return rv; +} + +inline auto read_le_32(FILE* in) { + uint32_t value = 0; + if (fread_unlocked(reinterpret_cast(&value), 1, sizeof(value), in) != sizeof(value)) { + throw std::runtime_error("Failed to read uint64_t"); + } + return from_le_32(value); +} + +inline auto read_le_64(FILE* in) { + uint64_t value = 0; + if (fread_unlocked(reinterpret_cast(&value), 1, sizeof(value), in) != sizeof(value)) { + throw std::runtime_error("Failed to read uint64_t"); + } + return from_le_64(value); +} + +inline auto write_le_s32(FILE* out, int32_t value) { + return write_le_32(out, *reinterpret_cast(&value)); +} + +inline auto read_le_s32(FILE* in) { + uint32_t val = read_le_32(in); + return *reinterpret_cast(&val); +} + +inline auto write_le_double(FILE* out, double value) { + return write_le_64(out, *reinterpret_cast(&value)); +} + +inline auto read_le_double(FILE* in) { + uint64_t val = read_le_64(in); + return *reinterpret_cast(&val); +} + +#endif diff --git a/lttoolbox/file_utils.cc b/lttoolbox/file_utils.cc index 98c0a006..37bdc067 100644 --- a/lttoolbox/file_utils.cc +++ b/lttoolbox/file_utils.cc @@ -16,9 +16,13 @@ */ #include -#include +#include +#include +#include +#include #include +#include UFILE* openOutTextFile(const std::string& fname) @@ -72,66 +76,190 @@ writeTransducerSet(FILE* output, const UString& letters, { fwrite_unlocked(HEADER_LTTOOLBOX, 1, 4, output); uint64_t features = 0; - write_le(output, features); + features |= LTF_MMAP; + write_le_64(output, features); - Compression::string_write(letters, output); - alpha.write(output); - Compression::multibyte_write(trans.size(), output); + uint64_t transducer_count = trans.size(); + + StringWriter sw; + StringRef letter_loc = sw.add(letters); + for (auto& it : alpha.getTags()) { + sw.add(it); + } + for (auto& it : trans) { + sw.add(it.first); + } + sw.write(output); + + // letters + write_le_32(output, letter_loc.start); + write_le_32(output, letter_loc.count); + + // symbols + alpha.write_mmap(output, sw); + + // transducers + write_le_64(output, transducer_count); for (auto& it : trans) { - Compression::string_write(it.first, output); - it.second.write(output); std::cout << it.first << " " << it.second.size(); std::cout << " " << it.second.numberOfTransitions() << std::endl; + StringRef loc = sw.add(it.first); + write_le_32(output, loc.start); + write_le_32(output, loc.count); + it.second.write_mmap(output, alpha); } } void -readShared(FILE* input, std::set& letters, Alphabet& alpha) +readTransducerSet(FILE* input, std::set& letters, + Alphabet& alpha, + std::map& trans) { - fpos_t pos; - if (fgetpos(input, &pos) == 0) { - char header[4]{}; - fread_unlocked(header, 1, 4, input); - if (strncmp(header, HEADER_LTTOOLBOX, 4) == 0) { - auto features = read_le(input); - if (features >= LTF_UNKNOWN) { - throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); - } - } else { - // Old binary format - fsetpos(input, &pos); + uint64_t features; + bool mmap = false; + if (readHeader(input, HEADER_LTTOOLBOX, features)) { + if (features >= LTF_UNKNOWN) { + throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); } + mmap = features & LTF_MMAP; } - for (int len = Compression::multibyte_read(input); len > 0; len--) { - letters.insert(static_cast(Compression::multibyte_read(input))); - } + UString letters_str; - alpha.read(input); -} + if (mmap) { + // make copies of all the strings we get from StringWriter + // because it gets deallocated when the function returns + StringWriter sw; + sw.read(input); -void -readTransducerSet(FILE* input, std::set& letters, - Alphabet& alpha, - std::map& trans) -{ - readShared(input, letters, alpha); + // letters + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + letters_str = UString{sw.get(s, c)}; + + // symbols + alpha.read_mmap(input, sw); + + uint64_t tr_count = read_le_64(input); + for (uint64_t i = 0; i < tr_count; i++) { + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + UString name = UString{sw.get(s, c)}; + trans[name].read_mmap(input, alpha); + } + } else { + // letters + OldBinary::read_ustr(input, letters_str, true); + + // symbols + alpha.read(input); + + int len = OldBinary::read_int(input, true); + + while(len > 0) { + UString name; + OldBinary::read_ustr(input, name, true); + trans[name].read(input); - for (int len = Compression::multibyte_read(input); len > 0; len--) { - UString name = Compression::string_read(input); - trans[name].read(input); + len--; + } } + letters = std::set(letters_str.begin(), letters_str.end()); } void -readTransducerSet(FILE* input, std::set& letters, - Alphabet& alpha, - std::map& trans) +readTransducerSet(FILE* input, + bool& mmapping, void* mmap_ptr, int& mmap_len, + StringWriter& str_write, + std::set* letters, AlphabetExe& alpha, + std::map& trans) { - readShared(input, letters, alpha); + uint64_t features; + bool mmap = false; + if (readHeader(input, HEADER_LTTOOLBOX, features)) { + if (features >= LTF_UNKNOWN) { + throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); + } + mmap = features & LTF_MMAP; + } + + if (mmap) { + fpos_t pos; + fgetpos(input, &pos); + rewind(input); + mmapping = mmap_file(input, mmap_ptr, mmap_len); + if (mmapping) { + void* ptr = mmap_ptr + 12; + ptr = str_write.init(ptr); + + if (letters != nullptr) { + StringRef let_loc = reinterpret_cast(ptr)[0]; + std::vector vec; + ustring_to_vec32(str_write.get(let_loc), vec); + letters->insert(vec.begin(), vec.end()); + ptr += sizeof(StringRef); + } + + ptr = alpha.init(ptr); + + uint64_t tr_count = reinterpret_cast(ptr)[0]; + ptr += sizeof(uint64_t); + for (uint64_t i = 0; i < tr_count; i++) { + StringRef tn = reinterpret_cast(ptr)[0]; + ptr += sizeof(StringRef); + UString name = UString{str_write.get(tn)}; + ptr = trans[name].init(ptr); + } + } else { + fsetpos(input, &pos); + + str_write.read(input); - for (int len = Compression::multibyte_read(input); len > 0; len--) { - UString name = Compression::string_read(input); - trans[name].read(input, alpha); + if (letters != nullptr) { + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + std::vector vec; + ustring_to_vec32(str_write.get(s, c), vec); + letters->insert(vec.begin(), vec.end()); + } + + alpha.read(input, true); + + uint64_t tr_count = read_le_64(input); + for (uint64_t i = 0; i < tr_count; i++) { + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + UString name = UString{str_write.get(s, c)}; + trans[name].read(input); + } + } + } else { + uint64_t len; + + if (letters != nullptr) { + // letters + len = OldBinary::read_int(input); + while(len > 0) { + letters->insert(static_cast(OldBinary::read_int(input))); + len--; + } + } + + // symbols + fpos_t pos; + fgetpos(input, &pos); + alpha.read(input, false); + fsetpos(input, &pos); + Alphabet temp; + temp.read(input); + + len = OldBinary::read_int(input); + + while(len > 0) { + UString name; + OldBinary::read_ustr(input, name); + trans[name].read_compressed(input, temp); + len--; + } } } diff --git a/lttoolbox/file_utils.h b/lttoolbox/file_utils.h index a2c81285..fae96b24 100644 --- a/lttoolbox/file_utils.h +++ b/lttoolbox/file_utils.h @@ -18,8 +18,11 @@ #ifndef __FILE_UTILS_H__ #include +#include +#include #include #include +#include #include @@ -37,4 +40,11 @@ void readTransducerSet(FILE* input, std::set& letters, Alphabet& alpha, std::map& trans); +// if letters == nullptr, then skip it (e.g. in lrx) +void readTransducerSet(FILE* input, + bool& mmapping, void* mmap_ptr, int& mmap_len, + StringWriter& str_write, + std::set* letters, AlphabetExe& alpha, + std::map& trans); + #endif // __FILE_UTILS_H__ diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index dcfdb945..c704efc8 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -15,10 +15,14 @@ * along with this program; if not, see . */ #include -#include +#include +#include #include +#include +#include #include #include +#include #include #include @@ -37,6 +41,7 @@ UString const FSTProcessor::WBLANK_FINAL = "[[/]]"_u; FSTProcessor::FSTProcessor() + : alphabet(AlphabetExe(&str_write)) { // escaped_chars chars escaped_chars.insert('['); @@ -57,6 +62,13 @@ FSTProcessor::FSTProcessor() } } +FSTProcessor::~FSTProcessor() +{ + if (mmapping) { + munmap(mmap_pointer, mmap_len); + } +} + void FSTProcessor::streamError() { @@ -643,11 +655,12 @@ FSTProcessor::flushBlanks(UFILE *output) void FSTProcessor::calcInitial() { + std::set temp; for(auto& it : transducers) { - root.addTransition(0, 0, it.second.getInitial(), default_weight); + temp.insert(&it.second); } - initial_state.init(&root); + initial_state.init(temp); } void @@ -656,23 +669,19 @@ FSTProcessor::classifyFinals() for(auto& it : transducers) { if(StringUtils::endswith(it.first, "@inconditional"_u)) { - inconditional.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + inconditional.insert(&it.second); } else if(StringUtils::endswith(it.first, "@standard"_u)) { - standard.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + standard.insert(&it.second); } else if(StringUtils::endswith(it.first, "@postblank"_u)) { - postblank.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + postblank.insert(&it.second); } else if(StringUtils::endswith(it.first, "@preblank"_u)) { - preblank.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + preblank.insert(&it.second); } else { @@ -720,7 +729,7 @@ FSTProcessor::writeEscapedPopBlanks(UString const &str, UFILE *output) u_fputc('\\', output); } u_fputc(str[i], output); - if (str[i] == ' ') { + if (str[i] == ' ' && !blankqueue.empty()) { if (blankqueue.front() == " "_u) { blankqueue.pop(); } else { @@ -750,8 +759,6 @@ FSTProcessor::writeEscapedWithTags(UString const &str, UFILE *output) } } - - void FSTProcessor::printWord(UString const &sf, UString const &lf, UFILE *output) { @@ -819,17 +826,17 @@ FSTProcessor::printSpace(UChar32 const val, UFILE *output) } void -FSTProcessor::printChar(const UChar32 val, UFILE* output) +FSTProcessor::writeChar(const UChar32 val, UFILE* output, bool single_blank) { - if (u_isspace(val)) { - if (blankqueue.size() > 0) { + if(u_isspace(val)) { + if (single_blank && !blankqueue.empty()) { write(blankqueue.front(), output); blankqueue.pop(); } else { - u_fputc(val, output); + printSpace(val, output); } } else { - if (isEscaped(val)) { + if(isEscaped(val)) { u_fputc('\\', output); } if (val) { @@ -853,7 +860,8 @@ FSTProcessor::isAlphabetic(UChar32 const c) const void FSTProcessor::load(FILE *input) { - readTransducerSet(input, alphabetic_chars, alphabet, transducers); + readTransducerSet(input, mmapping, mmap_pointer, mmap_len, + str_write, &alphabetic_chars, alphabet, transducers); } void @@ -873,8 +881,7 @@ FSTProcessor::initTMAnalysis() calcInitial(); for(auto& it : transducers) { - all_finals.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + all_finals.insert(&it.second); } } @@ -884,8 +891,7 @@ FSTProcessor::initGeneration() setIgnoredChars(false); calcInitial(); for(auto& it : transducers) { - all_finals.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + all_finals.insert(&it.second); } } @@ -955,7 +961,7 @@ FSTProcessor::initDecompositionSymbols() } else if(!showControlSymbols) { - alphabet.setSymbol(compoundOnlyLSymbol, ""_u); + alphabet.clearSymbol(compoundOnlyLSymbol); } if((compoundRSymbol=alphabet("<:co:R>"_u)) == 0 @@ -968,7 +974,7 @@ FSTProcessor::initDecompositionSymbols() } else if(!showControlSymbols) { - alphabet.setSymbol(compoundRSymbol, ""_u); + alphabet.clearSymbol(compoundRSymbol); } } @@ -1139,7 +1145,7 @@ FSTProcessor::analysis(InputFile& input, UFILE *output) } else if(!isAlphabetic(val) && sf.empty()) { - printChar(val, output); + writeChar(val, output, true); } else if(last_postblank) { @@ -1342,18 +1348,7 @@ FSTProcessor::tm_analysis(InputFile& input, UFILE *output) { if((u_isspace(val) || u_ispunct(val)) && sf.empty()) { - if(u_isspace(val)) - { - printSpace(val, output); - } - else - { - if(isEscaped(val)) - { - u_fputc('\\', output); - } - u_fputc(val, output); - } + writeChar(val, output, false); } else if(!u_isspace(val) && !u_ispunct(val) && ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) || @@ -1821,77 +1816,41 @@ FSTProcessor::biltransfull(UString const &input_word, bool with_delim) { State current_state = initial_state; UString result; - unsigned int start_point = 1; - unsigned int end_point = input_word.size()-2; + unsigned int start = 1; + unsigned int end = input_word.size()-2; UString queue; bool mark = false; if(with_delim == false) { - start_point = 0; - end_point = input_word.size()-1; + start = 0; + end = input_word.size()-1; } - if(input_word[start_point] == '*') + if(input_word[start] == '*') { return input_word; } - if(input_word[start_point] == '=') + if(input_word[start] == '=') { - start_point++; + start++; mark = true; } - bool firstupper = u_isupper(input_word[start_point]); - bool uppercase = firstupper && u_isupper(input_word[start_point+1]); - - for(unsigned int i = start_point; i <= end_point; i++) - { - int val; - UString symbol; + bool firstupper = u_isupper(input_word[start]); + bool uppercase = firstupper && u_isupper(input_word[start+1]); - if(input_word[i] == '\\') - { - i++; - val = static_cast(input_word[i]); + for (auto it = symbol_iter(input_word.substr(start, end-start+1), &alphabet); it != it.end(); ++it) { + if (current_state.size() != 0) { + current_state.step_case(*it, caseSensitive); } - else if(input_word[i] == '<') - { - symbol = '<'; - for(unsigned int j = i + 1; j <= end_point; j++) - { - symbol += input_word[j]; - if(input_word[j] == '>') - { - i = j; - break; - } - } - val = alphabet(symbol); - } - else - { - val = static_cast(input_word[i]); - } - if(current_state.size() != 0) - { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) - { - current_state.step(val, u_tolower(val)); - } - else - { - current_state.step(val); - } - } - if(current_state.isFinal(all_finals)) - { + if (current_state.isFinal(all_finals)) { result.clear(); - if(with_delim) { + if (with_delim) { result += '^'; } - if(mark) { + if (mark) { result += '='; } result += current_state.filterFinals(all_finals, alphabet, @@ -1902,9 +1861,9 @@ FSTProcessor::biltransfull(UString const &input_word, bool with_delim) if(current_state.size() == 0) { - if(!symbol.empty() && !result.empty()) + if(alphabet.isTag(*it) && !result.empty()) { - queue.append(symbol); + queue.append(it.string()); } else { @@ -1922,7 +1881,7 @@ FSTProcessor::biltransfull(UString const &input_word, bool with_delim) } } - if(start_point < (end_point - 3)) + if(start < (end - 3)) { return "^$"_u; } @@ -1999,44 +1958,10 @@ FSTProcessor::biltrans(UString const &input_word, bool with_delim) bool firstupper = u_isupper(input_word[start_point]); bool uppercase = firstupper && u_isupper(input_word[start_point+1]); - for(unsigned int i = start_point; i <= end_point; i++) - { - int val; - UString symbol; - - if(input_word[i] == '\\') - { - i++; - val = static_cast(input_word[i]); - } - else if(input_word[i] == '<') - { - symbol = '<'; - for(unsigned int j = i + 1; j <= end_point; j++) - { - symbol += input_word[j]; - if(input_word[j] == '>') - { - i = j; - break; - } - } - val = alphabet(symbol); - } - else - { - val = static_cast(input_word[i]); - } + for (auto it = symbol_iter(input_word.substr(start_point, end_point-start_point+1), &alphabet); it != it.end(); it++) { if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) - { - current_state.step(val, u_tolower(val)); - } - else - { - current_state.step(val); - } + current_state.step_case(*it, caseSensitive); } if(current_state.isFinal(all_finals)) { @@ -2055,9 +1980,9 @@ FSTProcessor::biltrans(UString const &input_word, bool with_delim) if(current_state.size() == 0) { - if(!symbol.empty() && !result.empty()) + if(alphabet.isTag(*it) && !result.empty()) { - queue.append(symbol); + queue.append(it.string()); } else { @@ -2355,45 +2280,13 @@ FSTProcessor::biltransWithQueue(UString const &input_word, bool with_delim) bool firstupper = u_isupper(input_word[start_point]); bool uppercase = firstupper && u_isupper(input_word[start_point+1]); - for(unsigned int i = start_point; i <= end_point; i++) - { - int val = 0; - UString symbol; - - if(input_word[i] == '\\') - { - i++; - val = input_word[i]; - } - else if(input_word[i] == '<') - { + for (auto it = symbol_iter(input_word.substr(start_point, end_point-start_point+1), &alphabet); it != it.end(); it++) { + if (alphabet.isTag(*it)) { seentags = true; - symbol = '<'; - for(unsigned int j = i + 1; j <= end_point; j++) - { - symbol += input_word[j]; - if(input_word[j] == '>') - { - i = j; - break; - } - } - val = alphabet(symbol); - } - else - { - val = input_word[i]; } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) - { - current_state.step(val, u_tolower(val)); - } - else - { - current_state.step(val); - } + current_state.step_case(*it, caseSensitive); } if(current_state.isFinal(all_finals)) { @@ -2412,9 +2305,9 @@ FSTProcessor::biltransWithQueue(UString const &input_word, bool with_delim) if(current_state.size() == 0) { - if(!symbol.empty() && !result.empty()) + if(alphabet.isTag(*it) && !result.empty()) { - queue.append(symbol); + queue.append(it.string()); } else { @@ -2521,44 +2414,10 @@ FSTProcessor::biltransWithoutQueue(UString const &input_word, bool with_delim) bool firstupper = u_isupper(input_word[start_point]); bool uppercase = firstupper && u_isupper(input_word[start_point+1]); - for(unsigned int i = start_point; i <= end_point; i++) - { - int val; - UString symbol; - - if(input_word[i] == '\\') - { - i++; - val = static_cast(input_word[i]); - } - else if(input_word[i] == '<') - { - symbol = '<'; - for(unsigned int j = i + 1; j <= end_point; j++) - { - symbol += input_word[j]; - if(input_word[j] == '>') - { - i = j; - break; - } - } - val = alphabet(symbol); - } - else - { - val = static_cast(input_word[i]); - } + for (auto it = symbol_iter(input_word.substr(start_point, end_point-start_point+1), &alphabet); it != it.end(); it++) { if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) - { - current_state.step(val, u_tolower(val)); - } - else - { - current_state.step(val); - } + current_state.step_case(*it, caseSensitive); } if(current_state.isFinal(all_finals)) { @@ -2577,7 +2436,7 @@ FSTProcessor::biltransWithoutQueue(UString const &input_word, bool with_delim) if(current_state.size() == 0) { - if(symbol.empty()) + if(!alphabet.isTag(*it)) { // word is not present if(with_delim) @@ -2764,18 +2623,7 @@ FSTProcessor::SAO(InputFile& input, UFILE *output) { if(!isAlphabetic(val) && sf.empty()) { - if(u_isspace(val)) - { - printSpace(val, output); - } - else - { - if(isEscaped(val)) - { - u_fputc('\\', output); - } - u_fputc(val, output); - } + writeChar(val, output, false); } else if(last_incond) { diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index 422ddfbf..3e6ec67b 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -19,12 +19,13 @@ #define _FSTPROCESSOR_ #include +#include #include -#include #include #include #include -#include +#include +#include #include #include @@ -57,7 +58,7 @@ class FSTProcessor /** * Transducers in FSTP */ - std::map transducers; + std::map transducers; /** * Current state of lexical analysis @@ -77,27 +78,27 @@ class FSTProcessor /** * The final states of inconditional sections in the dictionaries */ - std::map inconditional; + std::set inconditional; /** * The final states of standard sections in the dictionaries */ - std::map standard; + std::set standard; /** * The final states of postblank sections in the dictionaries */ - std::map postblank; + std::set postblank; /** * The final states of preblank sections in the dictionaries */ - std::map preblank; + std::set preblank; /** * Merge of 'inconditional', 'standard', 'postblank' and 'preblank' sets */ - std::map all_finals; + std::set all_finals; /** * Queue of blanks, used in reading methods @@ -136,10 +137,15 @@ class FSTProcessor */ int rcx_current_char; + /** + * String manager + */ + StringWriter str_write; + /** * Alphabet */ - Alphabet alphabet; + AlphabetExe alphabet; /** * Input buffer @@ -242,6 +248,10 @@ class FSTProcessor */ int maxWeightClasses = INT_MAX; + bool mmapping = false; + void* mmap_pointer = nullptr; + int mmap_len = 0; + /** * Prints an error of input stream and exits */ @@ -412,6 +422,8 @@ class FSTProcessor */ void printChar(const UChar32 val, UFILE* output); + void writeChar(const UChar32 val, UFILE* output, bool single_blank); + void skipUntil(InputFile& input, UFILE *output, UChar32 const character); static UString removeTags(UString const &str); UString compoundAnalysis(UString str); @@ -458,6 +470,7 @@ class FSTProcessor static UString const WBLANK_FINAL; FSTProcessor(); + ~FSTProcessor(); void initAnalysis(); void initTMAnalysis(); diff --git a/lttoolbox/lt_print.cc b/lttoolbox/lt_print.cc index f83288d8..36ab2459 100644 --- a/lttoolbox/lt_print.cc +++ b/lttoolbox/lt_print.cc @@ -16,6 +16,8 @@ */ #include #include +#include +#include #include #include diff --git a/lttoolbox/match_state.cc b/lttoolbox/match_state.cc index 097af0d6..e775a2c8 100644 --- a/lttoolbox/match_state.cc +++ b/lttoolbox/match_state.cc @@ -15,7 +15,6 @@ * along with this program; if not, see . */ #include -#include #include #include diff --git a/lttoolbox/match_state2.cc b/lttoolbox/match_state2.cc new file mode 100644 index 00000000..18ba8781 --- /dev/null +++ b/lttoolbox/match_state2.cc @@ -0,0 +1,167 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include + +#include + +MatchState2::MatchState2(TransducerExe* t) : + trans(t) +{ + buffer[0] = trans->initial; + last = 1; +} + +MatchState2::~MatchState2() +{} + +void +MatchState2::copy(const MatchState2& o) +{ + trans = o.trans; + first = o.first; + last = o.last; + for (uint16_t i = first; i != last; i = (i + 1) % BUF_LIMIT) { + buffer[i] = o.buffer[i]; + } +} + +MatchState2::MatchState2(const MatchState2& o) +{ + copy(o); +} + +MatchState2& +MatchState2::operator=(const MatchState2& o) +{ + if (this != &o) { + copy(o); + } + return *this; +} + +uint16_t +MatchState2::size() const +{ + return (last + BUF_LIMIT - first) % BUF_LIMIT; +} + +bool +MatchState2::empty() const +{ + return last == first; +} + +void +MatchState2::applySymbol(const uint64_t state, const int32_t symbol) +{ + uint64_t start = 0; + uint64_t end = 0; + trans->get_range(state, symbol, start, end); + for (uint64_t i = start; i < end; i++) { + buffer[last] = trans->transitions[i].dest; + last = (last + 1) % BUF_LIMIT; + } +} + +void +MatchState2::step(const int32_t input) +{ + uint16_t temp_last = last; + for (uint16_t i = first; i != temp_last; i = (i+1)%BUF_LIMIT) { + applySymbol(buffer[i], input); + } + first = temp_last; +} + +void +MatchState2::step(const int32_t input, const int32_t alt) +{ + uint16_t temp_last = last; + for (uint16_t i = first; i != temp_last; i = (i+1)%BUF_LIMIT) { + applySymbol(buffer[i], input); + applySymbol(buffer[i], alt); + } + first = temp_last; +} + +void +MatchState2::step(const int32_t input, const int32_t alt1, int32_t alt2) +{ + uint16_t temp_last = last; + for (uint16_t i = first; i != temp_last; i = (i+1)%BUF_LIMIT) { + applySymbol(buffer[i], input); + applySymbol(buffer[i], alt1); + applySymbol(buffer[i], alt2); + } + first = temp_last; +} + +void +MatchState2::step(UString_view input, const AlphabetExe& alpha, bool foldcase) +{ + int32_t any_char = alpha(""_u); + int32_t any_tag = alpha(""_u); + for (auto it = symbol_iter(input, &alpha); it != it.end(); it++) { + if (it.string()[0] == '<') { + if (*it) { + step(*it, any_tag); + } else { + step(any_tag); + } + } else { + if (foldcase && u_isupper(*it)) { + step(*it, u_tolower(*it), any_char); + } else { + step(*it, any_char); + } + } + } +} + +int +MatchState2::classifyFinals(const std::map& finals, + const std::set& banned_rules) const +{ + int ret = INT_MAX; + for (uint16_t i = first; i != last; i = (i+1)%BUF_LIMIT) { + auto it = finals.find(buffer[i]); + if (it != finals.end()) { + if (it->second < ret && + banned_rules.find(it->second) == banned_rules.end()) { + ret = it->second; + } + } + } + return (ret < INT_MAX) ? ret : -1; +} + +int +MatchState2::classifyFinals(const std::map& finals) const +{ + std::set empty; + return classifyFinals(finals, empty); +} + +void +MatchState2::clear() +{ + first = 0; + last = 1; + buffer[0] = trans->initial; +} diff --git a/lttoolbox/match_state2.h b/lttoolbox/match_state2.h new file mode 100644 index 00000000..4b0ca285 --- /dev/null +++ b/lttoolbox/match_state2.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _LT_MATCH_STATE_ +#define _LT_MATCH_STATE_ + +#include +#include +#include + +// rename upon deleting old MatchState +class MatchState2 +{ +private: + static int const BUF_LIMIT = 1024; + TransducerExe* trans; + uint64_t buffer[1024]; + uint16_t first = 0; + uint16_t last = 0; + + void copy(const MatchState2& o); + void applySymbol(const uint64_t state, const int32_t symbol); +public: + MatchState2(TransducerExe* t); + ~MatchState2(); + MatchState2(const MatchState2& o); + MatchState2& operator=(const MatchState2& o); + + uint16_t size() const; + bool empty() const; + void step(const int32_t input); + void step(const int32_t input, const int32_t alt); + void step(const int32_t input, const int32_t alt1, const int32_t alt2); + void step(UString_view input, const AlphabetExe& alpha, bool foldcase = true); + int classifyFinals(const std::map& finals, + const std::set& banned_rules) const; + int classifyFinals(const std::map& finals) const; + void clear(); +}; + +#endif diff --git a/lttoolbox/mmap.h b/lttoolbox/mmap.h new file mode 100644 index 00000000..5b0b1dae --- /dev/null +++ b/lttoolbox/mmap.h @@ -0,0 +1,30 @@ +#ifndef _LT_MMAP_ +#define _LT_MMAP_ + +#include +#include +#include +#include +#include +#include + +//DEBUG +//#include +//#include + +inline bool mmap_file(FILE* fd, void*& ptr, int& len) +{ + struct stat sb; + if (fstat(fileno(fd), &sb) == -1) { + return false; + } + len = sb.st_size; + ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fileno(fd), 0); + if (ptr == MAP_FAILED) { + //std::cerr << "mmap failed\nerrno = " << errno << "\n"; + return false; + } + return true; +} + +#endif diff --git a/lttoolbox/old_binary.cc b/lttoolbox/old_binary.cc new file mode 100644 index 00000000..b33a436d --- /dev/null +++ b/lttoolbox/old_binary.cc @@ -0,0 +1,130 @@ +#include +#include +#include + +using namespace OldBinary; + +uint64_t +OldBinary::read_u64(FILE* in) +{ + uint64_t v = 0; + if (fread_unlocked(reinterpret_cast(&v), 1, sizeof(v), in) != sizeof(v)) { + throw std::runtime_error("Failed to read uint64_t"); + } + // these are unconditional byte-swaps, so on little-endian platforms + // this reads big-endian data + // this is very bad, but it's the way all the old data was written, + // so we have this here for backwards compatibility until we drop + // support for lttoolbox/apertium <= 3 + // -DGS 2021-08-21 + return (((v & 0xFF00000000000000) >> 56) | + ((v & 0xFF000000000000) >> 40) | + ((v & 0xFF0000000000) >> 24) | + ((v & 0xFF00000000) >> 8) | + ((v & 0xFF000000) << 8) | + ((v & 0xFF0000) << 24) | + ((v & 0xFF00) << 40) | + ((v & 0xFF) << 56)); +} + +uint64_t read_byte(FILE* in) +{ + unsigned char ret = 0; + if (fread_unlocked(&ret, 1, 1, in) != 1) { + // for some reason things break if this is an error + //throw std::runtime_error("Failed to read byte"); + } + return ret; +} + +uint64_t +OldBinary::read_int(FILE* in, bool compression) +{ + if (compression) { + uint64_t up = read_byte(in); + if (up < 0x40) { + return up; + } else if (up < 0x80) { + return ((up & 0x3f) << 8) | read_byte(in); + } else if (up < 0xc0) { + uint64_t ret = (up & 0x3f) << 8; + ret |= read_byte(in); + return (ret << 8) | read_byte(in); + } else { + uint64_t ret = ((up & 0x3f) << 8) | read_byte(in); + ret = (ret << 8) | read_byte(in); + ret = (ret << 8) | read_byte(in); + return ret; + } + } else { + uint64_t ret = 0; + uint64_t size = read_byte(in); + if (size > 8) { + throw std::runtime_error("can't deserialise int"); + } + uint8_t buffer[8]; + if (fread_unlocked(buffer, 1, size, in) != size) { + throw std::runtime_error("can't deserialise int"); + } + for (uint8_t i = 0; i < size; i++) { + ret += static_cast(buffer[i]) << (8 * (size - i - 1)); + } + return ret; + } +} + +void +OldBinary::read_ustr(FILE* in, UString& s, bool compression) +{ + uint64_t count = read_int(in, compression); + for (uint64_t i = 0; i < count; i++) { + s += static_cast(read_int(in, compression)); + } +} + +void +OldBinary::read_str(FILE* in, std::string& s, bool compression) +{ + uint64_t count = read_int(in, compression); + for (uint64_t i = 0; i < count; i++) { + s += static_cast(read_int(in, compression)); + } +} + +double +OldBinary::read_double(FILE* in, bool compression, bool endian_util) +{ + if (compression) { + if (endian_util) { + double retval; +#ifdef WORDS_BIGENDIAN + fread_unlocked(&retval, sizeof(double), 1, input); +#else + char *s = reinterpret_cast(&retval); + + for(int i = sizeof(double)-1; i != -1; i--) { + if(fread_unlocked(&(s[i]), 1, 1, in)==0) { + return 0; + } + } +#endif + return retval; + } else { + uint64_t mantissa = read_int(in, true); + if (mantissa >= 0x04000000) { + mantissa = ((mantissa & 0x03ffffff) << 26) | read_int(in, true); + } + + uint64_t exponent = read_int(in, true); + if (exponent >= 0x04000000) { + exponent = ((exponent & 0x03ffffff) << 26) | read_int(in, true); + } + + double v = static_cast(static_cast(mantissa)) / 0x40000000; + return ldexp(v, static_cast(exponent)); + } + } else { + uint64_t d = read_int(in, false); + return *reinterpret_cast(&d); + } +} diff --git a/lttoolbox/old_binary.h b/lttoolbox/old_binary.h new file mode 100644 index 00000000..b7986911 --- /dev/null +++ b/lttoolbox/old_binary.h @@ -0,0 +1,16 @@ +#ifndef _LT_OLD_BINARY_ +#define _LT_OLD_BINARY_ + +#include +#include +#include + +namespace OldBinary { + uint64_t read_u64(FILE* in); + uint64_t read_int(FILE* in, bool compression=true); + void read_ustr(FILE* in, UString& s, bool compression=true); + void read_str(FILE* in, std::string& s, bool compression=true); + double read_double(FILE* in, bool compression=true, bool endian_util=false); +}; + +#endif diff --git a/lttoolbox/state.cc b/lttoolbox/state.cc index 34205f38..2246f46c 100644 --- a/lttoolbox/state.cc +++ b/lttoolbox/state.cc @@ -17,7 +17,6 @@ #include #include -#include #include //debug// @@ -25,8 +24,7 @@ //debug// State::State() -{ -} +{} State::~State() { @@ -41,21 +39,15 @@ State::State(State const &s) State & State::operator =(State const &s) { - if(this != &s) - { - destroy(); - copy(s); - } - + copy(s); return *this; } void State::destroy() { - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - delete state[i].sequence; + for (auto& it : state) { + delete it.sequence; } state.clear(); @@ -64,19 +56,17 @@ State::destroy() void State::copy(State const &s) { - // release references - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - delete state[i].sequence; + if (this == &s) { + return; } + destroy(); state = s.state; - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - std::vector> *tmp = new std::vector>(); - *tmp = *(state[i].sequence); - state[i].sequence = tmp; + for (auto& it : state) { + TPath* tmp = new TPath(); + *tmp = *(it.sequence); + it.sequence = tmp; } } @@ -87,63 +77,62 @@ State::size() const } void -State::init(Node *initial) +State::init(const std::set& exes) { - state.clear(); - state.push_back(TNodeState(initial, new std::vector>(), false)); - state[0].sequence->clear(); + destroy(); + for (auto& it : exes) { + state.push_back(TNodeState(it, it->initial, new TPath(), false)); + } epsilonClosure(); } bool -State::apply_into(std::vector* new_state, int const input, int index, bool dirty) -{ - std::map::const_iterator it; - it = state[index].where->transitions.find(input); - if(it != state[index].where->transitions.end()) - { - for(int j = 0; j != it->second.size; j++) - { - std::vector> *new_v = new std::vector>(); - *new_v = *(state[index].sequence); - if(it->first != 0) - { - new_v->push_back(std::make_pair(it->second.out_tag[j], it->second.out_weight[j])); - } - new_state->push_back(TNodeState(it->second.dest[j], new_v, state[index].dirty||dirty)); +State::apply_into(std::vector* new_state, const int32_t input, + int index, bool dirty) +{ + uint64_t start, end; + bool any = false; + TransducerExe* trans = state[index].where; + trans->get_range(state[index].state, input, start, end); + for (uint64_t i = start; i < end; i++) { + TPath* new_v = new TPath(); + *new_v = *(state[index].sequence); + if (input != 0) { + new_v->push_back(std::make_pair(trans->transitions[i].osym, + trans->transitions[i].weight)); } - return true; + new_state->push_back(TNodeState(trans, trans->transitions[i].dest, new_v, + state[index].dirty || dirty)); + any = true; } - return false; + return any; } bool -State::apply_into_override(std::vector* new_state, int const input, int const old_sym, int const new_sym, int index, bool dirty) -{ - std::map::const_iterator it; - it = state[index].where->transitions.find(input); - if(it != state[index].where->transitions.end()) - { - for(int j = 0; j != it->second.size; j++) - { - std::vector> *new_v = new std::vector>(); - *new_v = *(state[index].sequence); - if(it->first != 0) - { - if(it->second.out_tag[j] == old_sym) - { - new_v->push_back(std::make_pair(new_sym, it->second.out_weight[j])); - } - else - { - new_v->push_back(std::make_pair(it->second.out_tag[j], it->second.out_weight[j])); - } +State::apply_into_override(std::vector* new_state, + const int32_t input, + const int32_t old_sym, const int32_t new_sym, + int index, bool dirty) +{ + uint64_t start, end; + bool any = false; + TransducerExe* trans = state[index].where; + trans->get_range(state[index].state, input, start, end); + for (uint64_t i = start; i < end; i++) { + TPath* new_v = new TPath(); + *new_v = *(state[index].sequence); + if (input != 0) { + int32_t s = trans->transitions[i].osym; + if (s == old_sym) { + s = new_sym; } - new_state->push_back(TNodeState(it->second.dest[j], new_v, state[index].dirty||dirty)); + new_v->push_back(std::make_pair(s, trans->transitions[i].weight)); } - return true; + new_state->push_back(TNodeState(trans, trans->transitions[i].dest, new_v, + state[index].dirty || dirty)); + any = true; } - return false; + return any; } void @@ -265,19 +254,18 @@ State::epsilonClosure() { for(size_t i = 0; i != state.size(); i++) { - auto it2 = state[i].where->transitions.find(0); - if(it2 != state[i].where->transitions.end()) - { - for(int j = 0 ; j != it2->second.size; j++) - { - std::vector> *tmp = new std::vector>(); - *tmp = *(state[i].sequence); - if(it2->second.out_tag[j] != 0) - { - tmp->push_back(std::make_pair(it2->second.out_tag[j], it2->second.out_weight[j])); - } - state.push_back(TNodeState(it2->second.dest[j], tmp, state[i].dirty)); + TransducerExe* trans = state[i].where; + uint64_t start, end; + trans->get_range(state[i].state, 0, start, end); + for (uint64_t j = start; j < end; j++) { + TPath* tmp = new TPath(); + *tmp = *(state[i].sequence); + if (trans->transitions[j].osym != 0) { + tmp->push_back(std::make_pair(trans->transitions[j].osym, + trans->transitions[j].weight)); } + state.push_back(TNodeState(trans, trans->transitions[j].dest, tmp, + state[i].dirty)); } } } @@ -432,12 +420,10 @@ State::step_case_override(UChar32 val, bool caseSensitive) bool -State::isFinal(std::map const &finals) const +State::isFinal(const std::set& finals) const { - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - if(finals.find(state[i].where) != finals.end()) - { + for (auto& it : state) { + if(finals.find(it.where) != finals.end() && it.where->is_final(it.state)) { return true; } } @@ -472,8 +458,8 @@ State::NFinals(std::vector> lf, int maxAnalyses, int UString -State::filterFinals(std::map const &finals, - Alphabet const &alphabet, +State::filterFinals(const std::set& finals, + AlphabetExe const &alphabet, std::set const &escaped_chars, bool display_weights, int max_analyses, int max_weight_classes, bool uppercase, bool firstupper, int firstchar) const @@ -483,23 +469,22 @@ State::filterFinals(std::map const &finals, UString result; double cost = 0.0000; - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - if(finals.find(state[i].where) != finals.end()) + for (auto& st : state) { + if(finals.find(st.where) != finals.end() && st.where->is_final(st.state)) { - if(state[i].dirty) + if(st.dirty) { result.clear(); cost = 0.0000; unsigned int const first_char = result.size() + firstchar; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) + for(size_t j = 0, limit2 = st.sequence->size(); j != limit2; j++) { - if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) + if(escaped_chars.find(((*(st.sequence))[j]).first) != escaped_chars.end()) { result += '\\'; } - alphabet.getSymbol(result, ((*(state[i].sequence))[j]).first, uppercase); - cost += ((*(state[i].sequence))[j]).second; + alphabet.getSymbol(result, ((*(st.sequence))[j]).first, uppercase); + cost += ((*(st.sequence))[j]).second; } if(firstupper) { @@ -518,19 +503,21 @@ State::filterFinals(std::map const &finals, { result.clear(); cost = 0.0000; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) + for(size_t j = 0, limit2 = st.sequence->size(); j != limit2; j++) { - if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) + if(escaped_chars.find(((*(st.sequence))[j]).first) != escaped_chars.end()) { result += '\\'; } - alphabet.getSymbol(result, ((*(state[i].sequence))[j]).first); - cost += ((*(state[i].sequence))[j]).second; + alphabet.getSymbol(result, ((*(st.sequence))[j]).first); + cost += ((*(st.sequence))[j]).second; } } // Add the weight of the final state - cost += (*(finals.find(state[i].where))).second; + double temp; + st.where->find_final(st.state, temp); + cost += temp; response.push_back(make_pair(result, cost)); } } @@ -561,8 +548,8 @@ State::filterFinals(std::map const &finals, std::set > > -State::filterFinalsLRX(std::map const &finals, - Alphabet const &alphabet, +State::filterFinalsLRX(const std::set& finals, + AlphabetExe const &alphabet, std::set const &escaped_chars, bool uppercase, bool firstupper, int firstchar) const { @@ -571,21 +558,20 @@ State::filterFinalsLRX(std::map const &finals, std::vector current_result; UString rule_id; - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - if(finals.find(state[i].where) != finals.end()) + for (auto& st : state) { + if(finals.find(st.where) != finals.end() && st.where->is_final(st.state)) { current_result.clear(); rule_id.clear(); UString current_word; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) + for(size_t j = 0, limit2 = st.sequence->size(); j != limit2; j++) { - if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) + if(escaped_chars.find(((*(st.sequence))[j]).first) != escaped_chars.end()) { current_word += '\\'; } UString sym; - alphabet.getSymbol(sym, ((*(state[i].sequence))[j]).first, uppercase); + alphabet.getSymbol(sym, ((*(st.sequence))[j]).first, uppercase); if(sym == "<$>"_u) { if(!current_word.empty()) @@ -609,37 +595,36 @@ State::filterFinalsLRX(std::map const &finals, UString -State::filterFinalsSAO(std::map const &finals, - Alphabet const &alphabet, +State::filterFinalsSAO(const std::set& finals, + AlphabetExe const &alphabet, std::set const &escaped_chars, bool uppercase, bool firstupper, int firstchar) const { UString result; UString annot; - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - if(finals.find(state[i].where) != finals.end()) + for (auto& st : state) { + if(finals.find(st.where) != finals.end() && st.where->is_final(st.state)) { result += '/'; unsigned int const first_char = result.size() + firstchar; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) + for(size_t j = 0, limit2 = st.sequence->size(); j != limit2; j++) { - if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) + if(escaped_chars.find(((*(st.sequence))[j]).first) != escaped_chars.end()) { result += '\\'; } - if(alphabet.isTag(((*(state[i].sequence))[j]).first)) + if(alphabet.isTag(((*(st.sequence))[j]).first)) { annot.clear(); - alphabet.getSymbol(annot, ((*(state[i].sequence))[j]).first); + alphabet.getSymbol(annot, ((*(st.sequence))[j]).first); result += '&'; result += annot.substr(1,annot.length()-2); result += ';'; } else { - alphabet.getSymbol(result, ((*(state[i].sequence))[j]).first, uppercase); + alphabet.getSymbol(result, ((*(st.sequence))[j]).first, uppercase); } } if(firstupper) @@ -661,25 +646,25 @@ State::filterFinalsSAO(std::map const &finals, } UString -State::filterFinalsTM(std::map const &finals, - Alphabet const &alphabet, +State::filterFinalsTM(const std::set& finals, + AlphabetExe const &alphabet, std::set const &escaped_chars, - std::queue &blankqueue, std::vector &numbers) const + std::queue &blankqueue, + std::vector &numbers) const { UString result; - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - if(finals.find(state[i].where) != finals.end()) + for (auto& st : state) { + if(finals.find(st.where) != finals.end() && st.where->is_final(st.state)) { result += '/'; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) + for(size_t j = 0, limit2 = st.sequence->size(); j != limit2; j++) { - if(escaped_chars.find((*(state[i].sequence))[j].first) != escaped_chars.end()) + if(escaped_chars.find((*(st.sequence))[j].first) != escaped_chars.end()) { result += '\\'; } - alphabet.getSymbol(result, (*(state[i].sequence))[j].first); + alphabet.getSymbol(result, (*(st.sequence))[j].first); } } } @@ -777,12 +762,11 @@ State::pruneCompounds(int requiredSymbol, int separationSymbol, int compound_max for(unsigned int i = 0; i> seq = *state.at(i).sequence; - + std::vector> seq = *state[i].sequence; if(lastPartHasRequiredSymbol(seq, requiredSymbol, separationSymbol)) { int this_noOfCompoundElements = 0; - for (int j = seq.size()-2; j>0; j--) if ((seq.at(j)).first==separationSymbol) this_noOfCompoundElements++; + for (int j = seq.size()-2; j>0; j--) if ((seq[j]).first==separationSymbol) this_noOfCompoundElements++; noOfCompoundElements[i] = this_noOfCompoundElements; minNoOfCompoundElements = (minNoOfCompoundElements < this_noOfCompoundElements) ? minNoOfCompoundElements : this_noOfCompoundElements; @@ -884,34 +868,22 @@ State::lastPartHasRequiredSymbol(const std::vector> &seq, void -State::restartFinals(const std::map &finals, int requiredSymbol, State *restart_state, int separationSymbol) +State::restartFinals(const std::set& finals, int requiredSymbol, State *restart_state, int separationSymbol) { - - for(unsigned int i=0; i 0) - { - bool restart = lastPartHasRequiredSymbol(*(state_i.sequence), requiredSymbol, separationSymbol); - if(restart) - { - if(restart_state != NULL) - { - for(unsigned int j=0; jstate.size(); j++) - { - TNodeState initst = restart_state->state.at(j); - std::vector> *tnvec = new std::vector>; - - for(unsigned int k=0; k < state_i.sequence->size(); k++) - { - tnvec->push_back(state_i.sequence->at(k)); - } - TNodeState tn(initst.where, tnvec, state_i.dirty); - tn.sequence->push_back(std::make_pair(separationSymbol, 0.0000)); - state.push_back(tn); + if (finals.find(st.where) != finals.end() && st.where->is_final(st.state)) { + bool restart = lastPartHasRequiredSymbol(*(st.sequence), requiredSymbol, separationSymbol); + if(restart && restart_state != NULL) { + for (auto& initst : restart_state->state) { + TPath* tnvec = new TPath(); + for (auto& it : *(st.sequence)) { + tnvec->push_back(it); } + TNodeState tn(initst.where, initst.state, tnvec, st.dirty); + tn.sequence->push_back(std::make_pair(separationSymbol, 0.0000)); + state.push_back(tn); } } } @@ -921,7 +893,7 @@ State::restartFinals(const std::map &finals, int requiredSymbol, UString -State::getReadableString(const Alphabet &a) +State::getReadableString(const AlphabetExe &a) { UString retval; retval += '['; diff --git a/lttoolbox/state.h b/lttoolbox/state.h index d9b67199..42d1c841 100644 --- a/lttoolbox/state.h +++ b/lttoolbox/state.h @@ -24,15 +24,17 @@ #include #include -#include +#include #include #include #include #include - +#include #include +typedef std::vector> TPath; + /** * Class to represent the current state of transducer processing */ @@ -44,22 +46,24 @@ class State */ struct TNodeState { - Node *where; - std::vector> *sequence; + TransducerExe* where; + uint64_t state; + TPath* sequence; // a state is "dirty" if it was introduced at runtime (case variants, etc.) bool dirty; - TNodeState(Node * const &w, std::vector> * const &s, bool const &d): where(w), sequence(s), dirty(d){} + TNodeState(TransducerExe* w, uint64_t i, TPath* s, bool d) + : where(w), state(i), sequence(s), dirty(d){} TNodeState(const TNodeState& other) - : where(other.where) - , sequence(other.sequence) - , dirty(other.dirty) + : where(other.where), state(other.state), + sequence(other.sequence), dirty(other.dirty) {} TNodeState & operator=(TNodeState const &other) { where = other.where; + state = other.state; sequence = other.sequence; dirty = other.dirty; return *this; @@ -205,7 +209,7 @@ class State * Init the state with the initial node and empty output * @param initial the initial node of the transducer */ - void init(Node *initial); + void init(const std::set& exes); /** * Remove states not containing a specific symbol in their last 'part', and states @@ -261,8 +265,8 @@ class State * @param firstchar first character of the word * @return the result of the transduction */ - UString filterFinals(std::map const &finals, - Alphabet const &a, + UString filterFinals(const std::set& finals, + AlphabetExe const &a, std::set const &escaped_chars, bool display_weights = false, int max_analyses = INT_MAX, @@ -282,8 +286,8 @@ class State * @param firstchar first character of the word * @return the result of the transduction */ - UString filterFinalsSAO(std::map const &finals, - Alphabet const &a, + UString filterFinalsSAO(const std::set& finals, + AlphabetExe const &a, std::set const &escaped_chars, bool uppercase = false, bool firstupper = false, @@ -302,12 +306,13 @@ class State * @return the result of the transduction */ - std::set > > filterFinalsLRX(std::map const &finals, - Alphabet const &a, - std::set const &escaped_chars, - bool uppercase = false, - bool firstupper = false, - int firstchar = 0) const; + std::set > > + filterFinalsLRX(const std::set& finals, + AlphabetExe const &a, + std::set const &escaped_chars, + bool uppercase = false, + bool firstupper = false, + int firstchar = 0) const; @@ -321,8 +326,7 @@ class State * @param restart_state * @param separationSymbol */ - void restartFinals(const std::map &finals, int requiredSymbol, State *restart_state, int separationSymbol); - + void restartFinals(const std::set& finals, int requiredSymbol, State *restart_state, int separationSymbol); /** * Returns true if at least one record of the state references a @@ -330,15 +334,15 @@ class State * @param finals set of final nodes @return * @true if the state is final */ - bool isFinal(std::map const &finals) const; + bool isFinal(const std::set& finals) const; /** * Return the full states string (to allow debuging...) using a Java ArrayList.toString style */ - UString getReadableString(const Alphabet &a); + UString getReadableString(const AlphabetExe &a); - UString filterFinalsTM(std::map const &finals, - Alphabet const &alphabet, + UString filterFinalsTM(const std::set& finals, + AlphabetExe const &alphabet, std::set const &escaped_chars, std::queue &blanks, std::vector &numbers) const; diff --git a/lttoolbox/string_utils.cc b/lttoolbox/string_utils.cc index 59b30a0e..7dbd4828 100644 --- a/lttoolbox/string_utils.cc +++ b/lttoolbox/string_utils.cc @@ -67,6 +67,27 @@ StringUtils::split(const UString& str, const UString& delim) return result; } +std::vector +StringUtils::split_escape(UString_view str, const UChar delim) +{ + std::vector ret; + size_t last = 0; + for (size_t i = 0; i < str.size(); i++) { + if (str[i] == '\\') { + i++; + } else if (str[i] == delim) { + if (i > last) { + ret.push_back(str.substr(last, i-last)); + } + last = i+1; + } + } + if (str.size() > last) { + ret.push_back(str.substr(last)); + } + return ret; +} + UString StringUtils::join(const std::vector& vec, const UString& delim) { diff --git a/lttoolbox/string_utils.h b/lttoolbox/string_utils.h index adf2d29d..979df4e9 100644 --- a/lttoolbox/string_utils.h +++ b/lttoolbox/string_utils.h @@ -11,6 +11,8 @@ class StringUtils { // split string on delimiter static std::vector split(const UString& str, const UString& delim); + // split, but respect \ escapes + static std::vector split_escape(UString_view str, const UChar delim); // inverse of split static UString join(const std::vector& vec, const UString& delim); diff --git a/lttoolbox/string_view.h b/lttoolbox/string_view.h new file mode 100644 index 00000000..f5bf267b --- /dev/null +++ b/lttoolbox/string_view.h @@ -0,0 +1,40 @@ +/* +* Copyright (C) 2021 Apertium +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program. If not, see . +*/ + +#pragma once +#ifndef LT_STRING_VIEW_HPP__ +#define LT_STRING_VIEW_HPP__ + +#ifdef HAVE_STRING_VIEW + #include +#else + #include + #include + + namespace std { + using string_view = ::std::experimental::string_view; + template + using basic_string_view = ::std::experimental::basic_string_view; + + inline ::std::string& operator+=(::std::string& str, ::std::string_view sv) { + str.append(sv.begin(), sv.end()); + return str; + } + } +#endif + +#endif diff --git a/lttoolbox/string_writer.cc b/lttoolbox/string_writer.cc new file mode 100644 index 00000000..0f23be5d --- /dev/null +++ b/lttoolbox/string_writer.cc @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include + +#include +#include + +StringRef +StringWriter::add(UString_view s) +{ + auto start = edit_buffer.find(s); + if (start == UString::npos) { + start = edit_buffer.size(); + edit_buffer += s; + } + StringRef ret; + ret.start = start; + ret.count = s.size(); + return ret; +} + +StringRef +StringWriter::add_new(UString_view s) +{ + StringRef ret; + ret.start = edit_buffer.size(); + ret.count = s.size(); + edit_buffer += s; + return ret; +} + +StringRef +StringWriter::find(UString_view s) const +{ + StringRef ret; + ret.start = 0; + ret.count = 0; + UString_view buf; + if (mmapping) { + buf = UString_view(mmap_buffer, mmap_size); + } else { + buf = UString_view(edit_buffer); + } + auto start = buf.find(s); + if (start != UString_view::npos) { + ret.start = start; + ret.count = s.size(); + } + return ret; +} + +UString_view +StringWriter::get(const uint32_t start, const uint32_t count) +{ + if (mmapping) { + UString_view ret(mmap_buffer, mmap_size); + return ret.substr(start, count); + } else { + UString_view ret(edit_buffer); + return ret.substr(start, count); + } +} + +UString_view +StringWriter::get(const StringRef& ref) +{ + return get(ref.start, ref.count); +} + +void +StringWriter::read(FILE* in) +{ + uint64_t len = read_le_64(in); + edit_buffer.clear(); + edit_buffer.reserve(len); + uint8_t temp[len*2]{}; + if (fread_unlocked(&temp, 1, len*2, in) != len*2) { + throw std::runtime_error("Failed to read strings"); + } + uint16_t c; + for (uint64_t i = 0; i < len*2; i += 2) { + edit_buffer += static_cast(temp[i] | (temp[i+1] << 8)); + } +} + +void +StringWriter::write(FILE* out) +{ + write_le_64(out, edit_buffer.size()); + uint8_t temp[edit_buffer.size()*2]{}; + for (uint64_t i = 0; i < edit_buffer.size(); i++) { + temp[2*i] = edit_buffer[i] & 0xFF; + temp[2*i+1] = (edit_buffer[i] >> 8) & 0xFF; + } + if (fwrite_unlocked(&temp, 1, edit_buffer.size()*2, out) != edit_buffer.size()*2) { + throw std::runtime_error("Failed to write strings"); + } +} + +void* +StringWriter::init(void* ptr) +{ + mmapping = true; + mmap_size = reinterpret_cast(ptr)[0]; + ptr += sizeof(uint64_t); + mmap_buffer = reinterpret_cast(ptr); + get(0, mmap_size); + return ptr + sizeof(UChar)*mmap_size; +} diff --git a/lttoolbox/string_writer.h b/lttoolbox/string_writer.h new file mode 100644 index 00000000..6adcdd47 --- /dev/null +++ b/lttoolbox/string_writer.h @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _LT_STRING_WRITER_ +#define _LT_STRING_WRITER_ + +#include +#include +#include + +struct StringRef { + uint32_t start; + uint32_t count; +}; + +class StringWriter { +private: + bool mmapping = false; + UString edit_buffer; + uint64_t mmap_size; + UChar* mmap_buffer; +public: + StringRef add(UString_view s); + // don't check for duplicates + // faster if you're not going to compare the StringRefs + StringRef add_new(UString_view s); + StringRef find(UString_view s) const; + UString_view get(const uint32_t start, const uint32_t count); + UString_view get(const StringRef& ref); + void read(FILE* in); + void write(FILE* out); + void* init(void* ptr); +}; + +#endif diff --git a/lttoolbox/symbol_iter.cc b/lttoolbox/symbol_iter.cc new file mode 100644 index 00000000..c9e9048b --- /dev/null +++ b/lttoolbox/symbol_iter.cc @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include + +symbol_iter::symbol_iter(UString_view s_, const AlphabetExe* a_) : s(s_), a(a_) +{ + ++(*this); +} + +symbol_iter::symbol_iter(const symbol_iter& other) +{ + i = other.i; + j = other.j; + s = other.s; + a = other.a; + sym = other.sym; +} + +symbol_iter::~symbol_iter() {} + +int32_t +symbol_iter::operator*() const +{ + return sym; +} + +symbol_iter +symbol_iter::operator++(int) +{ + auto temp = *this; + ++(*this); + return temp; +} + +symbol_iter& +symbol_iter::operator++() +{ + if (i == s.size()) { + return *this; + } + i = j; + j++; + if (s[i] == '\\') { + j++; + sym = s[i+1]; + } else if (s[i] == '<') { + while (j < s.size() && s[j] != '>') j++; + j++; + sym = (*a)(s.substr(i, j)); + } else { + sym = s[i]; + } + return *this; +} + +bool +symbol_iter::operator!=(const symbol_iter& o) const +{ + return (i != o.i) || (j != o.j) || (s != o.s) || (a != o.a) || (sym != o.sym); +} + +bool +symbol_iter::operator==(const symbol_iter& o) const +{ + return (i == o.i) && (j == o.j) && (s == o.s) && (a == o.a) && (sym == o.sym); +} + +symbol_iter +symbol_iter::begin() +{ + return symbol_iter(s, a); +} + +symbol_iter +symbol_iter::end() +{ + symbol_iter ret(s, a); + ret.j = s.size(); + ++ret; + return ret; +} + +UString_view +symbol_iter::string() +{ + return s.substr(i,j); +} diff --git a/lttoolbox/symbol_iter.h b/lttoolbox/symbol_iter.h new file mode 100644 index 00000000..eebe6b3b --- /dev/null +++ b/lttoolbox/symbol_iter.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef __SYMBOL_ITER_H__ +#define __SYMBOL_ITER_H__ + +#include +#include + +class symbol_iter +{ +private: + size_t i = 0; + size_t j = 0; + UString_view s; + const AlphabetExe* a; + int32_t sym = 0; +public: + symbol_iter(UString_view s_, const AlphabetExe* a_); + symbol_iter(const symbol_iter& other); + ~symbol_iter(); + int32_t operator*() const; + symbol_iter operator++(int); + symbol_iter &operator++(); + bool operator!=(const symbol_iter& other) const; + bool operator==(const symbol_iter& other) const; + symbol_iter begin(); + symbol_iter end(); + UString_view string(); +}; + +#endif // __SYMBOL_ITER_H__ diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index 3dd0abcd..fd5cb62a 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include @@ -602,26 +604,16 @@ Transducer::read(FILE *input, int const decalage) Transducer new_t; bool read_weights = false; - - fpos_t pos; - if (fgetpos(input, &pos) == 0) { - char header[4]{}; - fread_unlocked(header, 1, 4, input); - if (strncmp(header, HEADER_TRANSDUCER, 4) == 0) { - auto features = read_le(input); - if (features >= TDF_UNKNOWN) { - throw std::runtime_error("Transducer has features that are unknown to this version of lttoolbox - upgrade!"); - } - read_weights = (features & TDF_WEIGHTS); - } - else { - // Old binary format - fsetpos(input, &pos); - } + uint64_t features; + if (readHeader(input, HEADER_TRANSDUCER, features)) { + if (features >= TDF_UNKNOWN) { + throw std::runtime_error("Transducer has features that are unknown to this version of lttoolbox - upgrade!"); + } + read_weights = (features & TDF_WEIGHTS); } - new_t.initial = Compression::multibyte_read(input); - int finals_size = Compression::multibyte_read(input); + new_t.initial = OldBinary::read_int(input, true); + int finals_size = OldBinary::read_int(input, true); int base = 0; double base_weight = default_weight; @@ -629,29 +621,29 @@ Transducer::read(FILE *input, int const decalage) { finals_size--; - base += Compression::multibyte_read(input); + base += OldBinary::read_int(input, true); if(read_weights) { - base_weight = Compression::long_multibyte_read(input); + base_weight = OldBinary::read_double(input, true); } new_t.finals.insert(std::make_pair(base, base_weight)); } - base = Compression::multibyte_read(input); + base = OldBinary::read_int(input, true); int number_of_states = base; int current_state = 0; while(number_of_states > 0) { - int number_of_local_transitions = Compression::multibyte_read(input); + int number_of_local_transitions = OldBinary::read_int(input, true); int tagbase = 0; while(number_of_local_transitions > 0) { number_of_local_transitions--; - tagbase += Compression::multibyte_read(input) - decalage; - int state = (current_state + Compression::multibyte_read(input)) % base; + tagbase += OldBinary::read_int(input, true) - decalage; + int state = (current_state + OldBinary::read_int(input, true)) % base; if(read_weights) { - base_weight = Compression::long_multibyte_read(input); + base_weight = OldBinary::read_double(input, true); } if(new_t.transitions.find(state) == new_t.transitions.end()) { @@ -666,6 +658,121 @@ Transducer::read(FILE *input, int const decalage) *this = new_t; } +void +Transducer::read_mmap(FILE* in, Alphabet& alpha) +{ + uint64_t features; + if (readHeader(in, HEADER_TRANSDUCER, features)) { + if (features >= TDF_UNKNOWN) { + throw std::runtime_error("Transducer has features that are unknown to this version of lttoolbox - upgrade!"); + } + } else { + throw std::runtime_error("Unable to read transducer header!"); + } + + read_le_64(in); // total size + initial = read_le_64(in); + uint64_t state_count = read_le_64(in); + uint64_t final_count = read_le_64(in); + uint64_t trans_count = read_le_64(in); + + if (transitions.size() > state_count) { + transitions.clear(); + // if transitions.size() <= state_count, they'll get cleared + // when we read in the offsets, so don't bother here + } + finals.clear(); + + for (uint64_t i = 0; i < final_count; i++) { + uint64_t s = read_le_64(in); + double w = read_le_double(in); + finals.insert(make_pair(s, w)); + } + + vector offsets; + offsets.reserve(state_count+1); + for (uint64_t i = 0; i < state_count; i++) { + transitions[i].clear(); + offsets.push_back(read_le_64(in)); + } + offsets.push_back(read_le_64(in)); + + uint64_t state = 0; + for (uint64_t i = 0; i < trans_count; i++) { + while (i == offsets[state+1]) { + state++; + } + int32_t isym = read_le_s32(in); + int32_t osym = read_le_s32(in); + int32_t sym = alpha(isym, osym); + uint64_t dest = read_le_64(in); + double wght = read_le_double(in); + transitions[state].insert(make_pair(sym, make_pair(dest, wght))); + } +} + +void +Transducer::write_mmap(FILE* out, const Alphabet& alpha) const +{ + fwrite_unlocked(HEADER_TRANSDUCER, 1, 4, out); + uint64_t features = 0; + features |= TDF_WEIGHTS; + features |= TDF_MMAP; + write_le_64(out, features); + + uint64_t tr_count = 0; + vector offsets; + offsets.reserve(transitions.size()+1); + for (auto& it : transitions) { + offsets.push_back(tr_count); + tr_count += it.second.size(); + } + offsets.push_back(tr_count); + + // TODO: which things should be smaller than u64? + + uint64_t total_size = + ( transitions.size() + 1 + // offset of each state + (tr_count * 3) + // each transition + (finals.size() * 2) + // final states + 4 ); // initial state + length of each section + + write_le_64(out, total_size*8); // number of bytes after this + write_le_64(out, initial); // initial state + write_le_64(out, transitions.size()); // number of states + write_le_64(out, finals.size()); // number of finals + write_le_64(out, tr_count); // number of transitions + + for (auto& it : finals) { + write_le_64(out, it.first); + write_le_double(out, it.second); + } + + for (auto& it : offsets) { + write_le_64(out, it); + } + + for (auto& it : transitions) { + // we want to make sure the transitions are sorted by input symbol + map> symbols; + for (auto& it2 : it.second) { + symbols[alpha.decode(it2.first).first].insert(it2.first); + } + for (auto& s_in : symbols) { + for (auto& s : s_in.second) { + auto range = it.second.equal_range(s); + for (auto tr = range.first; tr != range.second; ++tr) { + auto sym = alpha.decode(tr->first); + write_le_s32(out, sym.first); // input symbol + write_le_s32(out, sym.second); // output symbol + write_le_64(out, tr->second.first); // destination + write_le_double(out, tr->second.second); // weight + } + } + } + } +} + void Transducer::serialise(std::ostream &serialised) const { @@ -682,6 +789,27 @@ Transducer::deserialise(std::istream &serialised) transitions = Deserialiser > > >::deserialise(serialised); } +void +Transducer::read_serialised(FILE* in) +{ + initial = OldBinary::read_int(in, false); + for (uint64_t i = OldBinary::read_int(in, false); i > 0; i--) { + int s = OldBinary::read_int(in, false); + finals.insert(make_pair(s, OldBinary::read_double(in, false))); + } + for (uint64_t i = OldBinary::read_int(in, false); i > 0; i--) { + int src = OldBinary::read_int(in, false); + multimap > st; + for (uint64_t j = OldBinary::read_int(in, false); j > 0; j--) { + int sym = OldBinary::read_int(in, false); + int dest = OldBinary::read_int(in, false); + double w = OldBinary::read_double(in, false); + st.insert(make_pair(sym, make_pair(dest, w))); + } + transitions.insert(make_pair(src, st)); + } +} + void Transducer::copy(Transducer const &t) { diff --git a/lttoolbox/transducer.h b/lttoolbox/transducer.h index 8c1aeded..c7ec51c0 100644 --- a/lttoolbox/transducer.h +++ b/lttoolbox/transducer.h @@ -336,9 +336,14 @@ class Transducer */ void read(FILE *input, int const decalage = 0); + void read_mmap(FILE* input, Alphabet& alpha); + void write_mmap(FILE* output, const Alphabet& alpha) const; + void serialise(std::ostream &serialised) const; void deserialise(std::istream &serialised); + void read_serialised(FILE* in); + /** * Insert another transducer into this, unifying source and targets. * Does not minimize. diff --git a/lttoolbox/transducer_exe.cc b/lttoolbox/transducer_exe.cc new file mode 100644 index 00000000..8a14da17 --- /dev/null +++ b/lttoolbox/transducer_exe.cc @@ -0,0 +1,320 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include + +#include +#include +#include + +// includes needed for reading non-mmap files +#include +#include +#include + +TransducerExe::TransducerExe() : + initial(0), state_count(0), final_count(0), transition_count(0), + finals(nullptr), offsets(nullptr), transitions(nullptr) +{} + +TransducerExe::~TransducerExe() +{ + if (!mmapping) { + delete[] finals; + delete[] offsets; + delete[] transitions; + } +} + +void +TransducerExe::read_compressed(FILE* input, Alphabet& alphabet, bool match) +{ + bool read_weights = false; // only matters for pre-mmap + fpos_t pos; + fgetpos(input, &pos); + char header[4]{}; + fread_unlocked(header, 1, 4, input); + if (strncmp(header, HEADER_TRANSDUCER, 4) == 0) { + auto features = OldBinary::read_u64(input); + if (features >= TDF_UNKNOWN) { + throw std::runtime_error("Transducer has features that are unknown to this version of lttoolbox - upgrade!"); + } + read_weights = (features & TDF_WEIGHTS); + } else { + // no header + fsetpos(input, &pos); + } + + initial = OldBinary::read_int(input, true); + final_count = OldBinary::read_int(input, true); + + uint64_t base_state = 0; + double base_weight = 0.0; + finals = new Final[final_count]; + for (uint64_t i = 0; i < final_count; i++) { + base_state += OldBinary::read_int(input, true); + if (read_weights) { + base_weight += OldBinary::read_double(input, true); + } + finals[i].state = base_state; + finals[i].weight = base_weight; + } + + state_count = OldBinary::read_int(input, true); + offsets = new uint64_t[state_count+1]; + transition_count = 0; + std::vector isyms, osyms; + std::vector dests; + std::vector weights; + for (uint64_t i = 0; i < state_count; i++) { + offsets[i] = transition_count; + std::map>>> temp; + uint64_t count = OldBinary::read_int(input, true); + transition_count += count; + int32_t tag_base = 0; + for (uint64_t t = 0; t < count; t++) { + tag_base += OldBinary::read_int(input, true); + if (match) { + tag_base -= alphabet.size(); + } + uint64_t dest = (i + OldBinary::read_int(input, true)) % state_count; + if (read_weights) { + base_weight = OldBinary::read_double(input, true); + } + if (match) { + temp[tag_base].push_back(std::make_pair(tag_base, + std::make_pair(dest, + base_weight))); + } else { + auto sym = alphabet.decode(tag_base); + temp[sym.first].push_back(std::make_pair(sym.second, + std::make_pair(dest, + base_weight))); + } + } + for (auto& it : temp) { + for (auto& it2 : it.second) { + isyms.push_back(it.first); + osyms.push_back(it2.first); + dests.push_back(it2.second.first); + weights.push_back(it2.second.second); + } + } + } + offsets[state_count] = transition_count; + transitions = new Transition[transition_count]; + for (uint64_t i = 0; i < transition_count; i++) { + transitions[i].isym = isyms[i]; + transitions[i].osym = osyms[i]; + transitions[i].dest = dests[i]; + transitions[i].weight = weights[i]; + } +} + +void +TransducerExe::read_serialised(FILE* input, Alphabet& alphabet, bool match) +{ + initial = OldBinary::read_int(input, false); + final_count = OldBinary::read_int(input, false); + + finals = new Final[final_count]; + for (uint64_t i = 0; i < final_count; i++) { + finals[i].state = OldBinary::read_int(input, false); + finals[i].weight = OldBinary::read_double(input, false); + } + + state_count = OldBinary::read_int(input, false); + offsets = new uint64_t[state_count+1]; + transition_count = 0; + std::vector isyms, osyms; + std::vector dests; + std::vector weights; + for (uint64_t i = 0; i < state_count; i++) { + offsets[i] = transition_count; + std::map>>> temp; + OldBinary::read_int(input, false); // src state, should == i + uint64_t count = OldBinary::read_int(input, false); + transition_count += count; + for (uint64_t t = 0; t < count; t++) { + int32_t tag = OldBinary::read_int(input, false); + uint64_t dest = OldBinary::read_int(input, false); + double weight = OldBinary::read_double(input, false); + if (match) { + temp[tag].push_back(std::make_pair(tag, std::make_pair(dest, weight))); + } else { + auto sym = alphabet.decode(tag); + temp[sym.first].push_back(std::make_pair(sym.second, + std::make_pair(dest, weight))); + } + } + for (auto& it : temp) { + for (auto& it2 : it.second) { + isyms.push_back(it.first); + osyms.push_back(it2.first); + dests.push_back(it2.second.first); + weights.push_back(it2.second.second); + } + } + } + offsets[state_count] = transition_count; + transitions = new Transition[transition_count]; + for (uint64_t i = 0; i < transition_count; i++) { + transitions[i].isym = isyms[i]; + transitions[i].osym = osyms[i]; + transitions[i].dest = dests[i]; + transitions[i].weight = weights[i]; + } +} + +void +TransducerExe::read(FILE* input) +{ + fpos_t pos; + fgetpos(input, &pos); + char header[4]{}; + auto l = fread_unlocked(header, 1, 4, input); + if (l == 4 && strncmp(header, HEADER_TRANSDUCER, 4) == 0) { + auto features = read_le_64(input); + if (features >= TDF_UNKNOWN) { + throw std::runtime_error("Transducer has features that are unknown to this version of lttoolbox - upgrade!"); + } + } else { + throw std::runtime_error("Unable to read transducer header!"); + } + + read_le_64(input); // total size + initial = read_le_64(input); + state_count = read_le_64(input); + final_count = read_le_64(input); + transition_count = read_le_64(input); + + finals = new Final[final_count]; + for (uint64_t i = 0; i < final_count; i++) { + finals[i].state = read_le_64(input); + finals[i].weight = read_le_double(input); + } + + offsets = new uint64_t[state_count+1]; + for (uint64_t i = 0; i < state_count+1; i++) { + offsets[i] = read_le_64(input); + } + + transitions = new Transition[transition_count]; + for (uint64_t i = 0; i < transition_count; i++) { + transitions[i].isym = read_le_s32(input); + transitions[i].osym = read_le_s32(input); + transitions[i].dest = read_le_64(input); + transitions[i].weight = read_le_double(input); + } +} + +void* +TransducerExe::init(void* ptr) +{ + mmapping = true; + + ptr += 4 + sizeof(uint64_t); // skip header + uint64_t* arr = reinterpret_cast(ptr); + uint64_t total_size = arr[0]; + initial = arr[1]; + state_count = arr[2]; + final_count = arr[3]; + transition_count = arr[4]; + ptr += sizeof(uint64_t)*5; + + finals = reinterpret_cast(ptr); + ptr += sizeof(Final)*final_count; + + offsets = reinterpret_cast(ptr); + ptr += sizeof(uint64_t)*(state_count+1); + + transitions = reinterpret_cast(ptr); + ptr += sizeof(Transition)*transition_count; + + return ptr; +} + +void +TransducerExe::get_range(const uint64_t state, const int32_t symbol, + uint64_t& start, uint64_t& end) +{ + uint64_t l = offsets[state]; + uint64_t r = offsets[state+1]; + uint64_t m; + if (l == r) { + start = end = 0; + return; + } + while (l < r) { + m = (l + r) / 2; + if (transitions[m].isym < symbol) { + l = m + 1; + } else { + r = m; + } + } + if (transitions[l].isym != symbol) { + end = start = 0; + return; + } else { + start = l; + } + // there's probably a way to do this with 1 loop + // but I'd have to be very sure of what I was doing to write that loop -DGS + l = start; + r = offsets[state+1]; + while (l < r) { + m = (l + r) / 2; + if (transitions[m].isym > symbol) { + r = m; + } else { + l = m + 1; + } + } + end = l; +} + +bool +TransducerExe::find_final(const uint64_t state, double& weight) +{ + int64_t l = 0; + int64_t r = final_count - 1; + int64_t m; + while (l <= r) { + m = (l + r) / 2; + if (finals[m].state == state) { + weight = finals[m].weight; + return true; + } else if (finals[m].state < state) { + l = m + 1; + } else { + r = m - 1; + } + } + return false; +} + +bool +TransducerExe::is_final(const uint64_t state) +{ + double x; + return find_final(state, x); +} diff --git a/lttoolbox/transducer_exe.h b/lttoolbox/transducer_exe.h new file mode 100644 index 00000000..4fa1d77a --- /dev/null +++ b/lttoolbox/transducer_exe.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _LT_TRANSDUCER_EXE_ +#define _LT_TRANSDUCER_EXE_ + +#include +#include + +// only needed for reading non-mmap files +#include + +struct Transition { + int32_t isym; + int32_t osym; + uint64_t dest; + double weight; +}; + +struct Final { + uint64_t state; + double weight; +}; + +class MatchState2; +class State; + +class TransducerExe { + friend MatchState2; + friend State; +private: + uint64_t initial; + uint64_t state_count; + uint64_t final_count; + uint64_t transition_count; + Final* finals; + uint64_t* offsets; + Transition* transitions; + bool mmapping = false; + + void get_range(const uint64_t state, const int32_t sym, + uint64_t& start, uint64_t& end); + bool find_final(const uint64_t state, double& weight); + bool is_final(const uint64_t state); +public: + TransducerExe(); + ~TransducerExe(); + void read_compressed(FILE* input, Alphabet& alphabet, bool match = false); + void read_serialised(FILE* input, Alphabet& alphabet, bool match = false); + void read(FILE* input); + void* init(void* ptr); +}; + +#endif diff --git a/lttoolbox/ustring.cc b/lttoolbox/ustring.cc index 87056c2c..daac05e2 100644 --- a/lttoolbox/ustring.cc +++ b/lttoolbox/ustring.cc @@ -48,7 +48,7 @@ to_ustring(const uint8_t* s) } void -ustring_to_vec32(const UString& str, std::vector& vec) +ustring_to_vec32(UString_view str, std::vector& vec) { if (str.empty()) { return; diff --git a/lttoolbox/ustring.h b/lttoolbox/ustring.h index 51ec0ed5..a7c60504 100644 --- a/lttoolbox/ustring.h +++ b/lttoolbox/ustring.h @@ -25,8 +25,10 @@ #include #include #include +#include typedef std::basic_string UString; +typedef std::basic_string_view UString_view; void write(const UString& str, UFILE* output); @@ -34,7 +36,7 @@ UString to_ustring(const char* str); UString to_ustring(const uint8_t* str); // append UTF-16 string to UTF-32 vector of symbols -void ustring_to_vec32(const UString& str, std::vector& vec); +void ustring_to_vec32(UString_view str, std::vector& vec); inline std::ostream& operator<<(std::ostream& ostr, char16_t c) @@ -44,7 +46,7 @@ operator<<(std::ostream& ostr, char16_t c) } inline std::ostream& -operator<<(std::ostream& ostr, const UString& str) +operator<<(std::ostream& ostr, UString_view str) { std::string res; utf8::utf16to8(str.begin(), str.end(), std::back_inserter(res)); diff --git a/python/setup.py.in b/python/setup.py.in index 3d3bdad2..c0180b18 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -6,7 +6,7 @@ Setup for SWIG Python bindings for lttoolbox from distutils.core import Extension, setup from sys import platform -compile_args = '@CXXFLAGS@'.split() + '@ICU_CFLAGS@'.split() +compile_args = '@CXXFLAGS@'.split() + '@ICU_CFLAGS@'.split() + ['@have_sv@'] link_args = [] if platform == 'darwin': compile_args += ['-stdlib=libc++', '-mmacosx-version-min=10.7']