From 3ffdbefa697d1e9745f3f588430374a2753f683d Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Mon, 26 Jul 2021 12:46:18 -0500 Subject: [PATCH 01/35] manager for string constants --- lttoolbox/Makefile.am | 4 +-- lttoolbox/compression.h | 6 ++-- lttoolbox/string_writer.cc | 69 ++++++++++++++++++++++++++++++++++++++ lttoolbox/string_writer.h | 36 ++++++++++++++++++++ lttoolbox/ustring.h | 4 ++- 5 files changed, 114 insertions(+), 5 deletions(-) create mode 100644 lttoolbox/string_writer.cc create mode 100644 lttoolbox/string_writer.h diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index 2fd56b04..e943b392 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -2,13 +2,13 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ deserialiser.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ match_exe.h match_node.h match_state.h my_stdio.h node.h \ - pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h \ + pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_writer.h \ transducer.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ ustring.h cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ expander.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \ match_node.cc match_state.cc node.cc pattern_list.cc \ - regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc transducer.cc \ + regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc transducer.cc \ trans_exe.cc xml_parse_util.cc xml_walk_util.cc tmx_compiler.cc ustring.cc library_includedir = $(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME) diff --git a/lttoolbox/compression.h b/lttoolbox/compression.h index 21ca48fd..5783f774 100644 --- a/lttoolbox/compression.h +++ b/lttoolbox/compression.h @@ -29,7 +29,8 @@ using namespace std; // Global lttoolbox features constexpr char 
HEADER_LTTOOLBOX[4]{'L', 'T', 'T', 'B'}; enum LT_FEATURES : uint64_t { - LTF_UNKNOWN = (1ull << 0), // Features >= this are unknown, so throw an error; Inc this if more features are added + LTF_MMAP = (1ull << 0), // using mmap-compatible format rather than compressed format + LTF_UNKNOWN = (1ull << 1), // Features >= this are unknown, so throw an error; Inc this if more features are added LTF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits }; @@ -37,7 +38,8 @@ enum LT_FEATURES : uint64_t { constexpr char HEADER_TRANSDUCER[4]{'L', 'T', 'T', 'D'}; enum TD_FEATURES : uint64_t { TDF_WEIGHTS = (1ull << 0), - TDF_UNKNOWN = (1ull << 1), // Features >= this are unknown, so throw an error; Inc this if more features are added + TDF_MMAP = (1ull << 1), + TDF_UNKNOWN = (1ull << 2), // Features >= this are unknown, so throw an error; Inc this if more features are added TDF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits }; diff --git a/lttoolbox/string_writer.cc b/lttoolbox/string_writer.cc new file mode 100644 index 00000000..292431f7 --- /dev/null +++ b/lttoolbox/string_writer.cc @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include + +#include + +UString_view +StringWriter::add(const UString& s) +{ + auto start = buffer.find(s); + if (start == UString::npos) { + start = buffer.size(); + buffer += s; + } + UString_view ret(buffer); + return ret.substr(start, s.size()); +} + +UString_view +StringWriter::get(const uint32_t start, const uint32_t count) +{ + UString_view ret(buffer); + return ret.substr(start, count); +} + +void +StringWriter::read(FILE* in) +{ + uint64_t len = read_u64_le(in); + buffer.clear(); + buffer.reserve(len); + uint8_t temp[len*2]{}; + if (fread_unlocked(&temp, 1, len*2, in) != len) { + throw std::runtime_error("Failed to read strings"); + } + uint16_t c; + for (uint64_t i = 0; i < len*2; i += 2) { + buffer += static_cast(temp[i] | (temp[i+1] << 8)); + } +} + +void +StringWriter::write(FILE* out) +{ + write_u64_le(out, buffer.size()); + uint8_t temp[buffer.size()*2]{}; + for (uint64_t i = 0; i < buffer.size(); i++) { + temp[2*i] = buffer[i] & 0xFF; + temp[2*i+1] = (buffer[i] >> 8) & 0xFF; + } + if (fwrite_unlocked(&temp, 1, buffer.size()*2, out) != buffer.size()*2) { + throw std::runtime_error("Failed to write strings"); + } +} diff --git a/lttoolbox/string_writer.h b/lttoolbox/string_writer.h new file mode 100644 index 00000000..d7b23346 --- /dev/null +++ b/lttoolbox/string_writer.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#ifndef _LT_STRING_WRITER_ +#define _LT_STRING_WRITER_ + +// TODO: merge compression.h write_u64_le() and friends to here +// when we drop compressed formats +#include +#include +#include + +class StringWriter { +public: + UString buffer; + UString_view add(const UString& s); + UString_view get(const uint32_t start, const uint32_t count); + void read(FILE* in); + void write(FILE* out); +}; + +#endif diff --git a/lttoolbox/ustring.h b/lttoolbox/ustring.h index fa01e316..3642cbe4 100644 --- a/lttoolbox/ustring.h +++ b/lttoolbox/ustring.h @@ -24,8 +24,10 @@ #include #include #include +#include typedef std::basic_string UString; +typedef std::basic_string_view UString_view; void write(const UString& str, UFILE* output); @@ -43,7 +45,7 @@ operator<<(std::ostream& ostr, char16_t c) } inline std::ostream& -operator<<(std::ostream& ostr, const UString& str) +operator<<(std::ostream& ostr, UString_view str) { std::string res; utf8::utf16to8(str.begin(), str.end(), std::back_inserter(res)); From 1be7e67a086b145569012b28bce3b0b811bc0aaf Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Mon, 26 Jul 2021 15:18:10 -0500 Subject: [PATCH 02/35] transducer read/write functions --- lttoolbox/transducer.cc | 95 +++++++++++++++++++++++++++++++++++++++++ lttoolbox/transducer.h | 3 ++ 2 files changed, 98 insertions(+) diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index e9e64547..b2816466 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -656,6 +656,101 @@ Transducer::read(FILE *input, int const decalage) *this = new_t; } +void +Transducer::read_mmap(FILE* in, Alphabet& alpha) +{ + read_le(in); // total size + initial = read_le(in); + uint64_t state_count = read_le(in); + uint64_t final_count = read_le(in); + uint64_t trans_count = read_le(in); + + if (transitions.size() > state_count) { + transitions.clear(); + // if transitions.size() <= state_count, they'll get cleared + // when we read in the offsets, so don't bother here + } + 
finals.clear(); + + for (uint64_t i = 0; i < final_count; i++) { + uint64_t s = read_le(in); + uint64_t w = read_le(in); + finals.insert(make_pair(s, *reinterpret_cast(&w))); + } + + vector offsets; + offsets.reserve(state_count); + for (uint64_t i = 0; i < state_count; i++) { + transitions[i].clear(); + offsets.push_back(read_le(in)); + } + offsets.push_back(0); + + uint64_t state = 0; + for (uint64_t i = 0; i < trans_count; i++) { + if (i == offsets[state+1]) { + state++; + } + uint64_t isym = read_le(in); + uint64_t osym = read_le(in); + int32_t sym = alpha((int32_t)isym, (int32_t)osym); + uint64_t dest = read_le(in); + uint64_t wght = read_le(in); + transitions[state].insert(make_pair(sym, make_pair(dest, wght))); + } +} + +void +Transducer::write_mmap(FILE* out, const Alphabet& alpha) +{ + fwrite_unlocked(HEADER_TRANSDUCER, 1, 4, out); + uint64_t features = 0; + features |= TDF_WEIGHTS; + features |= TDF_MMAP; + write_le(out, features); + + uint64_t tr_count = 0; + vector offsets; + offsets.reserve(transitions.size()); + for (auto& it : transitions) { + offsets.push_back(tr_count); + tr_count += it.second.size(); + } + + // TODO: which things should be smaller than u64? 
+ + uint64_t total_size = + ( transitions.size() + // offset of each state + (tr_count * 4) + // each transition + (finals.size() * 2) + // final states + 4 ); // initial state + length of each section + + write_le(out, total_size*8); // number of bytes after this + write_le(out, initial); // initial state + write_le(out, transitions.size()); // number of states + write_le(out, finals.size()); // number of finals + write_le(out, tr_count); // number of transitions + + for (auto& it : finals) { + write_le(out, it.first); + write_le(out, *reinterpret_cast(&it.second)); + } + + for (auto& it : offsets) { + write_le(out, it); + } + + for (auto& it : transitions) { + for (auto& it2 : it.second) { + auto sym = alpha.decode(it2.first); + write_le(out, sym.first); // input symbol + write_le(out, sym.second); // output symbol + write_le(out, it2.second.first); // destination + write_le(out, *reinterpret_cast(&it2.second.second)); // weight + } + } +} + void Transducer::serialise(std::ostream &serialised) const { diff --git a/lttoolbox/transducer.h b/lttoolbox/transducer.h index 3dd91d42..89b0d9f9 100644 --- a/lttoolbox/transducer.h +++ b/lttoolbox/transducer.h @@ -340,6 +340,9 @@ class Transducer */ void read(FILE *input, int const decalage = 0); + void read_mmap(FILE* input, Alphabet& alpha); + void write_mmap(FILE* output, const Alphabet& alpha); + void serialise(std::ostream &serialised) const; void deserialise(std::istream &serialised); From 0789bb7c9b07c1dbcf0fb937e3f9f96165c86031 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Tue, 27 Jul 2021 09:45:23 -0500 Subject: [PATCH 03/35] string_view for old compilers --- configure.ac | 2 +- lttoolbox/string_view.h | 40 ++++++++++++++++++++++++++++++++++++++++ lttoolbox/ustring.h | 2 +- 3 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 lttoolbox/string_view.h diff --git a/configure.ac b/configure.ac index 05fce75b..96baab2e 100644 --- a/configure.ac +++ b/configure.ac @@ -46,7 +46,7 @@ 
AC_CHECK_LIB(xml2, xmlReaderForFile) # Checks for header files. AC_HEADER_STDC -AC_CHECK_HEADERS([stdlib.h string.h unistd.h stddef.h]) +AC_CHECK_HEADERS([stdlib.h string.h unistd.h stddef.h string_view]) AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])]) # Checks for typedefs, structures, and compiler characteristics. diff --git a/lttoolbox/string_view.h b/lttoolbox/string_view.h new file mode 100644 index 00000000..f5bf267b --- /dev/null +++ b/lttoolbox/string_view.h @@ -0,0 +1,40 @@ +/* +* Copyright (C) 2021 Apertium +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program. If not, see . 
+*/ + +#pragma once +#ifndef LT_STRING_VIEW_HPP__ +#define LT_STRING_VIEW_HPP__ + +#ifdef HAVE_STRING_VIEW + #include +#else + #include + #include + + namespace std { + using string_view = ::std::experimental::string_view; + template + using basic_string_view = ::std::experimental::basic_string_view; + + inline ::std::string& operator+=(::std::string& str, ::std::string_view sv) { + str.append(sv.begin(), sv.end()); + return str; + } + } +#endif + +#endif diff --git a/lttoolbox/ustring.h b/lttoolbox/ustring.h index 3642cbe4..5ffa878f 100644 --- a/lttoolbox/ustring.h +++ b/lttoolbox/ustring.h @@ -24,7 +24,7 @@ #include #include #include -#include +#include typedef std::basic_string UString; typedef std::basic_string_view UString_view; From c7859922658ee9371edd5b809fa297521c47a3fb Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Tue, 27 Jul 2021 10:57:03 -0500 Subject: [PATCH 04/35] work on python bindings --- python/setup.py.in | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/python/setup.py.in b/python/setup.py.in index 14f64ab9..0dbb4ab6 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -3,36 +3,26 @@ """ Setup for SWIG Python bindings for lttoolbox """ -from os import path +from sys import platform from distutils.core import Extension, setup -from distutils.command.build import build +import shlex - -class CustomBuild(build): - sub_commands = [ - ('build_ext', build.has_ext_modules), - ('build_py', build.has_pure_modules), - ('build_clib', build.has_c_libraries), - ('build_scripts', build.has_scripts), - ] - - -def get_sources(): - sources = ['lttoolbox.i'] - cc_sources = ['alphabet.cc', 'compression.cc', 'fst_processor.cc', 'lt_locale.cc', - 'node.cc', 'state.cc', 'trans_exe.cc', 'xml_parse_util.cc'] - rel_path = '@top_srcdir@/lttoolbox/' - sources.extend(path.join(rel_path, f) for f in cc_sources) - return sources +compile_args = ['-std=c++0x'] + shlex.split('@DEFS@') + 
'@LIBXML_CFLAGS@'.split() +link_args = ['-lxml2'] +if platform == 'darwin': + compile_args += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] + link_args.append('-mmacosx-version-min=10.7') lttoolbox_module = Extension( name='_lttoolbox', - sources=get_sources(), + language = 'c++', + sources=['lttoolbox.i'], swig_opts = ["-c++", "-I@top_srcdir@", "-Wall"], - include_dirs=['@top_srcdir@', '/usr/include/libxml2'], - library_dirs=['/usr/include/libxml2'], - extra_compile_args='@CXXFLAGS@'.split(), - extra_link_args=['-lxml2'], + include_dirs=['@top_srcdir@', '@top_srcdir@/lttoolbox'], + library_dirs=['@top_srcdir@/lttoolbox/.libs'], + libraries = ['lttoolbox'], + extra_compile_args = compile_args, + extra_link_args = link_args ) setup( @@ -44,7 +34,8 @@ setup( author_email='@PACKAGE_BUGREPORT@', license='GPL-3.0+', maintainer_email='@PACKAGE_BUGREPORT@', - cmdclass={'build': CustomBuild}, ext_modules=[lttoolbox_module], py_modules=['lttoolbox'], +# packages = ['lttoolbox'], + data_files = [] ) From 5e758ee5fe977bb873cec60b1644add23e9d40e3 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Tue, 27 Jul 2021 12:33:42 -0500 Subject: [PATCH 05/35] get it working --- python/setup.py.in | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/setup.py.in b/python/setup.py.in index 0dbb4ab6..d6d76580 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -7,7 +7,7 @@ from sys import platform from distutils.core import Extension, setup import shlex -compile_args = ['-std=c++0x'] + shlex.split('@DEFS@') + '@LIBXML_CFLAGS@'.split() +compile_args = '@CXXFLAGS@'.split() + shlex.split('@DEFS@') + '@LIBXML_CFLAGS@'.split() link_args = ['-lxml2'] if platform == 'darwin': compile_args += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] @@ -20,7 +20,7 @@ lttoolbox_module = Extension( swig_opts = ["-c++", "-I@top_srcdir@", "-Wall"], include_dirs=['@top_srcdir@', '@top_srcdir@/lttoolbox'], library_dirs=['@top_srcdir@/lttoolbox/.libs'], - libraries = 
['lttoolbox'], + libraries = ['lttoolbox@VERSION_MAJOR@'], extra_compile_args = compile_args, extra_link_args = link_args ) @@ -36,6 +36,5 @@ setup( maintainer_email='@PACKAGE_BUGREPORT@', ext_modules=[lttoolbox_module], py_modules=['lttoolbox'], -# packages = ['lttoolbox'], data_files = [] ) From fe8b858f7535ce080783d80756e10ec71c847c3d Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Tue, 27 Jul 2021 12:50:04 -0500 Subject: [PATCH 06/35] remove unneeded args --- python/setup.py.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py.in b/python/setup.py.in index d6d76580..2303a7a7 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -8,7 +8,7 @@ from distutils.core import Extension, setup import shlex compile_args = '@CXXFLAGS@'.split() + shlex.split('@DEFS@') + '@LIBXML_CFLAGS@'.split() -link_args = ['-lxml2'] +link_args = [] if platform == 'darwin': compile_args += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] link_args.append('-mmacosx-version-min=10.7') From 836c97ab26fc7ed39960839840d32b36df1aa78b Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Tue, 27 Jul 2021 13:22:46 -0500 Subject: [PATCH 07/35] link to ICU so that it actually runs --- python/setup.py.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/setup.py.in b/python/setup.py.in index 2303a7a7..b2f64e70 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -7,8 +7,8 @@ from sys import platform from distutils.core import Extension, setup import shlex -compile_args = '@CXXFLAGS@'.split() + shlex.split('@DEFS@') + '@LIBXML_CFLAGS@'.split() -link_args = [] +compile_args = '@CXXFLAGS@'.split() + shlex.split('@DEFS@') + '@LIBXML_CFLAGS@'.split() + '@ICU_CFLAGS@'.split() +link_args = '@LIBS@'.split() + '@ICU_LIBS@'.split() if platform == 'darwin': compile_args += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] link_args.append('-mmacosx-version-min=10.7') From 161206455661b697b239d68d72eac48e18e682d5 Mon Sep 17 00:00:00 2001 
From: Daniel Swanson Date: Thu, 29 Jul 2021 14:00:59 -0500 Subject: [PATCH 08/35] transducer mmap sort by input symbol --- lttoolbox/Makefile.am | 2 +- lttoolbox/transducer.cc | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index e943b392..b2875a75 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -2,7 +2,7 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ deserialiser.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ match_exe.h match_node.h match_state.h my_stdio.h node.h \ - pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_writer.h \ + pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h \ transducer.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ ustring.h cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index b2816466..73913187 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -741,12 +741,23 @@ Transducer::write_mmap(FILE* out, const Alphabet& alpha) } for (auto& it : transitions) { + // we want to make sure the transitions are sorted by input symbol + map> symbols; for (auto& it2 : it.second) { - auto sym = alpha.decode(it2.first); - write_le(out, sym.first); // input symbol - write_le(out, sym.second); // output symbol - write_le(out, it2.second.first); // destination - write_le(out, *reinterpret_cast(&it2.second.second)); // weight + symbols[alpha.decode(it2.first).first].insert(it2.first); + } + for (auto& s_in : symbols) { + for (auto& s : s_in.second) { + auto range = it.second.equal_range(s); + for (auto tr = range.first; tr != range.second; ++tr) { + auto sym = alpha.decode(tr->first); + write_le(out, sym.first); // input symbol + write_le(out, 
sym.second); // output symbol + write_le(out, tr->second.first); // destination + uint64_t w = *reinterpret_cast(&tr->second.second); + write_le(out, w); // weight + } + } } } } From 644f9b1f86ea0c3f0694d6d77a369422718ad306 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Thu, 29 Jul 2021 19:19:35 -0500 Subject: [PATCH 09/35] class for executing transducers --- lttoolbox/Makefile.am | 4 +- lttoolbox/compression.h | 10 +++ lttoolbox/transducer.cc | 9 +-- lttoolbox/transducer_exe.cc | 143 ++++++++++++++++++++++++++++++++++++ lttoolbox/transducer_exe.h | 49 ++++++++++++ 5 files changed, 208 insertions(+), 7 deletions(-) create mode 100644 lttoolbox/transducer_exe.cc create mode 100644 lttoolbox/transducer_exe.h diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index b2875a75..fa5f3f4f 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -3,12 +3,12 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ deserialiser.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ match_exe.h match_node.h match_state.h my_stdio.h node.h \ pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h \ - transducer.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ + transducer.h transducer_exe.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ ustring.h cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ expander.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \ match_node.cc match_state.cc node.cc pattern_list.cc \ - regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc transducer.cc \ + regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc transducer.cc transducer_exe.cc \ trans_exe.cc xml_parse_util.cc xml_walk_util.cc tmx_compiler.cc ustring.cc library_includedir = 
$(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME) diff --git a/lttoolbox/compression.h b/lttoolbox/compression.h index 5783f774..339b9493 100644 --- a/lttoolbox/compression.h +++ b/lttoolbox/compression.h @@ -77,6 +77,11 @@ inline auto write_le(Stream& out, uint64_t value) { return write_u64_le(out, value); } +template +inline auto write_double_le(Stream& out, double value) { + return write_u64_le(out, *reinterpret_cast(&value)); +} + inline auto read_u64(FILE *in) { uint64_t value = 0; @@ -123,6 +128,11 @@ inline auto read_le(std::istream& in) { return read_le(in, Value{}); } +inline double read_double_le(FILE* in) { + uint64_t val = read_le(in); + return *reinterpret_cast(&val); +} + /** * Clase "Compression". * Class methods to access compressed data by the byte-aligned method diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index 73913187..eecce470 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -674,8 +674,8 @@ Transducer::read_mmap(FILE* in, Alphabet& alpha) for (uint64_t i = 0; i < final_count; i++) { uint64_t s = read_le(in); - uint64_t w = read_le(in); - finals.insert(make_pair(s, *reinterpret_cast(&w))); + double w = read_double_le(in); + finals.insert(make_pair(s, w)); } vector offsets; @@ -695,7 +695,7 @@ Transducer::read_mmap(FILE* in, Alphabet& alpha) uint64_t osym = read_le(in); int32_t sym = alpha((int32_t)isym, (int32_t)osym); uint64_t dest = read_le(in); - uint64_t wght = read_le(in); + double wght = read_double_le(in); transitions[state].insert(make_pair(sym, make_pair(dest, wght))); } } @@ -754,8 +754,7 @@ Transducer::write_mmap(FILE* out, const Alphabet& alpha) write_le(out, sym.first); // input symbol write_le(out, sym.second); // output symbol write_le(out, tr->second.first); // destination - uint64_t w = *reinterpret_cast(&tr->second.second); - write_le(out, w); // weight + write_double_le(out, tr->second.second); // weight } } } diff --git a/lttoolbox/transducer_exe.cc 
b/lttoolbox/transducer_exe.cc new file mode 100644 index 00000000..47f08d64 --- /dev/null +++ b/lttoolbox/transducer_exe.cc @@ -0,0 +1,143 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include + +#include + +// includes needed for reading non-mmap files +#include +#include +#include + +TransducerExe::TransducerExe() : + initial(0), state_count(0), final_count(0), transition_count(0), + finals(nullptr), offsets(nullptr), transitions(nullptr) +{} + +TransducerExe::~TransducerExe() +{ + delete[] finals; + delete[] offsets; + delete[] transitions; +} + +void +TransducerExe::read(FILE* input, Alphabet& alphabet) +{ + bool read_weights = false; // only matters for pre-mmap + bool mmap = false; + fpos_t pos; + fgetpos(input, &pos); + char header[4]{}; + fread_unlocked(header, 1, 4, input); + if (strncmp(header, HEADER_TRANSDUCER, 4) == 0) { + auto features = read_le(input); + if (features >= TDF_UNKNOWN) { + throw std::runtime_error("Transducer has features that are unknown to this version of lttoolbox - upgrade!"); + } + read_weights = (features & TDF_WEIGHTS); + mmap = (features & TDF_MMAP); + } else { + // no header + fsetpos(input, &pos); + } + + if (mmap) { + read_le(input); // total size + initial = read_le(input); + state_count = read_le(input); + final_count = read_le(input); + transition_count = read_le(input); + + finals = new 
Final[final_count]; + for (uint64_t i = 0; i < final_count; i++) { + finals[i].state = read_le(input); + finals[i].weight = read_double_le(input); + } + + offsets = new uint64_t[state_count+1]; + for (uint64_t i = 0; i < state_count; i++) { + offsets[i] = read_le(input); + } + offsets[state_count] = transition_count; + + transitions = new Transition[transition_count]; + for (uint64_t i = 0; i < transition_count; i++) { + transitions[i].isym = read_le(input); + transitions[i].osym = read_le(input); + transitions[i].dest = read_le(input); + transitions[i].weight = read_double_le(input); + } + } else { + initial = Compression::multibyte_read(input); + final_count = Compression::multibyte_read(input); + + uint64_t base_state = 0; + double base_weight = 0.0; + finals = new Final[final_count]; + for (uint64_t i = 0; i < final_count; i++) { + base_state += Compression::multibyte_read(input); + if (read_weights) { + base_weight += Compression::long_multibyte_read(input); + } + finals[i].state = base_state; + finals[i].weight = base_weight; + } + + state_count = Compression::multibyte_read(input); + offsets = new uint64_t[state_count+1]; + transition_count = 0; + std::vector isyms, osyms, dests; + std::vector weights; + for (uint64_t i = 0; i < state_count; i++) { + offsets[i] = transition_count; + std::map>>> temp; + uint64_t count = Compression::multibyte_read(input); + transition_count += count; + int32_t tag_base = 0; + for (uint64_t i = 0; i < count; i++) { + tag_base += Compression::multibyte_read(input); + uint64_t dest = (i + Compression::multibyte_read(input)) % state_count; + if (read_weights) { + base_weight = Compression::multibyte_read(input); + } + auto sym = alphabet.decode(tag_base); + temp[sym.first].push_back(make_pair(sym.second, + make_pair(dest, base_weight))); + } + for (auto& it : temp) { + for (auto& it2 : it.second) { + isyms.push_back(it.first); + osyms.push_back(it2.first); + dests.push_back(it2.second.first); + 
weights.push_back(it2.second.second); + } + } + } + offsets[state_count] = transition_count; + transitions = new Transition[transition_count]; + for (uint64_t i = 0; i < transition_count; i++) { + transitions[i].isym = isyms[i]; + transitions[i].osym = osyms[i]; + transitions[i].dest = dests[i]; + transitions[i].weight = weights[i]; + } + } +} diff --git a/lttoolbox/transducer_exe.h b/lttoolbox/transducer_exe.h new file mode 100644 index 00000000..87412967 --- /dev/null +++ b/lttoolbox/transducer_exe.h @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include +#include + +// only needed for reading non-mmap files +#include + +struct Transition { + uint64_t isym; // TODO: should be int32_t + uint64_t osym; + uint64_t dest; + double weight; +}; + +struct Final { + uint64_t state; + double weight; +}; + +class TransducerExe { +private: + uint64_t initial; + uint64_t state_count; + uint64_t final_count; + uint64_t transition_count; + Final* finals; + uint64_t* offsets; + Transition* transitions; +public: + TransducerExe(); + ~TransducerExe(); + void read(FILE* input, Alphabet& alphabet); +}; From aa0fac9c936440d3ee2db75e194cd21a3151cc09 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 30 Jul 2021 10:27:19 -0500 Subject: [PATCH 10/35] move endian helpers for mmap to their own header --- lttoolbox/Makefile.am | 2 +- lttoolbox/compression.h | 10 ---- lttoolbox/endian_util.h | 114 ++++++++++++++++++++++++++++++++++++ lttoolbox/string_writer.cc | 5 +- lttoolbox/string_writer.h | 4 +- lttoolbox/transducer.cc | 53 +++++++++-------- lttoolbox/transducer_exe.cc | 25 ++++---- lttoolbox/transducer_exe.h | 4 +- 8 files changed, 161 insertions(+), 56 deletions(-) create mode 100644 lttoolbox/endian_util.h diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index fa5f3f4f..7904c991 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -1,6 +1,6 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ - deserialiser.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ + deserialiser.h endian_util.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ match_exe.h match_node.h match_state.h my_stdio.h node.h \ pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h \ transducer.h transducer_exe.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ diff --git a/lttoolbox/compression.h b/lttoolbox/compression.h index 339b9493..5783f774 100644 --- 
a/lttoolbox/compression.h +++ b/lttoolbox/compression.h @@ -77,11 +77,6 @@ inline auto write_le(Stream& out, uint64_t value) { return write_u64_le(out, value); } -template -inline auto write_double_le(Stream& out, double value) { - return write_u64_le(out, *reinterpret_cast(&value)); -} - inline auto read_u64(FILE *in) { uint64_t value = 0; @@ -128,11 +123,6 @@ inline auto read_le(std::istream& in) { return read_le(in, Value{}); } -inline double read_double_le(FILE* in) { - uint64_t val = read_le(in); - return *reinterpret_cast(&val); -} - /** * Clase "Compression". * Class methods to access compressed data by the byte-aligned method diff --git a/lttoolbox/endian_util.h b/lttoolbox/endian_util.h new file mode 100644 index 00000000..069bd74f --- /dev/null +++ b/lttoolbox/endian_util.h @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#ifndef _LT_ENDIAN_UTIL_ +#define _LT_ENDIAN_UTIL_ + +#include +#include +#include +#include + +inline uint32_t to_le_32(uint32_t v) { + return (((v & 0xFF) << 24) | + ((v & 0xFF00) << 8) | + ((v & 0xFF0000) >> 8) | + ((v & 0xFF000000) >> 24)); +} + +inline uint32_t from_le_32(uint32_t v) { + return (((v & 0xFF000000) >> 24) | + ((v & 0xFF0000) >> 8) | + ((v & 0xFF00) << 8) | + ((v & 0xFF) << 24)); +} + +inline uint64_t to_le_64(uint64_t v) { + return (((v & 0xFF) << 56) | + ((v & 0xFF00) << 40) | + ((v & 0xFF0000) << 24) | + ((v & 0xFF000000) << 8) | + ((v & 0xFF00000000) >> 8) | + ((v & 0xFF0000000000) >> 24) | + ((v & 0xFF000000000000) >> 40) | + ((v & 0xFF00000000000000) >> 56)); +} + +inline uint64_t from_le_64(uint64_t v) { + return (((v & 0xFF00000000000000) >> 56) | + ((v & 0xFF000000000000) >> 40) | + ((v & 0xFF0000000000) >> 24) | + ((v & 0xFF00000000) >> 8) | + ((v & 0xFF000000) << 8) | + ((v & 0xFF0000) << 24) | + ((v & 0xFF00) << 40) | + ((v & 0xFF) << 56)); +} + +inline auto write_le_32(FILE* out, uint32_t value) { + uint32_t v = to_le_32(value); + auto rv = fwrite_unlocked(reinterpret_cast(&v), 1, sizeof(value), out); + if (rv != sizeof(value)) { + throw std::runtime_error("Failed to write uint32_t"); + } + return rv; +} + +inline auto write_le_64(FILE* out, uint64_t value) { + uint64_t v = to_le_64(value); + auto rv = fwrite_unlocked(reinterpret_cast(&v), 1, sizeof(value), out); + if (rv != sizeof(value)) { + throw std::runtime_error("Failed to write uint64_t"); + } + return rv; +} + +inline auto read_le_32(FILE* in) { + uint32_t value = 0; + if (fread_unlocked(reinterpret_cast(&value), 1, sizeof(value), in) != sizeof(value)) { + throw std::runtime_error("Failed to read uint64_t"); + } + return from_le_32(value); +} + +inline auto read_le_64(FILE* in) { + uint64_t value = 0; + if (fread_unlocked(reinterpret_cast(&value), 1, sizeof(value), in) != sizeof(value)) { + throw std::runtime_error("Failed to read uint64_t"); + } + return 
from_le_64(value); +} + +inline auto write_le_s32(FILE* out, int32_t value) { + return write_le_32(out, *reinterpret_cast(&value)); +} + +inline auto read_le_s32(FILE* in) { + uint32_t val = read_le_32(in); + return *reinterpret_cast(&val); +} + +inline auto write_le_double(FILE* out, double value) { + return write_le_64(out, *reinterpret_cast(&value)); +} + +inline auto read_le_double(FILE* in) { + uint64_t val = read_le_64(in); + return *reinterpret_cast(&val); +} + +#endif diff --git a/lttoolbox/string_writer.cc b/lttoolbox/string_writer.cc index 292431f7..64c71208 100644 --- a/lttoolbox/string_writer.cc +++ b/lttoolbox/string_writer.cc @@ -17,6 +17,7 @@ #include +#include #include UString_view @@ -41,7 +42,7 @@ StringWriter::get(const uint32_t start, const uint32_t count) void StringWriter::read(FILE* in) { - uint64_t len = read_u64_le(in); + uint64_t len = read_le_64(in); buffer.clear(); buffer.reserve(len); uint8_t temp[len*2]{}; @@ -57,7 +58,7 @@ StringWriter::read(FILE* in) void StringWriter::write(FILE* out) { - write_u64_le(out, buffer.size()); + write_le_64(out, buffer.size()); uint8_t temp[buffer.size()*2]{}; for (uint64_t i = 0; i < buffer.size(); i++) { temp[2*i] = buffer[i] & 0xFF; diff --git a/lttoolbox/string_writer.h b/lttoolbox/string_writer.h index d7b23346..2785c42e 100644 --- a/lttoolbox/string_writer.h +++ b/lttoolbox/string_writer.h @@ -18,11 +18,9 @@ #ifndef _LT_STRING_WRITER_ #define _LT_STRING_WRITER_ -// TODO: merge compression.h write_u64_le() and friends to here -// when we drop compressed formats -#include #include #include +#include class StringWriter { public: diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index eecce470..fdd41668 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -659,11 +660,11 @@ Transducer::read(FILE *input, int const decalage) void Transducer::read_mmap(FILE* in, Alphabet& alpha) { - read_le(in); // total 
size - initial = read_le(in); - uint64_t state_count = read_le(in); - uint64_t final_count = read_le(in); - uint64_t trans_count = read_le(in); + read_le_64(in); // total size + initial = read_le_64(in); + uint64_t state_count = read_le_64(in); + uint64_t final_count = read_le_64(in); + uint64_t trans_count = read_le_64(in); if (transitions.size() > state_count) { transitions.clear(); @@ -673,8 +674,8 @@ Transducer::read_mmap(FILE* in, Alphabet& alpha) finals.clear(); for (uint64_t i = 0; i < final_count; i++) { - uint64_t s = read_le(in); - double w = read_double_le(in); + uint64_t s = read_le_64(in); + double w = read_le_double(in); finals.insert(make_pair(s, w)); } @@ -682,7 +683,7 @@ Transducer::read_mmap(FILE* in, Alphabet& alpha) offsets.reserve(state_count); for (uint64_t i = 0; i < state_count; i++) { transitions[i].clear(); - offsets.push_back(read_le(in)); + offsets.push_back(read_le_64(in)); } offsets.push_back(0); @@ -691,11 +692,11 @@ Transducer::read_mmap(FILE* in, Alphabet& alpha) if (i == offsets[state+1]) { state++; } - uint64_t isym = read_le(in); - uint64_t osym = read_le(in); - int32_t sym = alpha((int32_t)isym, (int32_t)osym); - uint64_t dest = read_le(in); - double wght = read_double_le(in); + int32_t isym = read_le_s32(in); + int32_t osym = read_le_s32(in); + int32_t sym = alpha(isym, osym); + uint64_t dest = read_le_64(in); + double wght = read_le_double(in); transitions[state].insert(make_pair(sym, make_pair(dest, wght))); } } @@ -707,7 +708,7 @@ Transducer::write_mmap(FILE* out, const Alphabet& alpha) uint64_t features = 0; features |= TDF_WEIGHTS; features |= TDF_MMAP; - write_le(out, features); + write_le_64(out, features); uint64_t tr_count = 0; vector offsets; @@ -725,19 +726,19 @@ Transducer::write_mmap(FILE* out, const Alphabet& alpha) (finals.size() * 2) + // final states 4 ); // initial state + length of each section - write_le(out, total_size*8); // number of bytes after this - write_le(out, initial); // initial state - 
write_le(out, transitions.size()); // number of states - write_le(out, finals.size()); // number of finals - write_le(out, tr_count); // number of transitions + write_le_64(out, total_size*8); // number of bytes after this + write_le_64(out, initial); // initial state + write_le_64(out, transitions.size()); // number of states + write_le_64(out, finals.size()); // number of finals + write_le_64(out, tr_count); // number of transitions for (auto& it : finals) { - write_le(out, it.first); - write_le(out, *reinterpret_cast(&it.second)); + write_le_64(out, it.first); + write_le_double(out, it.second); } for (auto& it : offsets) { - write_le(out, it); + write_le_64(out, it); } for (auto& it : transitions) { @@ -751,10 +752,10 @@ Transducer::write_mmap(FILE* out, const Alphabet& alpha) auto range = it.second.equal_range(s); for (auto tr = range.first; tr != range.second; ++tr) { auto sym = alpha.decode(tr->first); - write_le(out, sym.first); // input symbol - write_le(out, sym.second); // output symbol - write_le(out, tr->second.first); // destination - write_double_le(out, tr->second.second); // weight + write_le_s32(out, sym.first); // input symbol + write_le_s32(out, sym.second); // output symbol + write_le_64(out, tr->second.first); // destination + write_le_double(out, tr->second.second); // weight } } } diff --git a/lttoolbox/transducer_exe.cc b/lttoolbox/transducer_exe.cc index 47f08d64..5efdb402 100644 --- a/lttoolbox/transducer_exe.cc +++ b/lttoolbox/transducer_exe.cc @@ -18,6 +18,7 @@ #include #include +#include // includes needed for reading non-mmap files #include @@ -58,30 +59,30 @@ TransducerExe::read(FILE* input, Alphabet& alphabet) } if (mmap) { - read_le(input); // total size - initial = read_le(input); - state_count = read_le(input); - final_count = read_le(input); - transition_count = read_le(input); + read_le_64(input); // total size + initial = read_le_64(input); + state_count = read_le_64(input); + final_count = read_le_64(input); + transition_count 
= read_le_64(input); finals = new Final[final_count]; for (uint64_t i = 0; i < final_count; i++) { - finals[i].state = read_le(input); - finals[i].weight = read_double_le(input); + finals[i].state = read_le_64(input); + finals[i].weight = read_le_double(input); } offsets = new uint64_t[state_count+1]; for (uint64_t i = 0; i < state_count; i++) { - offsets[i] = read_le(input); + offsets[i] = read_le_64(input); } offsets[state_count] = transition_count; transitions = new Transition[transition_count]; for (uint64_t i = 0; i < transition_count; i++) { - transitions[i].isym = read_le(input); - transitions[i].osym = read_le(input); - transitions[i].dest = read_le(input); - transitions[i].weight = read_double_le(input); + transitions[i].isym = read_le_s32(input); + transitions[i].osym = read_le_s32(input); + transitions[i].dest = read_le_64(input); + transitions[i].weight = read_le_double(input); } } else { initial = Compression::multibyte_read(input); diff --git a/lttoolbox/transducer_exe.h b/lttoolbox/transducer_exe.h index 87412967..02297b18 100644 --- a/lttoolbox/transducer_exe.h +++ b/lttoolbox/transducer_exe.h @@ -22,8 +22,8 @@ #include struct Transition { - uint64_t isym; // TODO: should be int32_t - uint64_t osym; + int32_t isym; + int32_t osym; uint64_t dest; double weight; }; From aeb0fe921d269135d530ec911af33ba5f1c612eb Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 30 Jul 2021 12:06:11 -0500 Subject: [PATCH 11/35] run TransducerExe for matching --- lttoolbox/Makefile.am | 4 +- lttoolbox/match_state2.cc | 138 ++++++++++++++++++++++++++++++++++++ lttoolbox/match_state2.h | 54 ++++++++++++++ lttoolbox/transducer_exe.cc | 40 +++++++++++ lttoolbox/transducer_exe.h | 8 +++ 5 files changed, 242 insertions(+), 2 deletions(-) create mode 100644 lttoolbox/match_state2.cc create mode 100644 lttoolbox/match_state2.h diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index 7904c991..75aa96c2 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am 
@@ -1,13 +1,13 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ deserialiser.h endian_util.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ - match_exe.h match_node.h match_state.h my_stdio.h node.h \ + match_exe.h match_node.h match_state.h match_state2.h my_stdio.h node.h \ pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h \ transducer.h transducer_exe.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ ustring.h cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ expander.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \ - match_node.cc match_state.cc node.cc pattern_list.cc \ + match_node.cc match_state.cc match_state2.cc node.cc pattern_list.cc \ regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc transducer.cc transducer_exe.cc \ trans_exe.cc xml_parse_util.cc xml_walk_util.cc tmx_compiler.cc ustring.cc diff --git a/lttoolbox/match_state2.cc b/lttoolbox/match_state2.cc new file mode 100644 index 00000000..3738fd32 --- /dev/null +++ b/lttoolbox/match_state2.cc @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include + +#include + +MatchState2::MatchState2(TransducerExe* t) : + trans(t) +{ + buffer[0] = trans->initial; + last = 1; +} + +MatchState2::~MatchState2() +{} + +void +MatchState2::copy(const MatchState2& o) +{ + trans = o.trans; + first = o.first; + last = o.last; + for (uint16_t i = first; i != last; i = (i + 1) % BUF_LIMIT) { + buffer[i] = o.buffer[i]; + } +} + +MatchState2::MatchState2(const MatchState2& o) +{ + copy(o); +} + +MatchState2& +MatchState2::operator=(const MatchState2& o) +{ + if (this != &o) { + copy(o); + } + return *this; +} + +uint16_t +MatchState2::size() const +{ + return (last + BUF_LIMIT - first) % BUF_LIMIT; +} + +bool +MatchState2::empty() const +{ + return last == first; +} + +void +MatchState2::applySymbol(const uint64_t state, const int32_t symbol) +{ + uint64_t start = 0; + uint64_t end = 0; + trans->get_range(state, symbol, start, end); + for (uint64_t i = start; i < end; i++) { + buffer[last] = trans->transitions[i].dest; + last = (last + 1) % BUF_LIMIT; + } +} + +void +MatchState2::step(const int32_t input) +{ + uint16_t temp_last = last; + for (uint16_t i = first; i != temp_last; i = (i+1)%BUF_LIMIT) { + applySymbol(buffer[i], input); + } + first = temp_last; +} + +void +MatchState2::step(const int32_t input, const int32_t alt) +{ + uint16_t temp_last = last; + for (uint16_t i = first; i != temp_last; i = (i+1)%BUF_LIMIT) { + applySymbol(buffer[i], input); + applySymbol(buffer[i], alt); + } + first = temp_last; +} + +void +MatchState2::step(UString_view input, const Alphabet& alpha, bool foldcase) +{ + // TODO +} + +int +MatchState2::classifyFinals(const std::map& finals, + const std::set& banned_rules) const +{ + int ret = INT_MAX; + for (uint16_t i = first; i != last; i = (i+1)%BUF_LIMIT) { + auto it = finals.find(buffer[i]); + if (it != finals.end()) { + if (it->second < ret && + banned_rules.find(it->second) == banned_rules.end()) { + ret = it->second; + } + } + } + return (ret < INT_MAX) ? 
ret : -1; +} + +int +MatchState2::classifyFinals(const std::map& finals) const +{ + set empty; + return classifyFinals(finals, empty); +} + +void +MatchState2::clear() +{ + first = 0; + last = 1; + buffer[0] = trans->initial; +} diff --git a/lttoolbox/match_state2.h b/lttoolbox/match_state2.h new file mode 100644 index 00000000..1542c053 --- /dev/null +++ b/lttoolbox/match_state2.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#ifndef _LT_MATCH_STATE_ +#define _LT_MATCH_STATE_ + +#include +#include +#include + +// rename upon deleting old MatchState +class MatchState2 +{ +private: + static int const BUF_LIMIT = 1024; + TransducerExe* trans; + uint64_t buffer[1024]; + uint16_t first = 0; + uint16_t last = 0; + + void copy(const MatchState2& o); + void applySymbol(const uint64_t state, const int32_t symbol); +public: + MatchState2(TransducerExe* t); + ~MatchState2(); + MatchState2(const MatchState2& o); + MatchState2& operator=(const MatchState2& o); + + uint16_t size() const; + bool empty() const; + void step(const int32_t input); + void step(const int32_t input, const int32_t alt); + void step(UString_view input, const Alphabet& alpha, bool foldcase = true); + int classifyFinals(const std::map& finals, + const std::set& banned_rules) const; + int classifyFinals(const std::map& finals) const; + void clear(); +}; + +#endif diff --git a/lttoolbox/transducer_exe.cc b/lttoolbox/transducer_exe.cc index 5efdb402..fbd6dad3 100644 --- a/lttoolbox/transducer_exe.cc +++ b/lttoolbox/transducer_exe.cc @@ -142,3 +142,43 @@ TransducerExe::read(FILE* input, Alphabet& alphabet) } } } + +void +TransducerExe::get_range(const uint64_t state, const int32_t symbol, + uint64_t& start, uint64_t& end) +{ + uint64_t l = offsets[state]; + uint64_t r = offsets[state+1]; + uint64_t m; + if (l == r) { + start = end = 0; + return; + } + while (l < r) { + m = (l + r) / 2; + if (transitions[m].isym < symbol) { + l = m + 1; + } else { + r = m; + } + } + if (transitions[l].isym != symbol) { + end = start = 0; + return; + } else { + start = l; + } + // there's probably a way to do this with 1 loop + // but I'd have to be very sure of what I was doing to write that loop -DGS + l = start; + r = offsets[state+1]; + while (l < r) { + m = (l + r) / 2; + if (transitions[m].isym < symbol) { + r = m; + } else { + l = m + 1; + } + } + end = l; +} diff --git a/lttoolbox/transducer_exe.h b/lttoolbox/transducer_exe.h index 
02297b18..be4f3ae9 100644 --- a/lttoolbox/transducer_exe.h +++ b/lttoolbox/transducer_exe.h @@ -33,7 +33,12 @@ struct Final { double weight; }; +class MatchState2; +class TransState; + class TransducerExe { + friend MatchState2; + friend TransState; private: uint64_t initial; uint64_t state_count; @@ -42,6 +47,9 @@ class TransducerExe { Final* finals; uint64_t* offsets; Transition* transitions; + + void get_range(const uint64_t state, const int32_t sym, + uint64_t& start, uint64_t& end); public: TransducerExe(); ~TransducerExe(); From cbc3272feaf35055fb3449ff9fce2262a0bdc0fd Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 30 Jul 2021 16:28:29 -0500 Subject: [PATCH 12/35] use new TransducerExe in lt-proc --- lttoolbox/fst_processor.cc | 57 ++++---- lttoolbox/fst_processor.h | 14 +- lttoolbox/state.cc | 259 ++++++++++++++++-------------------- lttoolbox/state.h | 32 +++-- lttoolbox/transducer_exe.cc | 40 +++++- lttoolbox/transducer_exe.h | 11 +- 6 files changed, 211 insertions(+), 202 deletions(-) diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 732acc8b..2e4ac97d 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -741,11 +741,12 @@ FSTProcessor::combineWblanks() void FSTProcessor::calcInitial() { + set temp; for(auto& it : transducers) { - root.addTransition(0, 0, it.second.getInitial(), default_weight); + temp.insert(&it.second); } - initial_state.init(&root); + initial_state.init(temp); } bool @@ -767,23 +768,19 @@ FSTProcessor::classifyFinals() for(auto& it : transducers) { if(endsWith(it.first, "@inconditional"_u)) { - inconditional.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + inconditional.insert(&it.second); } else if(endsWith(it.first, "@standard"_u)) { - standard.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + standard.insert(&it.second); } else if(endsWith(it.first, "@postblank"_u)) { - postblank.insert(it.second.getFinals().begin(), - 
it.second.getFinals().end()); + postblank.insert(&it.second); } else if(endsWith(it.first, "@preblank"_u)) { - preblank.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + preblank.insert(&it.second); } else { @@ -930,6 +927,7 @@ FSTProcessor::isAlphabetic(UChar32 const c) const void FSTProcessor::load(FILE *input) { + bool mmap = false; fpos_t pos; if (fgetpos(input, &pos) == 0) { char header[4]{}; @@ -939,6 +937,7 @@ FSTProcessor::load(FILE *input) if (features >= LTF_UNKNOWN) { throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); } + mmap = features & LTF_MMAP; } else { // Old binary format @@ -946,24 +945,26 @@ FSTProcessor::load(FILE *input) } } - // letters - int len = Compression::multibyte_read(input); - while(len > 0) - { - alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); - len--; - } + if (mmap) { + } else { + + // letters + int len = Compression::multibyte_read(input); + while(len > 0) { + alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); + len--; + } - // symbols - alphabet.read(input); + // symbols + alphabet.read(input); - len = Compression::multibyte_read(input); + len = Compression::multibyte_read(input); - while(len > 0) - { - UString name = Compression::string_read(input); - transducers[name].read(input, alphabet); - len--; + while(len > 0) { + UString name = Compression::string_read(input); + transducers[name].read(input, alphabet); + len--; + } } } @@ -984,8 +985,7 @@ FSTProcessor::initTMAnalysis() calcInitial(); for(auto& it : transducers) { - all_finals.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + all_finals.insert(&it.second); } } @@ -995,8 +995,7 @@ FSTProcessor::initGeneration() setIgnoredChars(false); calcInitial(); for(auto& it : transducers) { - all_finals.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + all_finals.insert(&it.second); } } diff --git 
a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index 32263ac7..9bfe0bca 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include @@ -57,7 +57,7 @@ class FSTProcessor /** * Transducers in FSTP */ - map transducers; + map transducers; /** * Current state of lexical analysis @@ -77,27 +77,27 @@ class FSTProcessor /** * The final states of inconditional sections in the dictionaries */ - map inconditional; + set inconditional; /** * The final states of standard sections in the dictionaries */ - map standard; + set standard; /** * The final states of postblank sections in the dictionaries */ - map postblank; + set postblank; /** * The final states of preblank sections in the dictionaries */ - map preblank; + set preblank; /** * Merge of 'inconditional', 'standard', 'postblank' and 'preblank' sets */ - map all_finals; + set all_finals; /** * Queue of blanks, used in reading methods diff --git a/lttoolbox/state.cc b/lttoolbox/state.cc index facd5374..be492bce 100644 --- a/lttoolbox/state.cc +++ b/lttoolbox/state.cc @@ -17,7 +17,6 @@ #include #include -#include #include //debug// @@ -26,8 +25,7 @@ //debug// State::State() -{ -} +{} State::~State() { @@ -42,21 +40,15 @@ State::State(State const &s) State & State::operator =(State const &s) { - if(this != &s) - { - destroy(); - copy(s); - } - + copy(s); return *this; } void State::destroy() { - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - delete state[i].sequence; + for (auto& it : state) { + delete it.sequence; } state.clear(); @@ -65,19 +57,17 @@ State::destroy() void State::copy(State const &s) { - // release references - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - delete state[i].sequence; + if (this == &s) { + return; } + destroy(); state = s.state; - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - vector> *tmp = new vector>(); - *tmp = *(state[i].sequence); - 
state[i].sequence = tmp; + for (auto& it : state) { + TPath* tmp = new TPath(); + *tmp = *(it.sequence); + it.sequence = tmp; } } @@ -88,63 +78,62 @@ State::size() const } void -State::init(Node *initial) +State::init(const set& exes) { - state.clear(); - state.push_back(TNodeState(initial, new vector>(), false)); - state[0].sequence->clear(); + destroy(); + for (auto& it : exes) { + state.push_back(TNodeState(it, it->initial, new TPath(), false)); + } epsilonClosure(); } bool -State::apply_into(vector* new_state, int const input, int index, bool dirty) -{ - map::const_iterator it; - it = state[index].where->transitions.find(input); - if(it != state[index].where->transitions.end()) - { - for(int j = 0; j != it->second.size; j++) - { - vector> *new_v = new vector>(); - *new_v = *(state[index].sequence); - if(it->first != 0) - { - new_v->push_back(make_pair(it->second.out_tag[j], it->second.out_weight[j])); - } - new_state->push_back(TNodeState(it->second.dest[j], new_v, state[index].dirty||dirty)); +State::apply_into(std::vector* new_state, const int32_t input, + int index, bool dirty) +{ + uint64_t start, end; + bool any = false; + TransducerExe* trans = state[index].where; + trans->get_range(state[index].state, input, start, end); + for (uint64_t i = start; i < end; i++) { + TPath* new_v = new TPath(); + *new_v = *(state[index].sequence); + if (input != 0) { + new_v->push_back(make_pair(trans->transitions[i].osym, + trans->transitions[i].weight)); } - return true; + new_state->push_back(TNodeState(trans, trans->transitions[i].dest, new_v, + state[index].dirty || dirty)); + any = true; } - return false; + return any; } bool -State::apply_into_override(vector* new_state, int const input, int const old_sym, int const new_sym, int index, bool dirty) -{ - map::const_iterator it; - it = state[index].where->transitions.find(input); - if(it != state[index].where->transitions.end()) - { - for(int j = 0; j != it->second.size; j++) - { - vector> *new_v = new vector>(); - 
*new_v = *(state[index].sequence); - if(it->first != 0) - { - if(it->second.out_tag[j] == old_sym) - { - new_v->push_back(make_pair(new_sym, it->second.out_weight[j])); - } - else - { - new_v->push_back(make_pair(it->second.out_tag[j], it->second.out_weight[j])); - } +State::apply_into_override(std::vector* new_state, + const int32_t input, + const int32_t old_sym, const int32_t new_sym, + int index, bool dirty) +{ + uint64_t start, end; + bool any = false; + TransducerExe* trans = state[index].where; + trans->get_range(state[index].state, input, start, end); + for (uint64_t i = start; i < end; i++) { + TPath* new_v = new TPath(); + *new_v = *(state[index].sequence); + if (input != 0) { + int32_t s = trans->transitions[i].osym; + if (s == old_sym) { + s = new_sym; } - new_state->push_back(TNodeState(it->second.dest[j], new_v, state[index].dirty||dirty)); + new_v->push_back(make_pair(s, trans->transitions[i].weight)); } - return true; + new_state->push_back(TNodeState(trans, trans->transitions[i].dest, new_v, + state[index].dirty || dirty)); + any = true; } - return false; + return any; } void @@ -269,20 +258,18 @@ State::epsilonClosure() { for(size_t i = 0; i != state.size(); i++) { - map::iterator it2; - it2 = state[i].where->transitions.find(0); - if(it2 != state[i].where->transitions.end()) - { - for(int j = 0 ; j != it2->second.size; j++) - { - vector> *tmp = new vector>(); - *tmp = *(state[i].sequence); - if(it2->second.out_tag[j] != 0) - { - tmp->push_back(make_pair(it2->second.out_tag[j], it2->second.out_weight[j])); - } - state.push_back(TNodeState(it2->second.dest[j], tmp, state[i].dirty)); + TransducerExe* trans = state[i].where; + uint64_t start, end; + trans->get_range(state[i].state, 0, start, end); + for (uint64_t j = start; j < end; j++) { + TPath* tmp = new TPath(); + *tmp = *(state[i].sequence); + if (trans->transitions[j].osym != 0) { + tmp->push_back(make_pair(trans->transitions[j].osym, + trans->transitions[j].weight)); } + 
state.push_back(TNodeState(trans, trans->transitions[j].dest, tmp, + state[i].dirty)); } } } @@ -426,12 +413,10 @@ State::step_case(UChar32 val, bool caseSensitive) bool -State::isFinal(map const &finals) const +State::isFinal(const set& finals) const { - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - if(finals.find(state[i].where) != finals.end()) - { + for (auto& it : state) { + if(finals.find(it.where) != finals.end() && it.where->is_final(it.state)) { return true; } } @@ -466,7 +451,7 @@ State::NFinals(vector> lf, int maxAnalyses, int maxWeightC UString -State::filterFinals(map const &finals, +State::filterFinals(const set& finals, Alphabet const &alphabet, set const &escaped_chars, bool display_weights, int max_analyses, int max_weight_classes, @@ -477,23 +462,22 @@ State::filterFinals(map const &finals, UString result; double cost = 0.0000; - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - if(finals.find(state[i].where) != finals.end()) + for (auto& st : state) { + if(finals.find(st.where) != finals.end() && st.where->is_final(st.state)) { - if(state[i].dirty) + if(st.dirty) { result.clear(); cost = 0.0000; unsigned int const first_char = result.size() + firstchar; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) + for(size_t j = 0, limit2 = st.sequence->size(); j != limit2; j++) { - if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) + if(escaped_chars.find(((*(st.sequence))[j]).first) != escaped_chars.end()) { result += '\\'; } - alphabet.getSymbol(result, ((*(state[i].sequence))[j]).first, uppercase); - cost += ((*(state[i].sequence))[j]).second; + alphabet.getSymbol(result, ((*(st.sequence))[j]).first, uppercase); + cost += ((*(st.sequence))[j]).second; } if(firstupper) { @@ -512,19 +496,21 @@ State::filterFinals(map const &finals, { result.clear(); cost = 0.0000; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) + for(size_t j = 0, limit2 = 
st.sequence->size(); j != limit2; j++) { - if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) + if(escaped_chars.find(((*(st.sequence))[j]).first) != escaped_chars.end()) { result += '\\'; } - alphabet.getSymbol(result, ((*(state[i].sequence))[j]).first); - cost += ((*(state[i].sequence))[j]).second; + alphabet.getSymbol(result, ((*(st.sequence))[j]).first); + cost += ((*(st.sequence))[j]).second; } } // Add the weight of the final state - cost += (*(finals.find(state[i].where))).second; + double temp; + st.where->find_final(st.state, temp); + cost += temp; response.push_back(make_pair(result, cost)); } } @@ -550,7 +536,7 @@ State::filterFinals(map const &finals, set > > -State::filterFinalsLRX(map const &finals, +State::filterFinalsLRX(const set& finals, Alphabet const &alphabet, set const &escaped_chars, bool uppercase, bool firstupper, int firstchar) const @@ -560,21 +546,20 @@ State::filterFinalsLRX(map const &finals, vector current_result; UString rule_id; - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - if(finals.find(state[i].where) != finals.end()) + for (auto& st : state) { + if(finals.find(st.where) != finals.end() && st.where->is_final(st.state)) { current_result.clear(); rule_id.clear(); UString current_word; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) + for(size_t j = 0, limit2 = st.sequence->size(); j != limit2; j++) { - if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) + if(escaped_chars.find(((*(st.sequence))[j]).first) != escaped_chars.end()) { current_word += '\\'; } UString sym; - alphabet.getSymbol(sym, ((*(state[i].sequence))[j]).first, uppercase); + alphabet.getSymbol(sym, ((*(st.sequence))[j]).first, uppercase); if(sym == "<$>"_u) { if(!current_word.empty()) @@ -598,7 +583,7 @@ State::filterFinalsLRX(map const &finals, UString -State::filterFinalsSAO(map const &finals, +State::filterFinalsSAO(const set& finals, Alphabet const &alphabet, 
set const &escaped_chars, bool uppercase, bool firstupper, int firstchar) const @@ -606,29 +591,28 @@ State::filterFinalsSAO(map const &finals, UString result; UString annot; - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - if(finals.find(state[i].where) != finals.end()) + for (auto& st : state) { + if(finals.find(st.where) != finals.end() && st.where->is_final(st.state)) { result += '/'; unsigned int const first_char = result.size() + firstchar; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) + for(size_t j = 0, limit2 = st.sequence->size(); j != limit2; j++) { - if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) + if(escaped_chars.find(((*(st.sequence))[j]).first) != escaped_chars.end()) { result += '\\'; } - if(alphabet.isTag(((*(state[i].sequence))[j]).first)) + if(alphabet.isTag(((*(st.sequence))[j]).first)) { annot.clear(); - alphabet.getSymbol(annot, ((*(state[i].sequence))[j]).first); + alphabet.getSymbol(annot, ((*(st.sequence))[j]).first); result += '&'; result += annot.substr(1,annot.length()-2); result += ';'; } else { - alphabet.getSymbol(result, ((*(state[i].sequence))[j]).first, uppercase); + alphabet.getSymbol(result, ((*(st.sequence))[j]).first, uppercase); } } if(firstupper) @@ -650,25 +634,24 @@ State::filterFinalsSAO(map const &finals, } UString -State::filterFinalsTM(map const &finals, +State::filterFinalsTM(const set& finals, Alphabet const &alphabet, set const &escaped_chars, queue &blankqueue, vector &numbers) const { UString result; - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - if(finals.find(state[i].where) != finals.end()) + for (auto& st : state) { + if(finals.find(st.where) != finals.end() && st.where->is_final(st.state)) { result += '/'; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) + for(size_t j = 0, limit2 = st.sequence->size(); j != limit2; j++) { - if(escaped_chars.find((*(state[i].sequence))[j].first) != 
escaped_chars.end()) + if(escaped_chars.find((*(st.sequence))[j].first) != escaped_chars.end()) { result += '\\'; } - alphabet.getSymbol(result, (*(state[i].sequence))[j].first); + alphabet.getSymbol(result, (*(st.sequence))[j].first); } } } @@ -854,34 +837,22 @@ State::lastPartHasRequiredSymbol(const vector> &seq, int requi void -State::restartFinals(const map &finals, int requiredSymbol, State *restart_state, int separationSymbol) +State::restartFinals(const set& finals, int requiredSymbol, State *restart_state, int separationSymbol) { - - for(unsigned int i=0; i 0) - { - bool restart = lastPartHasRequiredSymbol(*(state_i.sequence), requiredSymbol, separationSymbol); - if(restart) - { - if(restart_state != NULL) - { - for(unsigned int j=0; jstate.size(); j++) - { - TNodeState initst = restart_state->state.at(j); - vector> *tnvec = new vector>; - - for(unsigned int k=0; k < state_i.sequence->size(); k++) - { - tnvec->push_back(state_i.sequence->at(k)); - } - TNodeState tn(initst.where, tnvec, state_i.dirty); - tn.sequence->push_back(make_pair(separationSymbol, 0.0000)); - state.push_back(tn); + if (finals.find(st.where) != finals.end() && st.where->is_final(st.state)) { + bool restart = lastPartHasRequiredSymbol(*(st.sequence), requiredSymbol, separationSymbol); + if(restart && restart_state != NULL) { + for (auto& initst : restart_state->state) { + TPath* tnvec = new TPath(); + for (auto& it : *(st.sequence)) { + tnvec->push_back(it); } + TNodeState tn(initst.where, initst.state, tnvec, st.dirty); + tn.sequence->push_back(make_pair(separationSymbol, 0.0000)); + state.push_back(tn); } } } diff --git a/lttoolbox/state.h b/lttoolbox/state.h index 31f0e42d..7d8c9734 100644 --- a/lttoolbox/state.h +++ b/lttoolbox/state.h @@ -29,11 +29,13 @@ #include #include #include - +#include #include using namespace std; +typedef vector> TPath; + /** * Class to represent the current state of transducer processing */ @@ -45,22 +47,24 @@ class State */ struct TNodeState { - Node 
*where; - vector> *sequence; + TransducerExe* where; + uint64_t state; + TPath* sequence; // a state is "dirty" if it was introduced at runtime (case variants, etc.) bool dirty; - TNodeState(Node * const &w, vector> * const &s, bool const &d): where(w), sequence(s), dirty(d){} + TNodeState(TransducerExe* w, uint64_t i, TPath* s, bool d) + : where(w), state(i), sequence(s), dirty(d){} TNodeState(const TNodeState& other) - : where(other.where) - , sequence(other.sequence) - , dirty(other.dirty) + : where(other.where), state(other.state), + sequence(other.sequence), dirty(other.dirty) {} TNodeState & operator=(TNodeState const &other) { where = other.where; + state = other.state; sequence = other.sequence; dirty = other.dirty; return *this; @@ -204,7 +208,7 @@ class State * Init the state with the initial node and empty output * @param initial the initial node of the transducer */ - void init(Node *initial); + void init(const set& exes); /** * Remove states not containing a specific symbol in their last 'part', and states @@ -254,7 +258,7 @@ class State * @param firstchar first character of the word * @return the result of the transduction */ - UString filterFinals(map const &finals, + UString filterFinals(const set& finals, Alphabet const &a, set const &escaped_chars, bool display_weights = false, @@ -275,7 +279,7 @@ class State * @param firstchar first character of the word * @return the result of the transduction */ - UString filterFinalsSAO(map const &finals, + UString filterFinalsSAO(const set& finals, Alphabet const &a, set const &escaped_chars, bool uppercase = false, @@ -295,7 +299,7 @@ class State * @return the result of the transduction */ - set > > filterFinalsLRX(map const &finals, + set > > filterFinalsLRX(const set& finals, Alphabet const &a, set const &escaped_chars, bool uppercase = false, @@ -314,7 +318,7 @@ class State * @param restart_state * @param separationSymbol */ - void restartFinals(const map &finals, int requiredSymbol, State *restart_state, 
int separationSymbol); + void restartFinals(const set& finals, int requiredSymbol, State *restart_state, int separationSymbol); /** @@ -323,14 +327,14 @@ class State * @param finals set of final nodes @return * @true if the state is final */ - bool isFinal(map const &finals) const; + bool isFinal(const set& finals) const; /** * Return the full states string (to allow debuging...) using a Java ArrayList.toString style */ UString getReadableString(const Alphabet &a); - UString filterFinalsTM(map const &finals, + UString filterFinalsTM(const set& finals, Alphabet const &alphabet, set const &escaped_chars, queue &blanks, diff --git a/lttoolbox/transducer_exe.cc b/lttoolbox/transducer_exe.cc index fbd6dad3..fd69922a 100644 --- a/lttoolbox/transducer_exe.cc +++ b/lttoolbox/transducer_exe.cc @@ -103,21 +103,22 @@ TransducerExe::read(FILE* input, Alphabet& alphabet) state_count = Compression::multibyte_read(input); offsets = new uint64_t[state_count+1]; transition_count = 0; - std::vector isyms, osyms, dests; + std::vector isyms, osyms; + std::vector dests; std::vector weights; for (uint64_t i = 0; i < state_count; i++) { offsets[i] = transition_count; - std::map>>> temp; uint64_t count = Compression::multibyte_read(input); transition_count += count; int32_t tag_base = 0; - for (uint64_t i = 0; i < count; i++) { + for (uint64_t t = 0; t < count; t++) { tag_base += Compression::multibyte_read(input); uint64_t dest = (i + Compression::multibyte_read(input)) % state_count; if (read_weights) { - base_weight = Compression::multibyte_read(input); + base_weight = Compression::long_multibyte_read(input); } auto sym = alphabet.decode(tag_base); temp[sym.first].push_back(make_pair(sym.second, @@ -174,7 +175,7 @@ TransducerExe::get_range(const uint64_t state, const int32_t symbol, r = offsets[state+1]; while (l < r) { m = (l + r) / 2; - if (transitions[m].isym < symbol) { + if (transitions[m].isym > symbol) { r = m; } else { l = m + 1; @@ -182,3 +183,30 @@ 
TransducerExe::get_range(const uint64_t state, const int32_t symbol, } end = l; } + +bool +TransducerExe::find_final(const uint64_t state, double& weight) +{ + int64_t l = 0; + int64_t r = final_count - 1; + int64_t m; + while (l <= r) { + m = (l + r) / 2; + if (finals[m].state == state) { + weight = finals[m].weight; + return true; + } else if (finals[m].state < state) { + l = m + 1; + } else { + r = m - 1; + } + } + return false; +} + +bool +TransducerExe::is_final(const uint64_t state) +{ + double x; + return find_final(state, x); +} diff --git a/lttoolbox/transducer_exe.h b/lttoolbox/transducer_exe.h index be4f3ae9..eadf8942 100644 --- a/lttoolbox/transducer_exe.h +++ b/lttoolbox/transducer_exe.h @@ -15,6 +15,9 @@ * along with this program; if not, see . */ +#ifndef _LT_TRANSDUCER_EXE_ +#define _LT_TRANSDUCER_EXE_ + #include #include @@ -34,11 +37,11 @@ struct Final { }; class MatchState2; -class TransState; +class State; class TransducerExe { friend MatchState2; - friend TransState; + friend State; private: uint64_t initial; uint64_t state_count; @@ -50,8 +53,12 @@ class TransducerExe { void get_range(const uint64_t state, const int32_t sym, uint64_t& start, uint64_t& end); + bool find_final(const uint64_t state, double& weight); + bool is_final(const uint64_t state); public: TransducerExe(); ~TransducerExe(); void read(FILE* input, Alphabet& alphabet); }; + +#endif From a8b0acea7488fe0c94d5c9e5b50ea076bda7d35b Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Mon, 2 Aug 2021 12:02:22 -0500 Subject: [PATCH 13/35] AlphabetExe --- lttoolbox/Makefile.am | 4 +- lttoolbox/alphabet.cc | 6 +++ lttoolbox/alphabet.h | 5 ++ lttoolbox/alphabet_exe.cc | 96 ++++++++++++++++++++++++++++++++++++++ lttoolbox/alphabet_exe.h | 40 ++++++++++++++++ lttoolbox/compiler.cc | 2 +- lttoolbox/fst_processor.cc | 13 ++++-- lttoolbox/fst_processor.h | 10 +++- lttoolbox/state.cc | 14 +++--- lttoolbox/state.h | 12 ++--- lttoolbox/string_writer.cc | 15 ++++-- lttoolbox/string_writer.h | 8 
+++- 12 files changed, 199 insertions(+), 26 deletions(-) create mode 100644 lttoolbox/alphabet_exe.cc create mode 100644 lttoolbox/alphabet_exe.h diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index 75aa96c2..ec07213a 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -1,11 +1,11 @@ -h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ +h_sources = alphabet.h alphabet_exe.h att_compiler.h buffer.h compiler.h compression.h \ deserialiser.h endian_util.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ match_exe.h match_node.h match_state.h match_state2.h my_stdio.h node.h \ pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h \ transducer.h transducer_exe.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ ustring.h -cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ +cc_sources = alphabet.cc alphabet_exe.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ expander.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \ match_node.cc match_state.cc match_state2.cc node.cc pattern_list.cc \ regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc transducer.cc transducer_exe.cc \ diff --git a/lttoolbox/alphabet.cc b/lttoolbox/alphabet.cc index a3138147..284e8b9c 100644 --- a/lttoolbox/alphabet.cc +++ b/lttoolbox/alphabet.cc @@ -316,3 +316,9 @@ Alphabet::createLoopbackSymbols(set &symbols, Alphabet &basis, Side s, } } } + +vector& +Alphabet::getTags() +{ + return slexicinv; +} diff --git a/lttoolbox/alphabet.h b/lttoolbox/alphabet.h index 8c6dec26..72a3b368 100644 --- a/lttoolbox/alphabet.h +++ b/lttoolbox/alphabet.h @@ -197,6 +197,11 @@ class Alphabet * @param nonTagsToo by default only tags are included, but if this is true we include all symbols */ void createLoopbackSymbols(set &symbols, Alphabet &basis, Side s = right, bool nonTagsToo = 
false); + + /** + * Return a reference to the array of tags + */ + vector& getTags(); }; #endif diff --git a/lttoolbox/alphabet_exe.cc b/lttoolbox/alphabet_exe.cc new file mode 100644 index 00000000..b55e6420 --- /dev/null +++ b/lttoolbox/alphabet_exe.cc @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include + +#include + +AlphabetExe::AlphabetExe(StringWriter* sw_) + : sw(sw_), tag_count(0), tags(nullptr) +{} + +AlphabetExe::~AlphabetExe() +{ + delete[] tags; +} + +void +AlphabetExe::read(FILE* input, bool mmap) +{ + if (mmap) { + } else { + tag_count = Compression::multibyte_read(input); + tags = new StringRef[tag_count]; + for (uint32_t i = 0; i < tag_count; i++) { + UString tg; + tg += '<'; + tg += Compression::string_read(input); + tg += '>'; + tags[i] = sw->add(tg); + } + // has to be a separate loop, otherwise the string_views get + // invalidated when the StringWriter buffer expands + for (uint32_t i = 0; i < tag_count; i++) { + symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; + } + int pairs = Compression::multibyte_read(input); + for (int i = 0; i < pairs; i++) { + Compression::multibyte_read(input); + Compression::multibyte_read(input); + } + } +} + +int32_t +AlphabetExe::operator()(UString_view sv) +{ + auto it = symbol_map.find(sv); + if (it != symbol_map.end()) { + return it->second; + } else { + return 0; + } +} + 
+void +AlphabetExe::getSymbol(UString& result, int32_t symbol, bool uppercase) const +{ + if (symbol == 0) { + return; + } else if (symbol < 0) { + result.append(sw->get(tags[-symbol-1])); + } else if (uppercase) { + result += u_toupper(static_cast(symbol)); + } else { + result += static_cast(symbol); + } +} + +bool +AlphabetExe::isTag(const int32_t symbol) const +{ + return symbol < 0; +} + +void +AlphabetExe::clearSymbol(const int32_t symbol) +{ + if (symbol < 0) { + tags[-symbol-1].start = 0; + tags[-symbol-1].count = 0; + } +} diff --git a/lttoolbox/alphabet_exe.h b/lttoolbox/alphabet_exe.h new file mode 100644 index 00000000..af579bb8 --- /dev/null +++ b/lttoolbox/alphabet_exe.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#ifndef _LT_ALPHABET_EXE_ +#define _LT_ALPHABET_EXE_ + +#include +#include + +class AlphabetExe { +private: + StringWriter* sw; + uint64_t tag_count; + StringRef* tags; + std::map symbol_map; +public: + AlphabetExe(StringWriter* sw_); + ~AlphabetExe(); + void read(FILE* in, bool mmap); + int32_t operator()(UString_view sv); + void getSymbol(UString& result, int32_t symbol, bool uppercase = false) const; + bool isTag(const int32_t symbol) const; + void clearSymbol(const int32_t symbol); +}; + +#endif diff --git a/lttoolbox/compiler.cc b/lttoolbox/compiler.cc index d2ab2341..128fac64 100644 --- a/lttoolbox/compiler.cc +++ b/lttoolbox/compiler.cc @@ -179,7 +179,7 @@ Compiler::procAlphabet() bool space = true; for(unsigned int i = 0; i < letters.length(); i++) { - if(!u_isspace(letters.at(i))) + if(!u_isspace(letters[i])) { space = false; break; diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 2e4ac97d..01e9822a 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -40,6 +40,7 @@ UString const FSTProcessor::WBLANK_FINAL = "[[/]]"_u; FSTProcessor::FSTProcessor() + : alphabet(AlphabetExe(&str_write)) { // escaped_chars chars escaped_chars.insert('['); @@ -956,13 +957,17 @@ FSTProcessor::load(FILE *input) } // symbols - alphabet.read(input); + fgetpos(input, &pos); + alphabet.read(input, false); + fsetpos(input, &pos); + Alphabet temp; + temp.read(input); len = Compression::multibyte_read(input); while(len > 0) { UString name = Compression::string_read(input); - transducers[name].read(input, alphabet); + transducers[name].read(input, temp); len--; } } @@ -1067,7 +1072,7 @@ FSTProcessor::initDecompositionSymbols() } else if(!showControlSymbols) { - alphabet.setSymbol(compoundOnlyLSymbol, ""_u); + alphabet.clearSymbol(compoundOnlyLSymbol); } if((compoundRSymbol=alphabet("<:co:R>"_u)) == 0 @@ -1080,7 +1085,7 @@ FSTProcessor::initDecompositionSymbols() } else if(!showControlSymbols) { - 
alphabet.setSymbol(compoundRSymbol, ""_u); + alphabet.clearSymbol(compoundRSymbol); } } diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index 9bfe0bca..412bba22 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -19,10 +19,11 @@ #define _FSTPROCESSOR_ #include -#include +#include #include #include #include +#include #include #include #include @@ -134,10 +135,15 @@ class FSTProcessor */ int rcx_current_char; + /** + * String manager + */ + StringWriter str_write; + /** * Alphabet */ - Alphabet alphabet; + AlphabetExe alphabet; /** * Input buffer diff --git a/lttoolbox/state.cc b/lttoolbox/state.cc index be492bce..909fc6a7 100644 --- a/lttoolbox/state.cc +++ b/lttoolbox/state.cc @@ -452,7 +452,7 @@ State::NFinals(vector> lf, int maxAnalyses, int maxWeightC UString State::filterFinals(const set& finals, - Alphabet const &alphabet, + AlphabetExe const &alphabet, set const &escaped_chars, bool display_weights, int max_analyses, int max_weight_classes, bool uppercase, bool firstupper, int firstchar) const @@ -537,7 +537,7 @@ State::filterFinals(const set& finals, set > > State::filterFinalsLRX(const set& finals, - Alphabet const &alphabet, + AlphabetExe const &alphabet, set const &escaped_chars, bool uppercase, bool firstupper, int firstchar) const { @@ -584,7 +584,7 @@ State::filterFinalsLRX(const set& finals, UString State::filterFinalsSAO(const set& finals, - Alphabet const &alphabet, + AlphabetExe const &alphabet, set const &escaped_chars, bool uppercase, bool firstupper, int firstchar) const { @@ -635,7 +635,7 @@ State::filterFinalsSAO(const set& finals, UString State::filterFinalsTM(const set& finals, - Alphabet const &alphabet, + AlphabetExe const &alphabet, set const &escaped_chars, queue &blankqueue, vector &numbers) const { @@ -749,12 +749,12 @@ State::pruneCompounds(int requiredSymbol, int separationSymbol, int compound_max for(unsigned int i = 0; i> seq = *state.at(i).sequence; + vector> seq = *state[i].sequence; 
if(lastPartHasRequiredSymbol(seq, requiredSymbol, separationSymbol)) { int this_noOfCompoundElements = 0; - for (int j = seq.size()-2; j>0; j--) if ((seq.at(j)).first==separationSymbol) this_noOfCompoundElements++; + for (int j = seq.size()-2; j>0; j--) if ((seq[j]).first==separationSymbol) this_noOfCompoundElements++; noOfCompoundElements[i] = this_noOfCompoundElements; minNoOfCompoundElements = (minNoOfCompoundElements < this_noOfCompoundElements) ? minNoOfCompoundElements : this_noOfCompoundElements; @@ -862,7 +862,7 @@ State::restartFinals(const set& finals, int requiredSymbol, Stat UString -State::getReadableString(const Alphabet &a) +State::getReadableString(const AlphabetExe &a) { UString retval; retval += '['; diff --git a/lttoolbox/state.h b/lttoolbox/state.h index 7d8c9734..676f7ebf 100644 --- a/lttoolbox/state.h +++ b/lttoolbox/state.h @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include @@ -259,7 +259,7 @@ class State * @return the result of the transduction */ UString filterFinals(const set& finals, - Alphabet const &a, + AlphabetExe const &a, set const &escaped_chars, bool display_weights = false, int max_analyses = INT_MAX, @@ -280,7 +280,7 @@ class State * @return the result of the transduction */ UString filterFinalsSAO(const set& finals, - Alphabet const &a, + AlphabetExe const &a, set const &escaped_chars, bool uppercase = false, bool firstupper = false, @@ -300,7 +300,7 @@ class State */ set > > filterFinalsLRX(const set& finals, - Alphabet const &a, + AlphabetExe const &a, set const &escaped_chars, bool uppercase = false, bool firstupper = false, @@ -332,10 +332,10 @@ class State /** * Return the full states string (to allow debuging...) 
using a Java ArrayList.toString style */ - UString getReadableString(const Alphabet &a); + UString getReadableString(const AlphabetExe &a); UString filterFinalsTM(const set& finals, - Alphabet const &alphabet, + AlphabetExe const &alphabet, set const &escaped_chars, queue &blanks, vector &numbers) const; diff --git a/lttoolbox/string_writer.cc b/lttoolbox/string_writer.cc index 64c71208..f818f07e 100644 --- a/lttoolbox/string_writer.cc +++ b/lttoolbox/string_writer.cc @@ -20,7 +20,7 @@ #include #include -UString_view +StringRef StringWriter::add(const UString& s) { auto start = buffer.find(s); @@ -28,8 +28,10 @@ StringWriter::add(const UString& s) start = buffer.size(); buffer += s; } - UString_view ret(buffer); - return ret.substr(start, s.size()); + StringRef ret; + ret.start = start; + ret.count = s.size(); + return ret; } UString_view @@ -39,6 +41,13 @@ StringWriter::get(const uint32_t start, const uint32_t count) return ret.substr(start, count); } +UString_view +StringWriter::get(const StringRef& ref) +{ + UString_view ret(buffer); + return ret.substr(ref.start, ref.count); +} + void StringWriter::read(FILE* in) { diff --git a/lttoolbox/string_writer.h b/lttoolbox/string_writer.h index 2785c42e..15fcaf38 100644 --- a/lttoolbox/string_writer.h +++ b/lttoolbox/string_writer.h @@ -22,11 +22,17 @@ #include #include +struct StringRef { + uint32_t start; + uint32_t count; +}; + class StringWriter { public: UString buffer; - UString_view add(const UString& s); + StringRef add(const UString& s); UString_view get(const uint32_t start, const uint32_t count); + UString_view get(const StringRef& ref); void read(FILE* in); void write(FILE* out); }; From a1952107775bf1176efa6328e5d5b0f87b70b543 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Mon, 2 Aug 2021 12:52:39 -0500 Subject: [PATCH 14/35] lt-comp working with new format --- lttoolbox/alphabet.cc | 25 +++++++++++++++++++++++++ lttoolbox/alphabet.h | 6 +++++- lttoolbox/alphabet_exe.cc | 8 ++++++++ 
lttoolbox/compiler.cc | 34 ++++++++++++++++++++++++---------- lttoolbox/fst_processor.cc | 20 ++++++++++++++++++++ lttoolbox/string_writer.cc | 2 +- lttoolbox/ustring.cc | 2 +- lttoolbox/ustring.h | 2 +- 8 files changed, 85 insertions(+), 14 deletions(-) diff --git a/lttoolbox/alphabet.cc b/lttoolbox/alphabet.cc index 284e8b9c..d6ef296c 100644 --- a/lttoolbox/alphabet.cc +++ b/lttoolbox/alphabet.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -181,6 +182,30 @@ Alphabet::read(FILE *input) *this = a_new; } +void +Alphabet::write_mmap(FILE* output, StringWriter& sw) +{ + write_le_64(output, slexicinv.size()); + for (auto& it : slexicinv) { + StringRef r = sw.add(it); + write_le_32(output, r.start); + write_le_32(output, r.count); + } +} + +void +Alphabet::read_mmap(FILE* input, StringWriter& sw) +{ + int64_t count = read_le_64(input); + for (int64_t i = 0; i < count; i++) { + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + UString t = UString{sw.get(s, c)}; + slexicinv.push_back(t); + slexic[t] = -i-1; + } +} + void Alphabet::serialise(std::ostream &serialised) const { diff --git a/lttoolbox/alphabet.h b/lttoolbox/alphabet.h index 72a3b368..f3140750 100644 --- a/lttoolbox/alphabet.h +++ b/lttoolbox/alphabet.h @@ -23,7 +23,8 @@ #include #include #include -#include "ustring.h" +#include +#include using namespace std; using namespace icu; @@ -135,6 +136,9 @@ class Alphabet */ void read(FILE *input); + void write_mmap(FILE* output, StringWriter& sw); + void read_mmap(FILE* input, StringWriter& sw); + void serialise(std::ostream &serialised) const; void deserialise(std::istream &serialised); diff --git a/lttoolbox/alphabet_exe.cc b/lttoolbox/alphabet_exe.cc index b55e6420..5f438651 100644 --- a/lttoolbox/alphabet_exe.cc +++ b/lttoolbox/alphabet_exe.cc @@ -18,6 +18,7 @@ #include #include +#include AlphabetExe::AlphabetExe(StringWriter* sw_) : sw(sw_), tag_count(0), tags(nullptr) @@ -32,6 +33,13 @@ void 
AlphabetExe::read(FILE* input, bool mmap) { if (mmap) { + tag_count = read_le_64(input); + tags = new StringRef[tag_count]; + for (uint64_t i = 0; i < tag_count; i++) { + tags[i].start = read_le_32(input); + tags[i].count = read_le_32(input); + symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; + } } else { tag_count = Compression::multibyte_read(input); tags = new StringRef[tag_count]; diff --git a/lttoolbox/compiler.cc b/lttoolbox/compiler.cc index 128fac64..761caf33 100644 --- a/lttoolbox/compiler.cc +++ b/lttoolbox/compiler.cc @@ -16,10 +16,12 @@ */ #include #include +#include #include #include #include #include +#include #include #include @@ -946,25 +948,37 @@ Compiler::write(FILE *output) { fwrite_unlocked(HEADER_LTTOOLBOX, 1, 4, output); uint64_t features = 0; - write_le(output, features); + features |= LTF_MMAP; + write_le_64(output, features); + + StringWriter sw; + StringRef letter_loc = sw.add(letters); + for (auto& it : alphabet.getTags()) { + sw.add(it); + } + for (auto& it : sections) { + sw.add(it.first); + } + + sw.write(output); // letters - Compression::string_write(letters, output); + write_le_32(output, letter_loc.start); + write_le_32(output, letter_loc.count); // symbols - alphabet.write(output); + alphabet.write_mmap(output, sw); // transducers - Compression::multibyte_write(sections.size(), output); + write_le_64(output, sections.size()); - int count=0; - for(auto& it : sections) - { - count++; + for(auto& it : sections) { cout << it.first << " " << it.second.size(); cout << " " << it.second.numberOfTransitions() << endl; - Compression::string_write(it.first, output); - it.second.write(output); + StringRef loc = sw.add(it.first); + write_le_32(output, loc.start); + write_le_32(output, loc.count); + it.second.write_mmap(output, alphabet); } } diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 01e9822a..1bb7a92a 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -16,6 +16,7 @@ */ #include 
#include +#include #include #include @@ -947,6 +948,25 @@ FSTProcessor::load(FILE *input) } if (mmap) { + str_write.read(input); + + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + vector vec; + ustring_to_vec32(str_write.get(s, c), vec); + alphabetic_chars.insert(vec.begin(), vec.end()); + // alphabetic_chars + + alphabet.read(input, true); + + uint64_t tr_count = read_le_64(input); + Alphabet temp; + for (uint64_t i = 0; i < tr_count; i++) { + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + UString name = UString{str_write.get(s, c)}; + transducers[name].read(input, temp); + } } else { // letters diff --git a/lttoolbox/string_writer.cc b/lttoolbox/string_writer.cc index f818f07e..4bc3d647 100644 --- a/lttoolbox/string_writer.cc +++ b/lttoolbox/string_writer.cc @@ -55,7 +55,7 @@ StringWriter::read(FILE* in) buffer.clear(); buffer.reserve(len); uint8_t temp[len*2]{}; - if (fread_unlocked(&temp, 1, len*2, in) != len) { + if (fread_unlocked(&temp, 1, len*2, in) != len*2) { throw std::runtime_error("Failed to read strings"); } uint16_t c; diff --git a/lttoolbox/ustring.cc b/lttoolbox/ustring.cc index 87056c2c..daac05e2 100644 --- a/lttoolbox/ustring.cc +++ b/lttoolbox/ustring.cc @@ -48,7 +48,7 @@ to_ustring(const uint8_t* s) } void -ustring_to_vec32(const UString& str, std::vector& vec) +ustring_to_vec32(UString_view str, std::vector& vec) { if (str.empty()) { return; diff --git a/lttoolbox/ustring.h b/lttoolbox/ustring.h index 5ffa878f..0073fc48 100644 --- a/lttoolbox/ustring.h +++ b/lttoolbox/ustring.h @@ -35,7 +35,7 @@ UString to_ustring(const char* str); UString to_ustring(const uint8_t* str); // append UTF-16 string to UTF-32 vector of symbols -void ustring_to_vec32(const UString& str, std::vector& vec); +void ustring_to_vec32(UString_view str, std::vector& vec); inline std::ostream& operator<<(std::ostream& ostr, char16_t c) From 77a4a1ca6063c557f13d33ef7922797f246cebfa Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: 
Mon, 2 Aug 2021 13:49:59 -0500 Subject: [PATCH 15/35] lt-print and lt-trim accepting new format --- lttoolbox/lt_print.cc | 55 +++++++++++++++++++++++++++++------------ lttoolbox/lt_trim.cc | 49 +++++++++++++++++++++++++----------- lttoolbox/transducer.cc | 13 +++++++++- 3 files changed, 86 insertions(+), 31 deletions(-) diff --git a/lttoolbox/lt_print.cc b/lttoolbox/lt_print.cc index 8139e027..c2c40dba 100644 --- a/lttoolbox/lt_print.cc +++ b/lttoolbox/lt_print.cc @@ -16,6 +16,8 @@ */ #include #include +#include +#include #include #include @@ -128,10 +130,11 @@ int main(int argc, char *argv[]) } Alphabet alphabet; - set alphabetic_chars; + set alphabetic_chars; map transducers; + bool mmap = false; fpos_t pos; if (fgetpos(input, &pos) == 0) { char header[4]{}; @@ -141,6 +144,7 @@ int main(int argc, char *argv[]) if (features >= LTF_UNKNOWN) { throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); } + mmap = features & LTF_MMAP; } else { // Old binary format @@ -148,25 +152,44 @@ int main(int argc, char *argv[]) } } - // letters - int len = Compression::multibyte_read(input); - while(len > 0) - { - alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); - len--; - } + if (mmap) { + StringWriter sw; + sw.read(input); - // symbols - alphabet.read(input); + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + vector vec; + ustring_to_vec32(sw.get(s, c), vec); + alphabetic_chars.insert(vec.begin(), vec.end()); - len = Compression::multibyte_read(input); + alphabet.read_mmap(input, sw); - while(len > 0) - { - UString name = Compression::string_read(input); - transducers[name].read(input); + uint64_t tr_count = read_le_64(input); + for (uint64_t i = 0; i < tr_count; i++) { + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + UString name = UString{sw.get(s, c)}; + transducers[name].read_mmap(input, alphabet); + } + } else { + // letters + int len = 
Compression::multibyte_read(input); + while(len > 0) { + alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); + len--; + } - len--; + // symbols + alphabet.read(input); + + len = Compression::multibyte_read(input); + + while(len > 0) { + UString name = Compression::string_read(input); + transducers[name].read(input); + + len--; + } } ///////////////////// diff --git a/lttoolbox/lt_trim.cc b/lttoolbox/lt_trim.cc index f685752b..e1e3dc35 100644 --- a/lttoolbox/lt_trim.cc +++ b/lttoolbox/lt_trim.cc @@ -16,6 +16,8 @@ */ #include #include +#include +#include #include #include @@ -44,6 +46,7 @@ read_fst(FILE *bin_file) std::map transducers; fpos_t pos; + bool mmap = false; if (fgetpos(bin_file, &pos) == 0) { char header[4]{}; fread_unlocked(header, 1, 4, bin_file); @@ -52,6 +55,7 @@ read_fst(FILE *bin_file) if (features >= LTF_UNKNOWN) { throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); } + mmap = features & LTF_MMAP; } else { // Old binary format @@ -59,26 +63,43 @@ read_fst(FILE *bin_file) } } - // letters - UString letters = Compression::string_read(bin_file); + UString letters; - // symbols - new_alphabet.read(bin_file); + if (mmap) { + StringWriter sw; + sw.read(bin_file); - int len = Compression::multibyte_read(bin_file); + uint32_t s = read_le_32(bin_file); + uint32_t c = read_le_32(bin_file); + letters = UString{sw.get(s, c)}; - while(len > 0) - { - UString name = Compression::string_read(bin_file); - transducers[name].read(bin_file); + new_alphabet.read_mmap(bin_file, sw); + + uint64_t tr_count = read_le_64(bin_file); + for (uint64_t i = 0; i < tr_count; i++) { + uint32_t s = read_le_32(bin_file); + uint32_t c = read_le_32(bin_file); + UString name = UString{sw.get(s, c)}; + transducers[name].read_mmap(bin_file, new_alphabet); + } + } else { + // letters + letters = Compression::string_read(bin_file); - len--; + // symbols + new_alphabet.read(bin_file); + + int len = 
Compression::multibyte_read(bin_file); + + while(len > 0) { + UString name = Compression::string_read(bin_file); + transducers[name].read(bin_file); + + len--; + } } - std::pair alph_letters; - alph_letters.first = new_alphabet; - alph_letters.second = letters; - return std::pair, std::map > (alph_letters, transducers); + return make_pair(make_pair(new_alphabet, letters), transducers); } std::pair, std::map > diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index fdd41668..d783c283 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -660,6 +660,17 @@ Transducer::read(FILE *input, int const decalage) void Transducer::read_mmap(FILE* in, Alphabet& alpha) { + char header[4]{}; + auto r = fread_unlocked(header, 1, 4, in); + if (r == 4 && strncmp(header, HEADER_TRANSDUCER, 4) == 0) { + auto features = read_le_64(in); + if (features >= TDF_UNKNOWN) { + throw std::runtime_error("Transducer has features that are unknown to this version of lttoolbox - upgrade!"); + } + } else { + throw std::runtime_error("Unable to read transducer header!"); + } + read_le_64(in); // total size initial = read_le_64(in); uint64_t state_count = read_le_64(in); @@ -689,7 +700,7 @@ Transducer::read_mmap(FILE* in, Alphabet& alpha) uint64_t state = 0; for (uint64_t i = 0; i < trans_count; i++) { - if (i == offsets[state+1]) { + while (i == offsets[state+1]) { state++; } int32_t isym = read_le_s32(in); From 531e9aee9e6468a3bcd73741fe59f8f1effa78fa Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Mon, 2 Aug 2021 14:35:38 -0500 Subject: [PATCH 16/35] helper functions continue to be fun --- lttoolbox/att_compiler.cc | 40 +++--------- lttoolbox/compiler.cc | 35 +---------- lttoolbox/lt_print.cc | 61 +----------------- lttoolbox/lt_trim.cc | 123 ++++++------------------------------- lttoolbox/string_writer.cc | 2 +- lttoolbox/string_writer.h | 2 +- lttoolbox/transducer.cc | 111 +++++++++++++++++++++++++++++++++ lttoolbox/transducer.h | 7 +++ 8 files changed, 148 
insertions(+), 233 deletions(-) diff --git a/lttoolbox/att_compiler.cc b/lttoolbox/att_compiler.cc index eaa0dd87..184f62cb 100644 --- a/lttoolbox/att_compiler.cc +++ b/lttoolbox/att_compiler.cc @@ -435,37 +435,11 @@ AttCompiler::classify_backwards(int state, set& path) void AttCompiler::write(FILE *output) { -// FILE* output = fopen(file_name, "wb"); - fwrite_unlocked(HEADER_LTTOOLBOX, 1, 4, output); - uint64_t features = 0; - write_le(output, features); - - Transducer punct_fst = extract_transducer(PUNCT); - - /* Non-multichar symbols. */ - Compression::string_write(UString(letters.begin(), letters.end()), output); - /* Multichar symbols. */ - alphabet.write(output); - /* And now the FST. */ - if(punct_fst.numberOfTransitions() == 0) - { - Compression::multibyte_write(1, output); - } - else - { - Compression::multibyte_write(2, output); - } - Compression::string_write("main@standard"_u, output); - Transducer word_fst = extract_transducer(WORD); - word_fst.write(output); - cout << "main@standard" << " " << word_fst.size(); - cout << " " << word_fst.numberOfTransitions() << endl; - Compression::string_write("final@inconditional"_u, output); - if(punct_fst.numberOfTransitions() != 0) - { - punct_fst.write(output); - cout << "final@inconditional" << " " << punct_fst.size(); - cout << " " << punct_fst.numberOfTransitions() << endl; - } -// fclose(output); + UString letters = UString(letters.begin(), letters.end()); + map trans; + + trans["main@standard"_u] = extract_transducer(WORD); + trans["final@inconditional"_u] = extract_transducer(PUNCT); + + write_transducer_set(output, letters, alphabet, trans, true); } diff --git a/lttoolbox/compiler.cc b/lttoolbox/compiler.cc index 761caf33..a7fc8fae 100644 --- a/lttoolbox/compiler.cc +++ b/lttoolbox/compiler.cc @@ -946,40 +946,7 @@ Compiler::procRegexp() void Compiler::write(FILE *output) { - fwrite_unlocked(HEADER_LTTOOLBOX, 1, 4, output); - uint64_t features = 0; - features |= LTF_MMAP; - write_le_64(output, features); - 
- StringWriter sw; - StringRef letter_loc = sw.add(letters); - for (auto& it : alphabet.getTags()) { - sw.add(it); - } - for (auto& it : sections) { - sw.add(it.first); - } - - sw.write(output); - - // letters - write_le_32(output, letter_loc.start); - write_le_32(output, letter_loc.count); - - // symbols - alphabet.write_mmap(output, sw); - - // transducers - write_le_64(output, sections.size()); - - for(auto& it : sections) { - cout << it.first << " " << it.second.size(); - cout << " " << it.second.numberOfTransitions() << endl; - StringRef loc = sw.add(it.first); - write_le_32(output, loc.start); - write_le_32(output, loc.count); - it.second.write_mmap(output, alphabet); - } + write_transducer_set(output, letters, alphabet, sections); } void diff --git a/lttoolbox/lt_print.cc b/lttoolbox/lt_print.cc index c2c40dba..5ee894bf 100644 --- a/lttoolbox/lt_print.cc +++ b/lttoolbox/lt_print.cc @@ -130,67 +130,10 @@ int main(int argc, char *argv[]) } Alphabet alphabet; - set alphabetic_chars; - + UString letters; map transducers; - bool mmap = false; - fpos_t pos; - if (fgetpos(input, &pos) == 0) { - char header[4]{}; - fread_unlocked(header, 1, 4, input); - if (strncmp(header, HEADER_LTTOOLBOX, 4) == 0) { - auto features = read_le(input); - if (features >= LTF_UNKNOWN) { - throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); - } - mmap = features & LTF_MMAP; - } - else { - // Old binary format - fsetpos(input, &pos); - } - } - - if (mmap) { - StringWriter sw; - sw.read(input); - - uint32_t s = read_le_32(input); - uint32_t c = read_le_32(input); - vector vec; - ustring_to_vec32(sw.get(s, c), vec); - alphabetic_chars.insert(vec.begin(), vec.end()); - - alphabet.read_mmap(input, sw); - - uint64_t tr_count = read_le_64(input); - for (uint64_t i = 0; i < tr_count; i++) { - uint32_t s = read_le_32(input); - uint32_t c = read_le_32(input); - UString name = UString{sw.get(s, c)}; - transducers[name].read_mmap(input, alphabet); 
- } - } else { - // letters - int len = Compression::multibyte_read(input); - while(len > 0) { - alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); - len--; - } - - // symbols - alphabet.read(input); - - len = Compression::multibyte_read(input); - - while(len > 0) { - UString name = Compression::string_read(input); - transducers[name].read(input); - - len--; - } - } + read_transducer_set(input, letters, alphabet, transducers); ///////////////////// diff --git a/lttoolbox/lt_trim.cc b/lttoolbox/lt_trim.cc index e1e3dc35..75476654 100644 --- a/lttoolbox/lt_trim.cc +++ b/lttoolbox/lt_trim.cc @@ -38,79 +38,18 @@ void endProgram(char *name) exit(EXIT_FAILURE); } -std::pair, std::map > -read_fst(FILE *bin_file) -{ - Alphabet new_alphabet; - - std::map transducers; - - fpos_t pos; - bool mmap = false; - if (fgetpos(bin_file, &pos) == 0) { - char header[4]{}; - fread_unlocked(header, 1, 4, bin_file); - if (strncmp(header, HEADER_LTTOOLBOX, 4) == 0) { - auto features = read_le(bin_file); - if (features >= LTF_UNKNOWN) { - throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); - } - mmap = features & LTF_MMAP; - } - else { - // Old binary format - fsetpos(bin_file, &pos); - } - } - - UString letters; - - if (mmap) { - StringWriter sw; - sw.read(bin_file); - - uint32_t s = read_le_32(bin_file); - uint32_t c = read_le_32(bin_file); - letters = UString{sw.get(s, c)}; - - new_alphabet.read_mmap(bin_file, sw); - - uint64_t tr_count = read_le_64(bin_file); - for (uint64_t i = 0; i < tr_count; i++) { - uint32_t s = read_le_32(bin_file); - uint32_t c = read_le_32(bin_file); - UString name = UString{sw.get(s, c)}; - transducers[name].read_mmap(bin_file, new_alphabet); - } - } else { - // letters - letters = Compression::string_read(bin_file); - - // symbols - new_alphabet.read(bin_file); - - int len = Compression::multibyte_read(bin_file); - - while(len > 0) { - UString name = Compression::string_read(bin_file); - 
transducers[name].read(bin_file); - - len--; - } - } - - return make_pair(make_pair(new_alphabet, letters), transducers); -} - std::pair, std::map > trim(FILE *file_mono, FILE *file_bi) { - std::pair, std::map > alph_trans_mono = read_fst(file_mono); - Alphabet alph_mono = alph_trans_mono.first.first; - std::map trans_mono = alph_trans_mono.second; - std::pair, std::map > alph_trans_bi = read_fst(file_bi); - Alphabet alph_bi = alph_trans_bi.first.first; - std::map trans_bi = alph_trans_bi.second; + UString letters_mono; + Alphabet alph_mono; + std::map trans_mono; + read_transducer_set(file_mono, letters_mono, alph_mono, trans_mono); + + UString letters_bi; + Alphabet alph_bi; + std::map trans_bi; + read_transducer_set(file_bi, letters_bi, alph_bi, trans_bi); // The prefix transducer is the union of all transducers from bidix, // with a ".*" appended @@ -146,15 +85,13 @@ trim(FILE *file_mono, FILE *file_bi) alph_mono, alph_prefix); - cout << it->first << " " << it->second.size(); - cout << " " << it->second.numberOfTransitions() << endl; if(it->second.numberOfTransitions() == 0) { - cerr << "Warning: empty section! Skipping it ..."<first << " is empty! Skipping it ..."<first].clear(); } else if(trimmed.hasNoFinals()) { - cerr << "Warning: section had no final state after trimming! Skipping it ..."<first << " had no final state after trimming! Skipping it ..."<first].clear(); } else { @@ -163,8 +100,7 @@ trim(FILE *file_mono, FILE *file_bi) } } - alph_trans_mono.second = trans_mono; - return alph_trans_mono; + return make_pair(make_pair(alph_mono, letters_mono), trans_mono); } @@ -195,22 +131,6 @@ int main(int argc, char *argv[]) UString letters = trimmed.first.second; std::map trans_t = trimmed.second; - int n_transducers = 0; - for(auto& it : trans_t) { - if(!(it.second.isEmpty())) - { - n_transducers++; - } - } - - if(n_transducers == 0) - { - cerr << "Error: Trimming gave empty transducer!" 
<< endl; - cerr << "Hint: There are no words in bilingual dictionary that match " - "words in both monolingual dictionaries?" << endl; - exit(EXIT_FAILURE); - } - // Write the file: FILE *output = fopen(argv[3], "wb"); if(!output) @@ -219,20 +139,13 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } - // letters - Compression::string_write(letters, output); - - // symbols - alph_t.write(output); + int n_trans = write_transducer_set(output, letters, alph_t, trans_t, true); - // transducers - Compression::multibyte_write(n_transducers, output); - for(auto& it : trans_t) { - if(!(it.second.isEmpty())) - { - Compression::string_write(it.first, output); - it.second.write(output); - } + if (n_trans == 0) { + cerr << "Error: Trimming gave empty transducer!" << endl; + cerr << "Hint: There are no words in bilingual dictionary that match " + "words in both monolingual dictionaries?" << endl; + exit(EXIT_FAILURE); } fclose(analyser); diff --git a/lttoolbox/string_writer.cc b/lttoolbox/string_writer.cc index 4bc3d647..15d81f74 100644 --- a/lttoolbox/string_writer.cc +++ b/lttoolbox/string_writer.cc @@ -21,7 +21,7 @@ #include StringRef -StringWriter::add(const UString& s) +StringWriter::add(UString_view s) { auto start = buffer.find(s); if (start == UString::npos) { diff --git a/lttoolbox/string_writer.h b/lttoolbox/string_writer.h index 15fcaf38..12000a47 100644 --- a/lttoolbox/string_writer.h +++ b/lttoolbox/string_writer.h @@ -30,7 +30,7 @@ struct StringRef { class StringWriter { public: UString buffer; - StringRef add(const UString& s); + StringRef add(UString_view s); UString_view get(const uint32_t start, const uint32_t count); UString_view get(const StringRef& ref); void read(FILE* in); diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index d783c283..980c0246 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -1374,3 +1374,114 @@ Transducer::intersect(Transducer &trimmer, // (instead of exiting the whole program) if no finals. 
return trimmed; } + +void +read_transducer_set(FILE* input, UString& letters, Alphabet& alpha, + map& trans) +{ + fpos_t pos; + bool mmap = false; + if (fgetpos(input, &pos) == 0) { + char header[4]{}; + auto r = fread_unlocked(header, 1, 4, input); + if (r == 4 && strncmp(header, HEADER_LTTOOLBOX, 4) == 0) { + auto features = read_le_64(input); + if (features >= LTF_UNKNOWN) { + throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); + } + mmap = features & LTF_MMAP; + } + else { + // Old binary format + fsetpos(input, &pos); + } + } + + if (mmap) { + // make copies of all the strings we get from StringWriter + // because it gets deallocated when the function returns + StringWriter sw; + sw.read(input); + + // letters + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + letters = UString{sw.get(s, c)}; + + // symbols + alpha.read_mmap(input, sw); + + uint64_t tr_count = read_le_64(input); + for (uint64_t i = 0; i < tr_count; i++) { + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + UString name = UString{sw.get(s, c)}; + trans[name].read_mmap(input, alpha); + } + } else { + // letters + letters = Compression::string_read(input); + + // symbols + alpha.read(input); + + int len = Compression::multibyte_read(input); + + while(len > 0) { + UString name = Compression::string_read(input); + trans[name].read(input); + + len--; + } + } +} + +uint64_t +write_transducer_set(FILE* output, UString_view letters, Alphabet& alpha, + map& trans, + bool skip_empty) +{ + fwrite_unlocked(HEADER_LTTOOLBOX, 1, 4, output); + uint64_t features = 0; + features |= LTF_MMAP; + write_le_64(output, features); + + uint64_t transducer_count = trans.size(); + + StringWriter sw; + StringRef letter_loc = sw.add(letters); + for (auto& it : alpha.getTags()) { + sw.add(it); + } + for (auto& it : trans) { + if (skip_empty && it.second.isEmpty()) { + transducer_count--; + continue; + } + sw.add(it.first); + } + 
sw.write(output); + + // letters + write_le_32(output, letter_loc.start); + write_le_32(output, letter_loc.count); + + // symbols + alpha.write_mmap(output, sw); + + // transducers + write_le_64(output, transducer_count); + for (auto& it : trans) { + if (skip_empty && it.second.isEmpty()) { + continue; + } + cout << it.first << " " << it.second.size(); + cout << " " << it.second.numberOfTransitions() << endl; + StringRef loc = sw.add(it.first); + write_le_32(output, loc.start); + write_le_32(output, loc.count); + it.second.write_mmap(output, alpha); + } + + return transducer_count; +} diff --git a/lttoolbox/transducer.h b/lttoolbox/transducer.h index 89b0d9f9..394b6831 100644 --- a/lttoolbox/transducer.h +++ b/lttoolbox/transducer.h @@ -421,4 +421,11 @@ class Transducer }; +void read_transducer_set(FILE* input, UString& letters, Alphabet& alpha, + map& trans); +uint64_t write_transducer_set(FILE* output, + UString_view letters, Alphabet& alpha, + map& trans, + bool skip_empty=false); + #endif From dad61a585e49519942a8111888beb11283cebf76 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Mon, 2 Aug 2021 16:46:56 -0500 Subject: [PATCH 17/35] actually mmap --- lttoolbox/Makefile.am | 2 +- lttoolbox/alphabet_exe.cc | 16 +++++++++- lttoolbox/alphabet_exe.h | 2 ++ lttoolbox/endian_util.h | 64 +++++++++++++++++++++---------------- lttoolbox/fst_processor.cc | 62 +++++++++++++++++++++++++++-------- lttoolbox/fst_processor.h | 5 +++ lttoolbox/match_state.cc | 1 - lttoolbox/mmap.h | 34 ++++++++++++++++++++ lttoolbox/string_writer.cc | 47 +++++++++++++++++---------- lttoolbox/string_writer.h | 7 +++- lttoolbox/transducer.cc | 15 +++++---- lttoolbox/transducer_exe.cc | 37 ++++++++++++++++++--- lttoolbox/transducer_exe.h | 2 ++ 13 files changed, 220 insertions(+), 74 deletions(-) create mode 100644 lttoolbox/mmap.h diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index ec07213a..f3737b40 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -1,7 +1,7 @@ 
h_sources = alphabet.h alphabet_exe.h att_compiler.h buffer.h compiler.h compression.h \ deserialiser.h endian_util.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ - match_exe.h match_node.h match_state.h match_state2.h my_stdio.h node.h \ + match_exe.h match_node.h match_state.h match_state2.h mmap.h my_stdio.h node.h \ pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h \ transducer.h transducer_exe.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ ustring.h diff --git a/lttoolbox/alphabet_exe.cc b/lttoolbox/alphabet_exe.cc index 5f438651..0c682f0f 100644 --- a/lttoolbox/alphabet_exe.cc +++ b/lttoolbox/alphabet_exe.cc @@ -26,7 +26,9 @@ AlphabetExe::AlphabetExe(StringWriter* sw_) AlphabetExe::~AlphabetExe() { - delete[] tags; + if (!mmapping) { + delete[] tags; + } } void @@ -63,6 +65,18 @@ AlphabetExe::read(FILE* input, bool mmap) } } +void* +AlphabetExe::init(void* ptr) +{ + mmapping = true; + tag_count = from_le_64(reinterpret_cast(ptr)[0]); + tags = reinterpret_cast(ptr + sizeof(uint64_t)); + for (uint64_t i = 0; i < tag_count; i++) { + symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; + } + return ptr + sizeof(uint64_t) + tag_count*sizeof(StringRef); +} + int32_t AlphabetExe::operator()(UString_view sv) { diff --git a/lttoolbox/alphabet_exe.h b/lttoolbox/alphabet_exe.h index af579bb8..29dcdbe5 100644 --- a/lttoolbox/alphabet_exe.h +++ b/lttoolbox/alphabet_exe.h @@ -27,10 +27,12 @@ class AlphabetExe { uint64_t tag_count; StringRef* tags; std::map symbol_map; + bool mmapping = false; public: AlphabetExe(StringWriter* sw_); ~AlphabetExe(); void read(FILE* in, bool mmap); + void* init(void* ptr); int32_t operator()(UString_view sv); void getSymbol(UString& result, int32_t symbol, bool uppercase = false) const; bool isTag(const int32_t symbol) const; diff --git a/lttoolbox/endian_util.h b/lttoolbox/endian_util.h index 069bd74f..fadbbacc 100644 --- 
a/lttoolbox/endian_util.h +++ b/lttoolbox/endian_util.h @@ -23,40 +23,48 @@ #include #include -inline uint32_t to_le_32(uint32_t v) { - return (((v & 0xFF) << 24) | - ((v & 0xFF00) << 8) | - ((v & 0xFF0000) >> 8) | - ((v & 0xFF000000) >> 24)); +inline uint32_t to_le_32(uint32_t& v) { + uint8_t* bytes = reinterpret_cast(&v); + bytes[3] = (v >> 24) & 0xFF; + bytes[2] = (v >> 16) & 0xFF; + bytes[1] = (v >> 8) & 0xFF; + bytes[0] = v & 0xFF; + return v; } -inline uint32_t from_le_32(uint32_t v) { - return (((v & 0xFF000000) >> 24) | - ((v & 0xFF0000) >> 8) | - ((v & 0xFF00) << 8) | - ((v & 0xFF) << 24)); +inline uint32_t from_le_32(uint32_t& v) { + uint8_t* bytes = reinterpret_cast(&v); + v = ((bytes[3] << 24) | + (bytes[2] << 16) | + (bytes[1] << 8) | + bytes[0]); + return v; } -inline uint64_t to_le_64(uint64_t v) { - return (((v & 0xFF) << 56) | - ((v & 0xFF00) << 40) | - ((v & 0xFF0000) << 24) | - ((v & 0xFF000000) << 8) | - ((v & 0xFF00000000) >> 8) | - ((v & 0xFF0000000000) >> 24) | - ((v & 0xFF000000000000) >> 40) | - ((v & 0xFF00000000000000) >> 56)); +inline uint64_t to_le_64(uint64_t& v) { + uint8_t* bytes = reinterpret_cast(&v); + bytes[7] = (v >> 56) & 0xFF; + bytes[6] = (v >> 48) & 0xFF; + bytes[5] = (v >> 40) & 0xFF; + bytes[4] = (v >> 32) & 0xFF; + bytes[3] = (v >> 24) & 0xFF; + bytes[2] = (v >> 16) & 0xFF; + bytes[1] = (v >> 8) & 0xFF; + bytes[0] = v & 0xFF; + return v; } -inline uint64_t from_le_64(uint64_t v) { - return (((v & 0xFF00000000000000) >> 56) | - ((v & 0xFF000000000000) >> 40) | - ((v & 0xFF0000000000) >> 24) | - ((v & 0xFF00000000) >> 8) | - ((v & 0xFF000000) << 8) | - ((v & 0xFF0000) << 24) | - ((v & 0xFF00) << 40) | - ((v & 0xFF) << 56)); +inline uint64_t from_le_64(uint64_t& v) { + uint8_t* bytes = reinterpret_cast(&v); + v = ((static_cast(bytes[7]) << 56ull) | + (static_cast(bytes[6]) << 48ull) | + (static_cast(bytes[5]) << 40ull) | + (static_cast(bytes[4]) << 32ull) | + (static_cast(bytes[3]) << 24ull) | + (static_cast(bytes[2]) << 
16ull) | + (static_cast(bytes[1]) << 8ull) | + (static_cast(bytes[0]))); + return v; } inline auto write_le_32(FILE* out, uint32_t value) { diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 1bb7a92a..3b675c89 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -62,6 +63,13 @@ FSTProcessor::FSTProcessor() } } +FSTProcessor::~FSTProcessor() +{ + if (mmapping) { + munmap(mmap_pointer, mmap_len); + } +} + void FSTProcessor::streamError() { @@ -935,7 +943,7 @@ FSTProcessor::load(FILE *input) char header[4]{}; fread_unlocked(header, 1, 4, input); if (strncmp(header, HEADER_LTTOOLBOX, 4) == 0) { - auto features = read_le(input); + auto features = read_le_64(input); if (features >= LTF_UNKNOWN) { throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); } @@ -948,24 +956,50 @@ FSTProcessor::load(FILE *input) } if (mmap) { - str_write.read(input); + fgetpos(input, &pos); + rewind(input); + mmapping = mmap_file(input, mmap_pointer, mmap_len); + if (mmapping) { + void* ptr = mmap_pointer + 12; + ptr = str_write.init(ptr); - uint32_t s = read_le_32(input); - uint32_t c = read_le_32(input); - vector vec; - ustring_to_vec32(str_write.get(s, c), vec); - alphabetic_chars.insert(vec.begin(), vec.end()); - // alphabetic_chars + StringRef let_loc = reinterpret_cast(ptr)[0]; + vector vec; + ustring_to_vec32(str_write.get(let_loc), vec); + alphabetic_chars.insert(vec.begin(), vec.end()); + ptr += sizeof(StringRef); - alphabet.read(input, true); + ptr = alphabet.init(ptr); + + uint64_t tr_count = reinterpret_cast(ptr)[0]; + ptr += sizeof(uint64_t); + for (uint64_t i = 0; i < tr_count; i++) { + StringRef tn = reinterpret_cast(ptr)[0]; + ptr += sizeof(StringRef); + UString name = UString{str_write.get(tn)}; + ptr = transducers[name].init(ptr); + } + } else { + fsetpos(input, &pos); + + str_write.read(input); - uint64_t 
tr_count = read_le_64(input); - Alphabet temp; - for (uint64_t i = 0; i < tr_count; i++) { uint32_t s = read_le_32(input); uint32_t c = read_le_32(input); - UString name = UString{str_write.get(s, c)}; - transducers[name].read(input, temp); + vector vec; + ustring_to_vec32(str_write.get(s, c), vec); + alphabetic_chars.insert(vec.begin(), vec.end()); + + alphabet.read(input, true); + + uint64_t tr_count = read_le_64(input); + Alphabet temp; + for (uint64_t i = 0; i < tr_count; i++) { + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + UString name = UString{str_write.get(s, c)}; + transducers[name].read(input, temp); + } } } else { diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index 412bba22..7951b76c 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -259,6 +259,10 @@ class FSTProcessor */ int maxWeightClasses = INT_MAX; + bool mmapping = false; + void* mmap_pointer = nullptr; + int mmap_len = 0; + /** * Prints an error of input stream and exits */ @@ -498,6 +502,7 @@ class FSTProcessor static UString const WBLANK_FINAL; FSTProcessor(); + ~FSTProcessor(); void initAnalysis(); void initTMAnalysis(); diff --git a/lttoolbox/match_state.cc b/lttoolbox/match_state.cc index 84f4aef3..75e3ad7a 100644 --- a/lttoolbox/match_state.cc +++ b/lttoolbox/match_state.cc @@ -15,7 +15,6 @@ * along with this program; if not, see . 
*/ #include -#include #include #include diff --git a/lttoolbox/mmap.h b/lttoolbox/mmap.h new file mode 100644 index 00000000..221f57b5 --- /dev/null +++ b/lttoolbox/mmap.h @@ -0,0 +1,34 @@ +#ifndef _LT_MMAP_ +#define _LT_MMAP_ + +#include +#include +#include +#include +#include +#include + +#include + +#include + +inline bool mmap_file(FILE* fd, void*& ptr, int& len) +{ + std::cerr << "mmap_file()\n"; + struct stat sb; + if (fstat(fileno(fd), &sb) == -1) { + std::cerr << "fstat failed\n"; + return false; + } + len = sb.st_size; + std::cerr << "file length is " << len << "\n"; + ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fileno(fd), 0); + if (ptr == MAP_FAILED) { + std::cerr << "mmap failed\nerrno = " << errno << "\n"; + return false; + } + std::cerr << "got pointer\n"; + return true; +} + +#endif diff --git a/lttoolbox/string_writer.cc b/lttoolbox/string_writer.cc index 15d81f74..bb4d6eb6 100644 --- a/lttoolbox/string_writer.cc +++ b/lttoolbox/string_writer.cc @@ -23,10 +23,10 @@ StringRef StringWriter::add(UString_view s) { - auto start = buffer.find(s); + auto start = edit_buffer.find(s); if (start == UString::npos) { - start = buffer.size(); - buffer += s; + start = edit_buffer.size(); + edit_buffer += s; } StringRef ret; ret.start = start; @@ -37,43 +37,58 @@ StringWriter::add(UString_view s) UString_view StringWriter::get(const uint32_t start, const uint32_t count) { - UString_view ret(buffer); - return ret.substr(start, count); + if (mmapping) { + UString_view ret(mmap_buffer, mmap_size); + return ret.substr(start, count); + } else { + UString_view ret(edit_buffer); + return ret.substr(start, count); + } } UString_view StringWriter::get(const StringRef& ref) { - UString_view ret(buffer); - return ret.substr(ref.start, ref.count); + return get(ref.start, ref.count); } void StringWriter::read(FILE* in) { uint64_t len = read_le_64(in); - buffer.clear(); - buffer.reserve(len); + edit_buffer.clear(); + edit_buffer.reserve(len); uint8_t temp[len*2]{}; if 
(fread_unlocked(&temp, 1, len*2, in) != len*2) { throw std::runtime_error("Failed to read strings"); } uint16_t c; for (uint64_t i = 0; i < len*2; i += 2) { - buffer += static_cast(temp[i] | (temp[i+1] << 8)); + edit_buffer += static_cast(temp[i] | (temp[i+1] << 8)); } } void StringWriter::write(FILE* out) { - write_le_64(out, buffer.size()); - uint8_t temp[buffer.size()*2]{}; - for (uint64_t i = 0; i < buffer.size(); i++) { - temp[2*i] = buffer[i] & 0xFF; - temp[2*i+1] = (buffer[i] >> 8) & 0xFF; + write_le_64(out, edit_buffer.size()); + uint8_t temp[edit_buffer.size()*2]{}; + for (uint64_t i = 0; i < edit_buffer.size(); i++) { + temp[2*i] = edit_buffer[i] & 0xFF; + temp[2*i+1] = (edit_buffer[i] >> 8) & 0xFF; } - if (fwrite_unlocked(&temp, 1, buffer.size()*2, out) != buffer.size()*2) { + if (fwrite_unlocked(&temp, 1, edit_buffer.size()*2, out) != edit_buffer.size()*2) { throw std::runtime_error("Failed to write strings"); } } + +void* +StringWriter::init(void* ptr) +{ + mmapping = true; + mmap_size = reinterpret_cast(ptr)[0]; + ptr += sizeof(uint64_t); + mmap_buffer = reinterpret_cast(ptr); + get(0, mmap_size); + return ptr + sizeof(UChar)*mmap_size; +} diff --git a/lttoolbox/string_writer.h b/lttoolbox/string_writer.h index 12000a47..182180f1 100644 --- a/lttoolbox/string_writer.h +++ b/lttoolbox/string_writer.h @@ -28,13 +28,18 @@ struct StringRef { }; class StringWriter { +private: + bool mmapping = false; + UString edit_buffer; + uint64_t mmap_size; + UChar* mmap_buffer; public: - UString buffer; StringRef add(UString_view s); UString_view get(const uint32_t start, const uint32_t count); UString_view get(const StringRef& ref); void read(FILE* in); void write(FILE* out); + void* init(void* ptr); }; #endif diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index 980c0246..4b52ccfe 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -691,12 +691,12 @@ Transducer::read_mmap(FILE* in, Alphabet& alpha) } vector offsets; - 
offsets.reserve(state_count); + offsets.reserve(state_count+1); for (uint64_t i = 0; i < state_count; i++) { transitions[i].clear(); offsets.push_back(read_le_64(in)); } - offsets.push_back(0); + offsets.push_back(read_le_64(in)); uint64_t state = 0; for (uint64_t i = 0; i < trans_count; i++) { @@ -723,19 +723,20 @@ Transducer::write_mmap(FILE* out, const Alphabet& alpha) uint64_t tr_count = 0; vector offsets; - offsets.reserve(transitions.size()); + offsets.reserve(transitions.size()+1); for (auto& it : transitions) { offsets.push_back(tr_count); tr_count += it.second.size(); } + offsets.push_back(tr_count); // TODO: which things should be smaller than u64? uint64_t total_size = - ( transitions.size() + // offset of each state - (tr_count * 4) + // each transition - (finals.size() * 2) + // final states - 4 ); // initial state + length of each section + ( transitions.size() + 1 + // offset of each state + (tr_count * 3) + // each transition + (finals.size() * 2) + // final states + 4 ); // initial state + length of each section write_le_64(out, total_size*8); // number of bytes after this write_le_64(out, initial); // initial state diff --git a/lttoolbox/transducer_exe.cc b/lttoolbox/transducer_exe.cc index fd69922a..3b6fe602 100644 --- a/lttoolbox/transducer_exe.cc +++ b/lttoolbox/transducer_exe.cc @@ -32,9 +32,11 @@ TransducerExe::TransducerExe() : TransducerExe::~TransducerExe() { - delete[] finals; - delete[] offsets; - delete[] transitions; + if (!mmapping) { + delete[] finals; + delete[] offsets; + delete[] transitions; + } } void @@ -72,10 +74,9 @@ TransducerExe::read(FILE* input, Alphabet& alphabet) } offsets = new uint64_t[state_count+1]; - for (uint64_t i = 0; i < state_count; i++) { + for (uint64_t i = 0; i < state_count+1; i++) { offsets[i] = read_le_64(input); } - offsets[state_count] = transition_count; transitions = new Transition[transition_count]; for (uint64_t i = 0; i < transition_count; i++) { @@ -144,6 +145,32 @@ TransducerExe::read(FILE* 
input, Alphabet& alphabet) } } +void* +TransducerExe::init(void* ptr) +{ + mmapping = true; + + ptr += 4 + sizeof(uint64_t); // skip header + uint64_t* arr = reinterpret_cast(ptr); + uint64_t total_size = arr[0]; + initial = arr[1]; + state_count = arr[2]; + final_count = arr[3]; + transition_count = arr[4]; + ptr += sizeof(uint64_t)*5; + + finals = reinterpret_cast(ptr); + ptr += sizeof(Final)*final_count; + + offsets = reinterpret_cast(ptr); + ptr += sizeof(uint64_t)*(state_count+1); + + transitions = reinterpret_cast(ptr); + ptr += sizeof(Transition)*transition_count; + + return ptr; +} + void TransducerExe::get_range(const uint64_t state, const int32_t symbol, uint64_t& start, uint64_t& end) diff --git a/lttoolbox/transducer_exe.h b/lttoolbox/transducer_exe.h index eadf8942..cef9a306 100644 --- a/lttoolbox/transducer_exe.h +++ b/lttoolbox/transducer_exe.h @@ -50,6 +50,7 @@ class TransducerExe { Final* finals; uint64_t* offsets; Transition* transitions; + bool mmapping = false; void get_range(const uint64_t state, const int32_t sym, uint64_t& start, uint64_t& end); @@ -59,6 +60,7 @@ class TransducerExe { TransducerExe(); ~TransducerExe(); void read(FILE* input, Alphabet& alphabet); + void* init(void* ptr); }; #endif From a4b2962bc6fcc811eed1a192c826084dc04a4be2 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Mon, 2 Aug 2021 16:50:15 -0500 Subject: [PATCH 18/35] missed some debug statement --- lttoolbox/mmap.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/lttoolbox/mmap.h b/lttoolbox/mmap.h index 221f57b5..5b0b1dae 100644 --- a/lttoolbox/mmap.h +++ b/lttoolbox/mmap.h @@ -8,26 +8,22 @@ #include #include -#include - -#include +//DEBUG +//#include +//#include inline bool mmap_file(FILE* fd, void*& ptr, int& len) { - std::cerr << "mmap_file()\n"; struct stat sb; if (fstat(fileno(fd), &sb) == -1) { - std::cerr << "fstat failed\n"; return false; } len = sb.st_size; - std::cerr << "file length is " << len << "\n"; ptr = mmap(NULL, 
len, PROT_READ, MAP_SHARED, fileno(fd), 0); if (ptr == MAP_FAILED) { - std::cerr << "mmap failed\nerrno = " << errno << "\n"; + //std::cerr << "mmap failed\nerrno = " << errno << "\n"; return false; } - std::cerr << "got pointer\n"; return true; } From ccb9ab7b91c8eba77c519effc59d9e1296d546ef Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Mon, 2 Aug 2021 17:36:36 -0500 Subject: [PATCH 19/35] split read functions for TransducerExe --- lttoolbox/fst_processor.cc | 5 +- lttoolbox/transducer_exe.cc | 165 +++++++++++++++++++----------------- lttoolbox/transducer_exe.h | 3 +- 3 files changed, 93 insertions(+), 80 deletions(-) diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 3b675c89..cb8c7ce1 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -993,12 +993,11 @@ FSTProcessor::load(FILE *input) alphabet.read(input, true); uint64_t tr_count = read_le_64(input); - Alphabet temp; for (uint64_t i = 0; i < tr_count; i++) { uint32_t s = read_le_32(input); uint32_t c = read_le_32(input); UString name = UString{str_write.get(s, c)}; - transducers[name].read(input, temp); + transducers[name].read(input); } } } else { @@ -1021,7 +1020,7 @@ FSTProcessor::load(FILE *input) while(len > 0) { UString name = Compression::string_read(input); - transducers[name].read(input, temp); + transducers[name].read_compressed(input, temp); len--; } } diff --git a/lttoolbox/transducer_exe.cc b/lttoolbox/transducer_exe.cc index 3b6fe602..259f71e2 100644 --- a/lttoolbox/transducer_exe.cc +++ b/lttoolbox/transducer_exe.cc @@ -40,10 +40,9 @@ TransducerExe::~TransducerExe() } void -TransducerExe::read(FILE* input, Alphabet& alphabet) +TransducerExe::read_compressed(FILE* input, Alphabet& alphabet) { bool read_weights = false; // only matters for pre-mmap - bool mmap = false; fpos_t pos; fgetpos(input, &pos); char header[4]{}; @@ -54,94 +53,108 @@ TransducerExe::read(FILE* input, Alphabet& alphabet) throw std::runtime_error("Transducer has features 
that are unknown to this version of lttoolbox - upgrade!"); } read_weights = (features & TDF_WEIGHTS); - mmap = (features & TDF_MMAP); } else { // no header fsetpos(input, &pos); } - if (mmap) { - read_le_64(input); // total size - initial = read_le_64(input); - state_count = read_le_64(input); - final_count = read_le_64(input); - transition_count = read_le_64(input); - - finals = new Final[final_count]; - for (uint64_t i = 0; i < final_count; i++) { - finals[i].state = read_le_64(input); - finals[i].weight = read_le_double(input); - } + initial = Compression::multibyte_read(input); + final_count = Compression::multibyte_read(input); - offsets = new uint64_t[state_count+1]; - for (uint64_t i = 0; i < state_count+1; i++) { - offsets[i] = read_le_64(input); + uint64_t base_state = 0; + double base_weight = 0.0; + finals = new Final[final_count]; + for (uint64_t i = 0; i < final_count; i++) { + base_state += Compression::multibyte_read(input); + if (read_weights) { + base_weight += Compression::long_multibyte_read(input); } + finals[i].state = base_state; + finals[i].weight = base_weight; + } - transitions = new Transition[transition_count]; - for (uint64_t i = 0; i < transition_count; i++) { - transitions[i].isym = read_le_s32(input); - transitions[i].osym = read_le_s32(input); - transitions[i].dest = read_le_64(input); - transitions[i].weight = read_le_double(input); - } - } else { - initial = Compression::multibyte_read(input); - final_count = Compression::multibyte_read(input); - - uint64_t base_state = 0; - double base_weight = 0.0; - finals = new Final[final_count]; - for (uint64_t i = 0; i < final_count; i++) { - base_state += Compression::multibyte_read(input); + state_count = Compression::multibyte_read(input); + offsets = new uint64_t[state_count+1]; + transition_count = 0; + std::vector isyms, osyms; + std::vector dests; + std::vector weights; + for (uint64_t i = 0; i < state_count; i++) { + offsets[i] = transition_count; + std::map>>> temp; + uint64_t 
count = Compression::multibyte_read(input); + transition_count += count; + int32_t tag_base = 0; + for (uint64_t t = 0; t < count; t++) { + tag_base += Compression::multibyte_read(input); + uint64_t dest = (i + Compression::multibyte_read(input)) % state_count; if (read_weights) { - base_weight += Compression::long_multibyte_read(input); + base_weight = Compression::long_multibyte_read(input); } - finals[i].state = base_state; - finals[i].weight = base_weight; + auto sym = alphabet.decode(tag_base); + temp[sym.first].push_back(make_pair(sym.second, + make_pair(dest, base_weight))); } - - state_count = Compression::multibyte_read(input); - offsets = new uint64_t[state_count+1]; - transition_count = 0; - std::vector isyms, osyms; - std::vector dests; - std::vector weights; - for (uint64_t i = 0; i < state_count; i++) { - offsets[i] = transition_count; - std::map>>> temp; - uint64_t count = Compression::multibyte_read(input); - transition_count += count; - int32_t tag_base = 0; - for (uint64_t t = 0; t < count; t++) { - tag_base += Compression::multibyte_read(input); - uint64_t dest = (i + Compression::multibyte_read(input)) % state_count; - if (read_weights) { - base_weight = Compression::long_multibyte_read(input); - } - auto sym = alphabet.decode(tag_base); - temp[sym.first].push_back(make_pair(sym.second, - make_pair(dest, base_weight))); - } - for (auto& it : temp) { - for (auto& it2 : it.second) { - isyms.push_back(it.first); - osyms.push_back(it2.first); - dests.push_back(it2.second.first); - weights.push_back(it2.second.second); - } + for (auto& it : temp) { + for (auto& it2 : it.second) { + isyms.push_back(it.first); + osyms.push_back(it2.first); + dests.push_back(it2.second.first); + weights.push_back(it2.second.second); } } - offsets[state_count] = transition_count; - transitions = new Transition[transition_count]; - for (uint64_t i = 0; i < transition_count; i++) { - transitions[i].isym = isyms[i]; - transitions[i].osym = osyms[i]; - transitions[i].dest = 
dests[i]; - transitions[i].weight = weights[i]; + } + offsets[state_count] = transition_count; + transitions = new Transition[transition_count]; + for (uint64_t i = 0; i < transition_count; i++) { + transitions[i].isym = isyms[i]; + transitions[i].osym = osyms[i]; + transitions[i].dest = dests[i]; + transitions[i].weight = weights[i]; + } +} + +void +TransducerExe::read(FILE* input) +{ + fpos_t pos; + fgetpos(input, &pos); + char header[4]{}; + auto l = fread_unlocked(header, 1, 4, input); + if (l == 4 && strncmp(header, HEADER_TRANSDUCER, 4) == 0) { + auto features = read_le_64(input); + if (features >= TDF_UNKNOWN) { + throw std::runtime_error("Transducer has features that are unknown to this version of lttoolbox - upgrade!"); } + } else { + throw std::runtime_error("Unable to read transducer header!"); + } + + read_le_64(input); // total size + initial = read_le_64(input); + state_count = read_le_64(input); + final_count = read_le_64(input); + transition_count = read_le_64(input); + + finals = new Final[final_count]; + for (uint64_t i = 0; i < final_count; i++) { + finals[i].state = read_le_64(input); + finals[i].weight = read_le_double(input); + } + + offsets = new uint64_t[state_count+1]; + for (uint64_t i = 0; i < state_count+1; i++) { + offsets[i] = read_le_64(input); + } + + transitions = new Transition[transition_count]; + for (uint64_t i = 0; i < transition_count; i++) { + transitions[i].isym = read_le_s32(input); + transitions[i].osym = read_le_s32(input); + transitions[i].dest = read_le_64(input); + transitions[i].weight = read_le_double(input); } } diff --git a/lttoolbox/transducer_exe.h b/lttoolbox/transducer_exe.h index cef9a306..6b7d93b6 100644 --- a/lttoolbox/transducer_exe.h +++ b/lttoolbox/transducer_exe.h @@ -59,7 +59,8 @@ class TransducerExe { public: TransducerExe(); ~TransducerExe(); - void read(FILE* input, Alphabet& alphabet); + void read_compressed(FILE* input, Alphabet& alphabet); + void read(FILE* input); void* init(void* ptr); }; From 
f9d54e1890ebf9edc73f1f2db3f07e1535fc5cf0 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Mon, 2 Aug 2021 20:19:11 -0500 Subject: [PATCH 20/35] lsx-proc needs to add symbols at runtime, so support that --- lttoolbox/alphabet_exe.cc | 39 ++++++++++++++++++++++++++++++++++++++- lttoolbox/alphabet_exe.h | 5 +++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/lttoolbox/alphabet_exe.cc b/lttoolbox/alphabet_exe.cc index 0c682f0f..542a6663 100644 --- a/lttoolbox/alphabet_exe.cc +++ b/lttoolbox/alphabet_exe.cc @@ -94,7 +94,12 @@ AlphabetExe::getSymbol(UString& result, int32_t symbol, bool uppercase) const if (symbol == 0) { return; } else if (symbol < 0) { - result.append(sw->get(tags[-symbol-1])); + int idx = -symbol-1; + if (idx < tag_count) { + result.append(sw->get(tags[idx])); + } else { + result.append(dynamic_symbols[idx-tag_count]); + } } else if (uppercase) { result += u_toupper(static_cast(symbol)); } else { @@ -116,3 +121,35 @@ AlphabetExe::clearSymbol(const int32_t symbol) tags[-symbol-1].count = 0; } } + +int32_t +AlphabetExe::lookupDynamic(const UString& symbol) +{ + int32_t ret; + auto it = symbol_map.find(symbol); + if (it == symbol_map.end()) { + if (dynamic_symbols.empty()) { + // should be able to usually avoid reindexing with this + dynamic_symbols.reserve(32); + } + ret = -tag_count -dynamic_symbols.size() -1; + bool rebuild = (dynamic_symbols.size() == dynamic_symbols.capacity()); + dynamic_symbols.push_back(symbol); + symbol_map[dynamic_symbols.back()] = ret; + if (rebuild) { + // moderately horrible, but that's what we get for invalidating + // all the views when dynamic_symbols gets reallocated + symbol_map.clear(); + for (uint64_t i = 0; i < tag_count; i++) { + symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; + } + int32_t n = -tag_count-1; + for (auto& ds : dynamic_symbols) { + symbol_map[ds] = n--; + } + } + } else { + ret = it->second; + } + return ret; +} diff --git a/lttoolbox/alphabet_exe.h b/lttoolbox/alphabet_exe.h 
index 29dcdbe5..52ccdb95 100644 --- a/lttoolbox/alphabet_exe.h +++ b/lttoolbox/alphabet_exe.h @@ -20,6 +20,7 @@ #include #include +#include class AlphabetExe { private: @@ -28,6 +29,8 @@ class AlphabetExe { StringRef* tags; std::map symbol_map; bool mmapping = false; + // tags added at runtime - used by apertium-separable + std::vector dynamic_symbols; public: AlphabetExe(StringWriter* sw_); ~AlphabetExe(); @@ -37,6 +40,8 @@ class AlphabetExe { void getSymbol(UString& result, int32_t symbol, bool uppercase = false) const; bool isTag(const int32_t symbol) const; void clearSymbol(const int32_t symbol); + // like operator() but add symbol to dynamic_symbols if not found + int32_t lookupDynamic(const UString& symbol); }; #endif From 66400c6fcff045f57056d59407c469ae51aa2648 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Tue, 3 Aug 2021 15:53:42 -0500 Subject: [PATCH 21/35] pass along -DHAVE_STRING_VIEW to other repos --- configure.ac | 5 +++++ lttoolbox.pc.in | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 96baab2e..b46971fc 100644 --- a/configure.ac +++ b/configure.ac @@ -47,6 +47,11 @@ AC_CHECK_LIB(xml2, xmlReaderForFile) # Checks for header files. AC_HEADER_STDC AC_CHECK_HEADERS([stdlib.h string.h unistd.h stddef.h string_view]) + +have_sv="" +AC_CHECK_HEADERS([string_view], [have_sv="-DHAVE_STRING_VIEW"], [have_sv=""]) +AC_SUBST([have_sv]) + AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])]) # Checks for typedefs, structures, and compiler characteristics. 
diff --git a/lttoolbox.pc.in b/lttoolbox.pc.in index 9ecf8f51..7b435232 100644 --- a/lttoolbox.pc.in +++ b/lttoolbox.pc.in @@ -7,4 +7,4 @@ Name: lttoolbox Description: Augmented letter transducer tools for natural language processing Version: @VERSION@ Libs: -L${libdir} -llttoolbox@VERSION_MAJOR@ -Cflags: -I${includedir}/lttoolbox-@VERSION_API@ +Cflags: -I${includedir}/lttoolbox-@VERSION_API@ @have_sv@ From 8517e219105b8f7c0cb421543f755cc4c7c31934 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Wed, 4 Aug 2021 17:37:33 -0500 Subject: [PATCH 22/35] yet more helper functions --- lttoolbox/alphabet_exe.cc | 2 +- lttoolbox/alphabet_exe.h | 2 +- lttoolbox/match_state2.cc | 44 +++++++++++++++++++++++++++++++++++++-- lttoolbox/match_state2.h | 5 +++-- lttoolbox/string_utils.cc | 21 +++++++++++++++++++ lttoolbox/string_utils.h | 2 ++ 6 files changed, 70 insertions(+), 6 deletions(-) diff --git a/lttoolbox/alphabet_exe.cc b/lttoolbox/alphabet_exe.cc index 542a6663..40d27e5e 100644 --- a/lttoolbox/alphabet_exe.cc +++ b/lttoolbox/alphabet_exe.cc @@ -78,7 +78,7 @@ AlphabetExe::init(void* ptr) } int32_t -AlphabetExe::operator()(UString_view sv) +AlphabetExe::operator()(UString_view sv) const { auto it = symbol_map.find(sv); if (it != symbol_map.end()) { diff --git a/lttoolbox/alphabet_exe.h b/lttoolbox/alphabet_exe.h index 52ccdb95..e57a1c81 100644 --- a/lttoolbox/alphabet_exe.h +++ b/lttoolbox/alphabet_exe.h @@ -36,7 +36,7 @@ class AlphabetExe { ~AlphabetExe(); void read(FILE* in, bool mmap); void* init(void* ptr); - int32_t operator()(UString_view sv); + int32_t operator()(UString_view sv) const; void getSymbol(UString& result, int32_t symbol, bool uppercase = false) const; bool isTag(const int32_t symbol) const; void clearSymbol(const int32_t symbol); diff --git a/lttoolbox/match_state2.cc b/lttoolbox/match_state2.cc index 3738fd32..ddf25503 100644 --- a/lttoolbox/match_state2.cc +++ b/lttoolbox/match_state2.cc @@ -100,9 +100,49 @@ MatchState2::step(const int32_t input, 
const int32_t alt) } void -MatchState2::step(UString_view input, const Alphabet& alpha, bool foldcase) +MatchState2::step(const int32_t input, const int32_t alt1, int32_t alt2) { - // TODO + uint16_t temp_last = last; + for (uint16_t i = first; i != temp_last; i = (i+1)%BUF_LIMIT) { + applySymbol(buffer[i], input); + applySymbol(buffer[i], alt1); + applySymbol(buffer[i], alt2); + } + first = temp_last; +} + +void +MatchState2::step(UString_view input, const AlphabetExe& alpha, bool foldcase) +{ + int32_t any_char = alpha(""_u); + int32_t any_tag = alpha(""_u); + for (uint64_t i = 0; i < input.size(); i++) { + if (input[i] == '<') { + for (uint64_t j = i+1; j < input.size(); j++) { + if (input[j] == '\\') { + j++; + } else if (input[j] == '>') { + int32_t sym = alpha(input.substr(i, j-i+1)); + if (sym) { + step(sym, any_tag); + } else { + step(any_tag); + } + i = j; + break; + } + } + } else { + if (input[i] == '\\') { + i++; + } + if (foldcase && u_isupper(input[i])) { + step(input[i], u_tolower(input[i]), any_char); + } else { + step(input[i], any_char); + } + } + } } int diff --git a/lttoolbox/match_state2.h b/lttoolbox/match_state2.h index 1542c053..4b0ca285 100644 --- a/lttoolbox/match_state2.h +++ b/lttoolbox/match_state2.h @@ -18,7 +18,7 @@ #ifndef _LT_MATCH_STATE_ #define _LT_MATCH_STATE_ -#include +#include #include #include @@ -44,7 +44,8 @@ class MatchState2 bool empty() const; void step(const int32_t input); void step(const int32_t input, const int32_t alt); - void step(UString_view input, const Alphabet& alpha, bool foldcase = true); + void step(const int32_t input, const int32_t alt1, const int32_t alt2); + void step(UString_view input, const AlphabetExe& alpha, bool foldcase = true); int classifyFinals(const std::map& finals, const std::set& banned_rules) const; int classifyFinals(const std::map& finals) const; diff --git a/lttoolbox/string_utils.cc b/lttoolbox/string_utils.cc index 411380da..4b57d55f 100644 --- a/lttoolbox/string_utils.cc +++ 
b/lttoolbox/string_utils.cc @@ -66,6 +66,27 @@ StringUtils::split(const UString& str, const UString& delim) return result; } +std::vector +StringUtils::split_escape(UString_view str, const UChar delim) +{ + std::vector ret; + size_t last = 0; + for (size_t i = 0; i < str.size(); i++) { + if (str[i] == '\\') { + i++; + } else if (str[i] == delim) { + if (i > last) { + ret.push_back(str.substr(last, i-last)); + } + last = i+1; + } + } + if (str.size() > last) { + ret.push_back(str.substr(last)); + } + return ret; +} + UString StringUtils::join(const std::vector& vec, const UString& delim) { diff --git a/lttoolbox/string_utils.h b/lttoolbox/string_utils.h index 79aeadff..d8510f96 100644 --- a/lttoolbox/string_utils.h +++ b/lttoolbox/string_utils.h @@ -11,6 +11,8 @@ class StringUtils { // split string on delimiter static std::vector split(const UString& str, const UString& delim); + // split, but respect \ escapes + static std::vector split_escape(UString_view str, const UChar delim); // inverse of split static UString join(const std::vector& vec, const UString& delim); From 71553295b1c57a4ebd9bae23d35a0a871de6bfc3 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Thu, 5 Aug 2021 15:48:43 -0500 Subject: [PATCH 23/35] reading in offsets --- lttoolbox/transducer_exe.cc | 16 ++++++++++++---- lttoolbox/transducer_exe.h | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/lttoolbox/transducer_exe.cc b/lttoolbox/transducer_exe.cc index 259f71e2..4c37706b 100644 --- a/lttoolbox/transducer_exe.cc +++ b/lttoolbox/transducer_exe.cc @@ -40,7 +40,7 @@ TransducerExe::~TransducerExe() } void -TransducerExe::read_compressed(FILE* input, Alphabet& alphabet) +TransducerExe::read_compressed(FILE* input, Alphabet& alphabet, bool match) { bool read_weights = false; // only matters for pre-mmap fpos_t pos; @@ -89,13 +89,21 @@ TransducerExe::read_compressed(FILE* input, Alphabet& alphabet) int32_t tag_base = 0; for (uint64_t t = 0; t < count; t++) { tag_base += 
Compression::multibyte_read(input); + if (match) { + tag_base -= alphabet.size(); + } uint64_t dest = (i + Compression::multibyte_read(input)) % state_count; if (read_weights) { base_weight = Compression::long_multibyte_read(input); } - auto sym = alphabet.decode(tag_base); - temp[sym.first].push_back(make_pair(sym.second, - make_pair(dest, base_weight))); + if (match) { + temp[tag_base].push_back(make_pair(tag_base, + make_pair(dest, base_weight))); + } else { + auto sym = alphabet.decode(tag_base); + temp[sym.first].push_back(make_pair(sym.second, + make_pair(dest, base_weight))); + } } for (auto& it : temp) { for (auto& it2 : it.second) { diff --git a/lttoolbox/transducer_exe.h b/lttoolbox/transducer_exe.h index 6b7d93b6..665e4296 100644 --- a/lttoolbox/transducer_exe.h +++ b/lttoolbox/transducer_exe.h @@ -59,7 +59,7 @@ class TransducerExe { public: TransducerExe(); ~TransducerExe(); - void read_compressed(FILE* input, Alphabet& alphabet); + void read_compressed(FILE* input, Alphabet& alphabet, bool match = false); void read(FILE* input); void* init(void* ptr); }; From 17a257d3bce48df37fdbf0fcb6496478e585d5d5 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Sat, 7 Aug 2021 10:09:28 -0500 Subject: [PATCH 24/35] const lookup function for StringWriter --- lttoolbox/string_writer.cc | 20 ++++++++++++++++++++ lttoolbox/string_writer.h | 1 + 2 files changed, 21 insertions(+) diff --git a/lttoolbox/string_writer.cc b/lttoolbox/string_writer.cc index bb4d6eb6..1d76bbb9 100644 --- a/lttoolbox/string_writer.cc +++ b/lttoolbox/string_writer.cc @@ -34,6 +34,26 @@ StringWriter::add(UString_view s) return ret; } +StringRef +StringWriter::find(UString_view s) const +{ + StringRef ret; + ret.start = 0; + ret.count = 0; + UString_view buf; + if (mmapping) { + buf = UString_view(mmap_buffer, mmap_size); + } else { + buf = UString_view(edit_buffer); + } + auto start = buf.find(s); + if (start != UString_view::npos) { + ret.start = start; + ret.count = s.size(); + } + return 
ret; +} + UString_view StringWriter::get(const uint32_t start, const uint32_t count) { diff --git a/lttoolbox/string_writer.h b/lttoolbox/string_writer.h index 182180f1..300c4fd6 100644 --- a/lttoolbox/string_writer.h +++ b/lttoolbox/string_writer.h @@ -35,6 +35,7 @@ class StringWriter { UChar* mmap_buffer; public: StringRef add(UString_view s); + StringRef find(UString_view s) const; UString_view get(const uint32_t start, const uint32_t count); UString_view get(const StringRef& ref); void read(FILE* in); From 55d00ac739129c6db5da40a748f8853bcb7c162b Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Sat, 21 Aug 2021 16:13:47 -0400 Subject: [PATCH 25/35] start dropping compression.h --- lttoolbox/Makefile.am | 6 +- lttoolbox/alphabet_exe.cc | 12 ++-- lttoolbox/binary_headers.h | 23 +++++++ lttoolbox/compression.h | 19 +----- lttoolbox/fst_processor.cc | 12 ++-- lttoolbox/old_binary.cc | 129 +++++++++++++++++++++++++++++++++++++ lttoolbox/old_binary.h | 16 +++++ 7 files changed, 185 insertions(+), 32 deletions(-) create mode 100644 lttoolbox/binary_headers.h create mode 100644 lttoolbox/old_binary.cc create mode 100644 lttoolbox/old_binary.h diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index f3737b40..fde33790 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -1,13 +1,13 @@ -h_sources = alphabet.h alphabet_exe.h att_compiler.h buffer.h compiler.h compression.h \ +h_sources = alphabet.h alphabet_exe.h att_compiler.h binary_headers.h buffer.h compiler.h compression.h \ deserialiser.h endian_util.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ - match_exe.h match_node.h match_state.h match_state2.h mmap.h my_stdio.h node.h \ + match_exe.h match_node.h match_state.h match_state2.h mmap.h my_stdio.h node.h old_binary.h \ pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h \ transducer.h transducer_exe.h trans_exe.h xml_parse_util.h xml_walk_util.h 
exception.h tmx_compiler.h \ ustring.h cc_sources = alphabet.cc alphabet_exe.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ expander.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \ - match_node.cc match_state.cc match_state2.cc node.cc pattern_list.cc \ + match_node.cc match_state.cc match_state2.cc node.cc old_binary.cc pattern_list.cc \ regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc transducer.cc transducer_exe.cc \ trans_exe.cc xml_parse_util.cc xml_walk_util.cc tmx_compiler.cc ustring.cc diff --git a/lttoolbox/alphabet_exe.cc b/lttoolbox/alphabet_exe.cc index 40d27e5e..81d26441 100644 --- a/lttoolbox/alphabet_exe.cc +++ b/lttoolbox/alphabet_exe.cc @@ -17,8 +17,8 @@ #include -#include #include +#include AlphabetExe::AlphabetExe(StringWriter* sw_) : sw(sw_), tag_count(0), tags(nullptr) @@ -43,12 +43,12 @@ AlphabetExe::read(FILE* input, bool mmap) symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; } } else { - tag_count = Compression::multibyte_read(input); + tag_count = OldBinary::read_int(input); tags = new StringRef[tag_count]; for (uint32_t i = 0; i < tag_count; i++) { UString tg; tg += '<'; - tg += Compression::string_read(input); + OldBinary::read_ustr(input, tg); tg += '>'; tags[i] = sw->add(tg); } @@ -57,10 +57,10 @@ AlphabetExe::read(FILE* input, bool mmap) for (uint32_t i = 0; i < tag_count; i++) { symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; } - int pairs = Compression::multibyte_read(input); + int pairs = OldBinary::read_int(input); for (int i = 0; i < pairs; i++) { - Compression::multibyte_read(input); - Compression::multibyte_read(input); + OldBinary::read_int(input); + OldBinary::read_int(input); } } } diff --git a/lttoolbox/binary_headers.h b/lttoolbox/binary_headers.h new file mode 100644 index 00000000..878c7bce --- /dev/null +++ b/lttoolbox/binary_headers.h @@ -0,0 +1,23 @@ +#ifndef _LT_BINARY_HEADERS_ +#define _LT_BINARY_HEADERS_ + +#include + +// Global lttoolbox features 
+constexpr char HEADER_LTTOOLBOX[4]{'L', 'T', 'T', 'B'}; +enum LT_FEATURES : uint64_t { + LTF_MMAP = (1ull << 0), // using mmap-compatible format rather than compressed format + LTF_UNKNOWN = (1ull << 1), // Features >= this are unknown, so throw an error; Inc this if more features are added + LTF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits +}; + +// Invididual transducer features +constexpr char HEADER_TRANSDUCER[4]{'L', 'T', 'T', 'D'}; +enum TD_FEATURES : uint64_t { + TDF_WEIGHTS = (1ull << 0), + TDF_MMAP = (1ull << 1), + TDF_UNKNOWN = (1ull << 2), // Features >= this are unknown, so throw an error; Inc this if more features are added + TDF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits +}; + +#endif \ No newline at end of file diff --git a/lttoolbox/compression.h b/lttoolbox/compression.h index 5783f774..739290d4 100644 --- a/lttoolbox/compression.h +++ b/lttoolbox/compression.h @@ -23,27 +23,10 @@ #include #include #include +#include using namespace std; -// Global lttoolbox features -constexpr char HEADER_LTTOOLBOX[4]{'L', 'T', 'T', 'B'}; -enum LT_FEATURES : uint64_t { - LTF_MMAP = (1ull << 0), // using mmap-compatible format rather than compressed format - LTF_UNKNOWN = (1ull << 1), // Features >= this are unknown, so throw an error; Inc this if more features are added - LTF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits -}; - -// Invididual transducer features -constexpr char HEADER_TRANSDUCER[4]{'L', 'T', 'T', 'D'}; -enum TD_FEATURES : uint64_t { - TDF_WEIGHTS = (1ull << 0), - TDF_MMAP = (1ull << 1), - TDF_UNKNOWN = (1ull << 2), // Features >= this are unknown, so throw an error; Inc this if more features are added - TDF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 
64 bits -}; - - inline auto write_u64(FILE *out, uint64_t value) { auto rv = fwrite_unlocked(reinterpret_cast(&value), 1, sizeof(value), out); if (rv != sizeof(value)) { diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index cb8c7ce1..267aeecd 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -15,10 +15,11 @@ * along with this program; if not, see . */ #include -#include +#include #include #include #include +#include #include #include @@ -1003,9 +1004,9 @@ FSTProcessor::load(FILE *input) } else { // letters - int len = Compression::multibyte_read(input); + uint64_t len = OldBinary::read_int(input); while(len > 0) { - alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); + alphabetic_chars.insert(static_cast(OldBinary::read_int(input))); len--; } @@ -1016,10 +1017,11 @@ FSTProcessor::load(FILE *input) Alphabet temp; temp.read(input); - len = Compression::multibyte_read(input); + len = OldBinary::read_int(input); while(len > 0) { - UString name = Compression::string_read(input); + UString name; + OldBinary::read_ustr(input, name); transducers[name].read_compressed(input, temp); len--; } diff --git a/lttoolbox/old_binary.cc b/lttoolbox/old_binary.cc new file mode 100644 index 00000000..a2d5e55b --- /dev/null +++ b/lttoolbox/old_binary.cc @@ -0,0 +1,129 @@ +#include +#include +#include + +using namespace OldBinary; + +uint64_t +OldBinary::read_u64(FILE* in) +{ + uint64_t v = 0; + if (fread_unlocked(reinterpret_cast(&v), 1, sizeof(v), in) != sizeof(v)) { + throw std::runtime_error("Failed to read uint64_t"); + } + // these are unconditional byte-swaps, so on little-endian platforms + // this reads big-endian data + // this is very bad, but it's the way all the old data was written, + // so we have this here for backwards compatibility until we drop + // support for lttoolbox/apertium <= 3 + // -DGS 2021-08-21 + return (((v & 0xFF00000000000000) >> 56) | + ((v & 0xFF000000000000) >> 40) | + ((v & 
0xFF0000000000) >> 24) | + ((v & 0xFF00000000) >> 8) | + ((v & 0xFF000000) << 8) | + ((v & 0xFF0000) << 24) | + ((v & 0xFF00) << 40) | + ((v & 0xFF) << 56)); +} + +uint64_t read_byte(FILE* in) +{ + unsigned char ret = 0; + if (fread_unlocked(&ret, 1, 1, in) != 1) { + throw std::runtime_error("Failed to read byte"); + } + return ret; +} + +uint64_t +OldBinary::read_int(FILE* in, bool compression) +{ + if (compression) { + uint64_t up = read_byte(in); + if (up < 0x40) { + return up; + } else if (up < 0x80) { + return ((up & 0x3f) << 8) | read_byte(in); + } else if (up < 0xc0) { + uint64_t ret = (up & 0x3f) << 8; + ret |= read_byte(in); + return (ret << 8) | read_byte(in); + } else { + uint64_t ret = ((up & 0x3f) << 8) | read_byte(in); + ret = (ret << 8) | read_byte(in); + ret = (ret << 8) | read_byte(in); + return ret; + } + } else { + uint64_t ret = 0; + uint64_t size = read_byte(in); + if (size > 8) { + throw std::runtime_error("can't deserialise int"); + } + uint8_t buffer[8]; + if (fread_unlocked(buffer, 1, size, in) != size) { + throw std::runtime_error("can't deserialise int"); + } + for (uint8_t i = 0; i < size; i++) { + ret += static_cast(buffer[i]) << (8 * (size - i - 1)); + } + return ret; + } +} + +void +OldBinary::read_ustr(FILE* in, UString& s, bool compression) +{ + uint64_t count = read_int(in, compression); + for (uint64_t i = 0; i < count; i++) { + s += static_cast(read_int(in, compression)); + } +} + +void +OldBinary::read_str(FILE* in, std::string& s, bool compression) +{ + uint64_t count = read_int(in, compression); + for (uint64_t i = 0; i < count; i++) { + s += static_cast(read_int(in, compression)); + } +} + +double +OldBinary::read_double(FILE* in, bool compression, bool endian_util) +{ + if (compression) { + if (endian_util) { + double retval; +#ifdef WORDS_BIGENDIAN + fread_unlocked(&retval, sizeof(double), 1, input); +#else + char *s = reinterpret_cast(&retval); + + for(int i = sizeof(double)-1; i != -1; i--) { + if(fread_unlocked(&(s[i]), 
1, 1, in)==0) { + return 0; + } + } +#endif + return retval; + } else { + uint64_t mantissa = read_int(in, true); + if (mantissa >= 0x04000000) { + mantissa = ((mantissa & 0x03ffffff) << 26) | read_int(in, true); + } + + uint64_t exponent = read_int(in, true); + if (exponent >= 0x04000000) { + exponent = ((exponent & 0x03ffffff) << 26) | read_int(in, true); + } + + double v = static_cast(static_cast(mantissa)) / 0x40000000; + return ldexp(v, static_cast(exponent)); + } + } else { + uint64_t d = read_int(in, false); + return *reinterpret_cast(&d); + } +} diff --git a/lttoolbox/old_binary.h b/lttoolbox/old_binary.h new file mode 100644 index 00000000..b7986911 --- /dev/null +++ b/lttoolbox/old_binary.h @@ -0,0 +1,16 @@ +#ifndef _LT_OLD_BINARY_ +#define _LT_OLD_BINARY_ + +#include +#include +#include + +namespace OldBinary { + uint64_t read_u64(FILE* in); + uint64_t read_int(FILE* in, bool compression=true); + void read_ustr(FILE* in, UString& s, bool compression=true); + void read_str(FILE* in, std::string& s, bool compression=true); + double read_double(FILE* in, bool compression=true, bool endian_util=false); +}; + +#endif From 4aa03cb84009aebb6acb5b2ca98056f26579c686 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Sat, 21 Aug 2021 16:39:13 -0400 Subject: [PATCH 26/35] continuing to maintain odd (buggy?) 
behavior --- lttoolbox/old_binary.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lttoolbox/old_binary.cc b/lttoolbox/old_binary.cc index a2d5e55b..b33a436d 100644 --- a/lttoolbox/old_binary.cc +++ b/lttoolbox/old_binary.cc @@ -31,7 +31,8 @@ uint64_t read_byte(FILE* in) { unsigned char ret = 0; if (fread_unlocked(&ret, 1, 1, in) != 1) { - throw std::runtime_error("Failed to read byte"); + // for some reason things break if this is an error + //throw std::runtime_error("Failed to read byte"); } return ret; } From 6c0a7120ef6f85451964a47ab09670ce5a54c48d Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Tue, 31 Aug 2021 11:51:12 -0400 Subject: [PATCH 27/35] =?UTF-8?q?Compression=20=E2=86=92=20OldBinary,=20se?= =?UTF-8?q?rialiser=20readers=20for=20perceptron?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lttoolbox/alphabet.cc | 46 ++++++++++++++------ lttoolbox/alphabet.h | 2 + lttoolbox/alphabet_exe.cc | 18 ++++---- lttoolbox/alphabet_exe.h | 2 +- lttoolbox/transducer.cc | 47 +++++++++++++++------ lttoolbox/transducer.h | 2 + lttoolbox/transducer_exe.cc | 83 ++++++++++++++++++++++++++++++++----- lttoolbox/transducer_exe.h | 1 + 8 files changed, 157 insertions(+), 44 deletions(-) diff --git a/lttoolbox/alphabet.cc b/lttoolbox/alphabet.cc index d6ef296c..50629ebb 100644 --- a/lttoolbox/alphabet.cc +++ b/lttoolbox/alphabet.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -153,26 +154,21 @@ Alphabet::read(FILE *input) a_new.spair.clear(); // Reading of taglist - int32_t tam = Compression::multibyte_read(input); map tmp; - while(tam > 0) - { - tam--; - UString mytag = "<"_u; - mytag += Compression::string_read(input); - mytag += ">"_u; + for (uint64_t tam = OldBinary::read_int(input, true); tam > 0; tam--) { + UString mytag; + mytag += '<'; + OldBinary::read_ustr(input, mytag, true); + mytag += '>'; a_new.slexicinv.push_back(mytag); a_new.slexic[mytag]= 
-a_new.slexicinv.size(); // ToDo: This does not turn the result negative due to unsigned semantics } // Reading of pairlist size_t bias = a_new.slexicinv.size(); - tam = Compression::multibyte_read(input); - while(tam > 0) - { - tam--; - int32_t first = Compression::multibyte_read(input); - int32_t second = Compression::multibyte_read(input); + for (uint64_t tam = OldBinary::read_int(input, true); tam > 0; tam--) { + int32_t first = OldBinary::read_int(input, true); + int32_t second = OldBinary::read_int(input, true); pair tmp(first - bias, second - bias); int32_t spair_size = a_new.spair.size(); a_new.spair[tmp] = spair_size; @@ -230,6 +226,30 @@ Alphabet::deserialise(std::istream &serialised) } } +void +Alphabet::read_serialised(FILE* in) +{ + slexicinv.clear(); + slexic.clear(); + spairinv.clear(); + spair.clear(); + uint64_t len = OldBinary::read_int(in, false); + for (uint64_t i = 0; i < len; i++) { + UString t; + OldBinary::read_ustr(in, t, false); + slexicinv.push_back(t); + slexic[t] = -(int)i - 1; + } + len = OldBinary::read_int(in, false); + for (uint64_t i = 0; i < len; i++) { + int32_t a = OldBinary::read_int(in, false); + int32_t b = OldBinary::read_int(in, false); + auto p = make_pair(a, b); + spairinv.push_back(p); + spair[p] = i; + } +} + void Alphabet::writeSymbol(int32_t const symbol, UFILE *output) const { diff --git a/lttoolbox/alphabet.h b/lttoolbox/alphabet.h index f3140750..f00271c6 100644 --- a/lttoolbox/alphabet.h +++ b/lttoolbox/alphabet.h @@ -142,6 +142,8 @@ class Alphabet void serialise(std::ostream &serialised) const; void deserialise(std::istream &serialised); + void read_serialised(FILE* in); + /** * Write a symbol enclosed by angle brackets in the output stream. * @param symbol symbol code. 
diff --git a/lttoolbox/alphabet_exe.cc b/lttoolbox/alphabet_exe.cc index 81d26441..0ed9e5c7 100644 --- a/lttoolbox/alphabet_exe.cc +++ b/lttoolbox/alphabet_exe.cc @@ -32,7 +32,7 @@ AlphabetExe::~AlphabetExe() } void -AlphabetExe::read(FILE* input, bool mmap) +AlphabetExe::read(FILE* input, bool mmap, bool compressed) { if (mmap) { tag_count = read_le_64(input); @@ -43,13 +43,17 @@ AlphabetExe::read(FILE* input, bool mmap) symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; } } else { - tag_count = OldBinary::read_int(input); + tag_count = OldBinary::read_int(input, compressed); tags = new StringRef[tag_count]; for (uint32_t i = 0; i < tag_count; i++) { UString tg; - tg += '<'; - OldBinary::read_ustr(input, tg); - tg += '>'; + if (compressed) { + tg += '<'; + OldBinary::read_ustr(input, tg, compressed); + tg += '>'; + } else { + OldBinary::read_ustr(input, tg, compressed); + } tags[i] = sw->add(tg); } // has to be a separate loop, otherwise the string_views get @@ -59,8 +63,8 @@ AlphabetExe::read(FILE* input, bool mmap) } int pairs = OldBinary::read_int(input); for (int i = 0; i < pairs; i++) { - OldBinary::read_int(input); - OldBinary::read_int(input); + OldBinary::read_int(input, compressed); + OldBinary::read_int(input, compressed); } } } diff --git a/lttoolbox/alphabet_exe.h b/lttoolbox/alphabet_exe.h index e57a1c81..aceff2d3 100644 --- a/lttoolbox/alphabet_exe.h +++ b/lttoolbox/alphabet_exe.h @@ -34,7 +34,7 @@ class AlphabetExe { public: AlphabetExe(StringWriter* sw_); ~AlphabetExe(); - void read(FILE* in, bool mmap); + void read(FILE* in, bool mmap, bool compressed=true); void* init(void* ptr); int32_t operator()(UString_view sv) const; void getSymbol(UString& result, int32_t symbol, bool uppercase = false) const; diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index 4b52ccfe..2c5e4faa 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -611,8 +612,8 @@ 
Transducer::read(FILE *input, int const decalage) } } - new_t.initial = Compression::multibyte_read(input); - int finals_size = Compression::multibyte_read(input); + new_t.initial = OldBinary::read_int(input, true); + int finals_size = OldBinary::read_int(input, true); int base = 0; double base_weight = default_weight; @@ -620,29 +621,29 @@ Transducer::read(FILE *input, int const decalage) { finals_size--; - base += Compression::multibyte_read(input); + base += OldBinary::read_int(input, true); if(read_weights) { - base_weight = Compression::long_multibyte_read(input); + base_weight = OldBinary::read_double(input, true); } new_t.finals.insert(make_pair(base, base_weight)); } - base = Compression::multibyte_read(input); + base = OldBinary::read_int(input, true); int number_of_states = base; int current_state = 0; while(number_of_states > 0) { - int number_of_local_transitions = Compression::multibyte_read(input); + int number_of_local_transitions = OldBinary::read_int(input, true); int tagbase = 0; while(number_of_local_transitions > 0) { number_of_local_transitions--; - tagbase += Compression::multibyte_read(input) - decalage; - int state = (current_state + Compression::multibyte_read(input)) % base; + tagbase += OldBinary::read_int(input, true) - decalage; + int state = (current_state + OldBinary::read_int(input, true)) % base; if(read_weights) { - base_weight = Compression::long_multibyte_read(input); + base_weight = OldBinary::read_double(input, true); } if(new_t.transitions.find(state) == new_t.transitions.end()) { @@ -790,6 +791,27 @@ Transducer::deserialise(std::istream &serialised) transitions = Deserialiser > > >::deserialise(serialised); } +void +Transducer::read_serialised(FILE* in) +{ + initial = OldBinary::read_int(in, false); + for (uint64_t i = OldBinary::read_int(in, false); i > 0; i--) { + int s = OldBinary::read_int(in, false); + finals.insert(make_pair(s, OldBinary::read_double(in, false))); + } + for (uint64_t i = OldBinary::read_int(in, false); 
i > 0; i--) { + int src = OldBinary::read_int(in, false); + multimap > st; + for (uint64_t j = OldBinary::read_int(in, false); j > 0; j--) { + int sym = OldBinary::read_int(in, false); + int dest = OldBinary::read_int(in, false); + double w = OldBinary::read_double(in, false); + st.insert(make_pair(sym, make_pair(dest, w))); + } + transitions.insert(make_pair(src, st)); + } +} + void Transducer::copy(Transducer const &t) { @@ -1421,15 +1443,16 @@ read_transducer_set(FILE* input, UString& letters, Alphabet& alpha, } } else { // letters - letters = Compression::string_read(input); + OldBinary::read_ustr(input, letters, true); // symbols alpha.read(input); - int len = Compression::multibyte_read(input); + int len = OldBinary::read_int(input, true); while(len > 0) { - UString name = Compression::string_read(input); + UString name; + OldBinary::read_ustr(input, name, true); trans[name].read(input); len--; diff --git a/lttoolbox/transducer.h b/lttoolbox/transducer.h index 394b6831..9fdb0536 100644 --- a/lttoolbox/transducer.h +++ b/lttoolbox/transducer.h @@ -346,6 +346,8 @@ class Transducer void serialise(std::ostream &serialised) const; void deserialise(std::istream &serialised); + void read_serialised(FILE* in); + /** * Insert another transducer into this, unifying source and targets. * Does not minimize. 
diff --git a/lttoolbox/transducer_exe.cc b/lttoolbox/transducer_exe.cc index 4c37706b..4f65f4be 100644 --- a/lttoolbox/transducer_exe.cc +++ b/lttoolbox/transducer_exe.cc @@ -18,10 +18,11 @@ #include #include +#include #include // includes needed for reading non-mmap files -#include +#include #include #include @@ -48,7 +49,7 @@ TransducerExe::read_compressed(FILE* input, Alphabet& alphabet, bool match) char header[4]{}; fread_unlocked(header, 1, 4, input); if (strncmp(header, HEADER_TRANSDUCER, 4) == 0) { - auto features = read_le(input); + auto features = read_le_64(input); if (features >= TDF_UNKNOWN) { throw std::runtime_error("Transducer has features that are unknown to this version of lttoolbox - upgrade!"); } @@ -58,22 +59,22 @@ TransducerExe::read_compressed(FILE* input, Alphabet& alphabet, bool match) fsetpos(input, &pos); } - initial = Compression::multibyte_read(input); - final_count = Compression::multibyte_read(input); + initial = OldBinary::read_int(input, true); + final_count = OldBinary::read_int(input, true); uint64_t base_state = 0; double base_weight = 0.0; finals = new Final[final_count]; for (uint64_t i = 0; i < final_count; i++) { - base_state += Compression::multibyte_read(input); + base_state += OldBinary::read_int(input, true); if (read_weights) { - base_weight += Compression::long_multibyte_read(input); + base_weight += OldBinary::read_double(input, true); } finals[i].state = base_state; finals[i].weight = base_weight; } - state_count = Compression::multibyte_read(input); + state_count = OldBinary::read_int(input, true); offsets = new uint64_t[state_count+1]; transition_count = 0; std::vector isyms, osyms; @@ -84,17 +85,17 @@ TransducerExe::read_compressed(FILE* input, Alphabet& alphabet, bool match) std::map>>> temp; - uint64_t count = Compression::multibyte_read(input); + uint64_t count = OldBinary::read_int(input, true); transition_count += count; int32_t tag_base = 0; for (uint64_t t = 0; t < count; t++) { - tag_base += 
Compression::multibyte_read(input); + tag_base += OldBinary::read_int(input, true); if (match) { tag_base -= alphabet.size(); } - uint64_t dest = (i + Compression::multibyte_read(input)) % state_count; + uint64_t dest = (i + OldBinary::read_int(input, true)) % state_count; if (read_weights) { - base_weight = Compression::long_multibyte_read(input); + base_weight = OldBinary::read_double(input, true); } if (match) { temp[tag_base].push_back(make_pair(tag_base, @@ -124,6 +125,66 @@ TransducerExe::read_compressed(FILE* input, Alphabet& alphabet, bool match) } } +void +TransducerExe::read_serialised(FILE* input, Alphabet& alphabet, bool match) +{ + initial = OldBinary::read_int(input, false); + final_count = OldBinary::read_int(input, false); + + finals = new Final[final_count]; + for (uint64_t i = 0; i < final_count; i++) { + finals[i].state = OldBinary::read_int(input, false); + finals[i].weight = OldBinary::read_double(input, false); + } + + state_count = OldBinary::read_int(input, false); + offsets = new uint64_t[state_count+1]; + transition_count = 0; + std::vector isyms, osyms; + std::vector dests; + std::vector weights; + for (uint64_t i = 0; i < state_count; i++) { + offsets[i] = transition_count; + std::map>>> temp; + OldBinary::read_int(input, false); // src state, should == i + uint64_t count = OldBinary::read_int(input, false); + transition_count += count; + for (uint64_t t = 0; t < count; t++) { + int32_t tag = OldBinary::read_int(input, false); + if (match) { + tag -= alphabet.size(); + } + uint64_t dest = OldBinary::read_int(input, false); + double weight = OldBinary::read_double(input, false); + if (match) { + temp[tag].push_back(make_pair(tag, make_pair(dest, weight))); + } else { + auto sym = alphabet.decode(tag); + temp[sym.first].push_back(make_pair(sym.second, + make_pair(dest, weight))); + } + } + for (auto& it : temp) { + for (auto& it2 : it.second) { + isyms.push_back(it.first); + osyms.push_back(it2.first); + dests.push_back(it2.second.first); 
+ weights.push_back(it2.second.second); + } + } + } + offsets[state_count] = transition_count; + transitions = new Transition[transition_count]; + for (uint64_t i = 0; i < transition_count; i++) { + transitions[i].isym = isyms[i]; + transitions[i].osym = osyms[i]; + transitions[i].dest = dests[i]; + transitions[i].weight = weights[i]; + } +} + void TransducerExe::read(FILE* input) { diff --git a/lttoolbox/transducer_exe.h b/lttoolbox/transducer_exe.h index 665e4296..4fa1d77a 100644 --- a/lttoolbox/transducer_exe.h +++ b/lttoolbox/transducer_exe.h @@ -60,6 +60,7 @@ class TransducerExe { TransducerExe(); ~TransducerExe(); void read_compressed(FILE* input, Alphabet& alphabet, bool match = false); + void read_serialised(FILE* input, Alphabet& alphabet, bool match = false); void read(FILE* input); void* init(void* ptr); }; From ed69eafed37f3abc4962fc928757f68c00468c31 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Sat, 4 Sep 2021 19:38:00 -0400 Subject: [PATCH 28/35] byteswaps in old headers and tags don't get adjusted by serialiser.h --- lttoolbox/transducer_exe.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lttoolbox/transducer_exe.cc b/lttoolbox/transducer_exe.cc index 4f65f4be..87a95578 100644 --- a/lttoolbox/transducer_exe.cc +++ b/lttoolbox/transducer_exe.cc @@ -49,7 +49,7 @@ TransducerExe::read_compressed(FILE* input, Alphabet& alphabet, bool match) char header[4]{}; fread_unlocked(header, 1, 4, input); if (strncmp(header, HEADER_TRANSDUCER, 4) == 0) { - auto features = read_le_64(input); + auto features = OldBinary::read_u64(input); if (features >= TDF_UNKNOWN) { throw std::runtime_error("Transducer has features that are unknown to this version of lttoolbox - upgrade!"); } @@ -153,9 +153,6 @@ TransducerExe::read_serialised(FILE* input, Alphabet& alphabet, bool match) transition_count += count; for (uint64_t t = 0; t < count; t++) { int32_t tag = OldBinary::read_int(input, false); - if (match) { - tag -= alphabet.size(); - } 
uint64_t dest = OldBinary::read_int(input, false); double weight = OldBinary::read_double(input, false); if (match) { From 3bd79b46a21cf1c630517d337f253677e7c50a6c Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Thu, 9 Sep 2021 12:00:23 -0400 Subject: [PATCH 29/35] we need a way to make AlphabetExe reindex if we read in more strings --- lttoolbox/alphabet_exe.cc | 26 +++++++++++++++----------- lttoolbox/alphabet_exe.h | 2 ++ lttoolbox/string_writer.cc | 10 ++++++++++ lttoolbox/string_writer.h | 3 +++ 4 files changed, 30 insertions(+), 11 deletions(-) diff --git a/lttoolbox/alphabet_exe.cc b/lttoolbox/alphabet_exe.cc index 0ed9e5c7..545683c9 100644 --- a/lttoolbox/alphabet_exe.cc +++ b/lttoolbox/alphabet_exe.cc @@ -58,9 +58,7 @@ AlphabetExe::read(FILE* input, bool mmap, bool compressed) } // has to be a separate loop, otherwise the string_views get // invalidated when the StringWriter buffer expands - for (uint32_t i = 0; i < tag_count; i++) { - symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; - } + reindex(); int pairs = OldBinary::read_int(input); for (int i = 0; i < pairs; i++) { OldBinary::read_int(input, compressed); @@ -126,6 +124,19 @@ AlphabetExe::clearSymbol(const int32_t symbol) } } +void +AlphabetExe::reindex() +{ + symbol_map.clear(); + for (uint64_t i = 0; i < tag_count; i++) { + symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; + } + int32_t n = -tag_count-1; + for (auto& ds : dynamic_symbols) { + symbol_map[ds] = n--; + } +} + int32_t AlphabetExe::lookupDynamic(const UString& symbol) { @@ -143,14 +154,7 @@ AlphabetExe::lookupDynamic(const UString& symbol) if (rebuild) { // moderately horrible, but that's what we get for invalidating // all the views when dynamic_symbols gets reallocated - symbol_map.clear(); - for (uint64_t i = 0; i < tag_count; i++) { - symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; - } - int32_t n = -tag_count-1; - for (auto& ds : dynamic_symbols) { - symbol_map[ds] = n--; - } + reindex(); } } else { ret = it->second; 
diff --git a/lttoolbox/alphabet_exe.h b/lttoolbox/alphabet_exe.h index aceff2d3..df9b464a 100644 --- a/lttoolbox/alphabet_exe.h +++ b/lttoolbox/alphabet_exe.h @@ -40,6 +40,8 @@ class AlphabetExe { void getSymbol(UString& result, int32_t symbol, bool uppercase = false) const; bool isTag(const int32_t symbol) const; void clearSymbol(const int32_t symbol); + // call this after StringWriter buffer gets updated + void reindex(); // like operator() but add symbol to dynamic_symbols if not found int32_t lookupDynamic(const UString& symbol); }; diff --git a/lttoolbox/string_writer.cc b/lttoolbox/string_writer.cc index 1d76bbb9..0f23be5d 100644 --- a/lttoolbox/string_writer.cc +++ b/lttoolbox/string_writer.cc @@ -34,6 +34,16 @@ StringWriter::add(UString_view s) return ret; } +StringRef +StringWriter::add_new(UString_view s) +{ + StringRef ret; + ret.start = edit_buffer.size(); + ret.count = s.size(); + edit_buffer += s; + return ret; +} + StringRef StringWriter::find(UString_view s) const { diff --git a/lttoolbox/string_writer.h b/lttoolbox/string_writer.h index 300c4fd6..6adcdd47 100644 --- a/lttoolbox/string_writer.h +++ b/lttoolbox/string_writer.h @@ -35,6 +35,9 @@ class StringWriter { UChar* mmap_buffer; public: StringRef add(UString_view s); + // don't check for duplicates + // faster if you're not going to compare the StringRefs + StringRef add_new(UString_view s); StringRef find(UString_view s) const; UString_view get(const uint32_t start, const uint32_t count); UString_view get(const StringRef& ref); From c2ec4c512bf85f24b496dc38c48450ca816be721 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 10 Sep 2021 12:17:49 -0400 Subject: [PATCH 30/35] =?UTF-8?q?add=20-H=20option=20to=20lt-comp=20so=20i?= =?UTF-8?q?t=20doesn't=20eat=20=CE=B5=20(#92)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lttoolbox/att_compiler.cc | 8 +++++++- lttoolbox/att_compiler.h | 4 ++++ lttoolbox/lt_comp.cc | 11 +++++++++-- 3 files 
changed, 20 insertions(+), 3 deletions(-) diff --git a/lttoolbox/att_compiler.cc b/lttoolbox/att_compiler.cc index 184f62cb..eaa9416d 100644 --- a/lttoolbox/att_compiler.cc +++ b/lttoolbox/att_compiler.cc @@ -59,7 +59,7 @@ AttCompiler::convert_hfst(UString& symbol) { if (symbol == Transducer::HFST_EPSILON_SYMBOL_SHORT || symbol == Transducer::HFST_EPSILON_SYMBOL_LONG || - symbol == Transducer::LTTB_EPSILON_SYMBOL) { + (!hfstSymbols && symbol == Transducer::LTTB_EPSILON_SYMBOL)) { symbol.clear(); } else if (symbol == Transducer::HFST_SPACE_SYMBOL) { symbol = " "_u; @@ -443,3 +443,9 @@ AttCompiler::write(FILE *output) write_transducer_set(output, letters, alphabet, trans, true); } + +void +AttCompiler::setHfstSymbols(bool b) +{ + hfstSymbols = b; +} diff --git a/lttoolbox/att_compiler.h b/lttoolbox/att_compiler.h index 557eb552..9da81174 100644 --- a/lttoolbox/att_compiler.h +++ b/lttoolbox/att_compiler.h @@ -84,8 +84,12 @@ class AttCompiler void write(FILE *fd) ; + void setHfstSymbols(bool b); + private: + bool hfstSymbols = false; + /** The final state(s). 
*/ map finals; /** diff --git a/lttoolbox/lt_comp.cc b/lttoolbox/lt_comp.cc index 37e64a39..aade06e7 100644 --- a/lttoolbox/lt_comp.cc +++ b/lttoolbox/lt_comp.cc @@ -48,12 +48,14 @@ void endProgram(char *name) cout << " -a, --alt: set alternative (monodix)" << endl; cout << " -l, --var-left: set left language variant (bidix)" << endl; cout << " -r, --var-right: set right language variant (bidix)" << endl; + cout << " -H, --hfst: expect HFST symbols" << endl; #else cout << " -m: keep morpheme boundaries" << endl; cout << " -v: set language variant" << endl; cout << " -a: set alternative (monodix)" << endl; cout << " -l: set left language variant (bidix)" << endl; cout << " -r: set right language variant (bidix)" << endl; + cout << " -H: expect HFST symbols" << endl; #endif cout << "Modes:" << endl; cout << " lr: left-to-right compilation" << endl; @@ -89,14 +91,15 @@ int main(int argc, char *argv[]) {"var-left", required_argument, 0, 'l'}, {"var-right", required_argument, 0, 'r'}, {"keep-boundaries", no_argument, 0, 'm'}, + {"hfst", no_argument, 0, 'H'}, {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'V'}, {0, 0, 0, 0} }; - int cnt=getopt_long(argc, argv, "a:v:l:r:mhV", long_options, &option_index); + int cnt=getopt_long(argc, argv, "a:v:l:r:mHhV", long_options, &option_index); #else - int cnt=getopt(argc, argv, "a:v:l:r:mhV"); + int cnt=getopt(argc, argv, "a:v:l:r:mHhV"); #endif if (cnt==-1) break; @@ -125,6 +128,10 @@ int main(int argc, char *argv[]) c.setKeepBoundaries(true); break; + case 'H': + a.setHfstSymbols(true); + break; + case 'V': c.setVerbose(true); break; From b80d92026f6660cb7105a1ff136c0a0003e88ef9 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Tue, 15 Mar 2022 17:27:23 -0400 Subject: [PATCH 31/35] helper functions are nice --- lttoolbox/Makefile.am | 4 +- lttoolbox/fst_processor.cc | 348 +++++++------------------------------ lttoolbox/fst_processor.h | 2 + lttoolbox/match_state2.cc | 31 ++-- lttoolbox/symbol_iter.cc | 102 
+++++++++++ lttoolbox/symbol_iter.h | 46 +++++ 6 files changed, 222 insertions(+), 311 deletions(-) create mode 100644 lttoolbox/symbol_iter.cc create mode 100644 lttoolbox/symbol_iter.h diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index 055c495b..159e648e 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -2,13 +2,13 @@ h_sources = alphabet.h alphabet_exe.h att_compiler.h binary_headers.h buffer.h compiler.h compression.h \ deserialiser.h endian_util.h entry_token.h expander.h file_utils.h fst_processor.h input_file.h lt_locale.h \ match_exe.h match_node.h match_state.h match_state2.h mmap.h my_stdio.h node.h old_binary.h \ - pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h \ + pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h symbol_iter.h \ transducer.h transducer_exe.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ ustring.h cc_sources = alphabet.cc alphabet_exe.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ expander.cc file_utils.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \ match_node.cc match_state.cc match_state2.cc node.cc old_binary.cc pattern_list.cc \ - regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc transducer.cc transducer_exe.cc \ + regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc symbol_iter.cc transducer.cc transducer_exe.cc \ trans_exe.cc xml_parse_util.cc xml_walk_util.cc tmx_compiler.cc ustring.cc library_includedir = $(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME) diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 187cddec..5ac60ff3 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -856,8 +857,6 @@ 
FSTProcessor::writeEscapedWithTags(UString const &str, UFILE *output) } } - - void FSTProcessor::printWord(UString const &sf, UString const &lf, UFILE *output) { @@ -924,6 +923,26 @@ FSTProcessor::printSpace(UChar32 const val, UFILE *output) } } +void +FSTProcessor::writeChar(const UChar32 val, UFILE* output, bool single_blank) +{ + if(u_isspace(val)) { + if (single_blank) { + write(blankqueue.front(), output); + blankqueue.pop(); + } else { + printSpace(val, output); + } + } else { + if(isEscaped(val)) { + u_fputc('\\', output); + } + if (val) { + u_fputc(val, output); + } + } +} + bool FSTProcessor::isEscaped(UChar32 const c) const { @@ -1305,29 +1324,7 @@ FSTProcessor::analysis(InputFile& input, UFILE *output) { if(!isAlphabetic(val) && sf.empty()) { - if(u_isspace(val)) - { - if (blankqueue.size() > 0) - { - write(blankqueue.front(), output); - blankqueue.pop(); - } - else - { - u_fputc(val, output); - } - } - else - { - if(isEscaped(val)) - { - u_fputc('\\', output); - } - if(val) - { - u_fputc(val, output); - } - } + writeChar(val, output, true); } else if(last_postblank) { @@ -1578,18 +1575,7 @@ FSTProcessor::tm_analysis(InputFile& input, UFILE *output) { if((u_isspace(val) || u_ispunct(val)) && sf.empty()) { - if(u_isspace(val)) - { - printSpace(val, output); - } - else - { - if(isEscaped(val)) - { - u_fputc('\\', output); - } - u_fputc(val, output); - } + writeChar(val, output, false); } else if(!u_isspace(val) && !u_ispunct(val) && ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) || @@ -2043,18 +2029,7 @@ FSTProcessor::postgeneration(InputFile& input, UFILE *output) input_buffer.setPos(last); input_buffer.back(2); val = lf[lf.size()-2]; - if(u_isspace(val)) - { - printSpace(val, output); - } - else - { - if(isEscaped(val)) - { - u_fputc('\\', output); - } - u_fputc(val, output); - } + writeChar(val, output, false); } current_state = initial_state; @@ -2096,21 +2071,7 @@ FSTProcessor::intergeneration(InputFile& input, UFILE *output) if 
(skip_mode) { - if (u_isspace(val)) - { - printSpace(val, output); - } - else - { - if(val != '\0') - { - if (isEscaped(val)) - { - u_fputc('\\', output); - } - u_fputc(val, output); - } - } + writeChar(val, output, false); } else { @@ -2172,20 +2133,7 @@ FSTProcessor::intergeneration(InputFile& input, UFILE *output) else { for(unsigned int i=1; i(input_word[i]); + for (auto it = symbol_iter(UString_view(input_word).substr(start, end-start+1), &alphabet); it != it.end(); ++it) { /* sub-view of input_word itself: UString::substr() would return a temporary that is destroyed before the loop body runs, leaving the iterator's view dangling */ + if (current_state.size() != 0) { + current_state.step_case(*it, caseSensitive); } - else if(input_word[i] == '<') - { - symbol = '<'; - for(unsigned int j = i + 1; j <= end_point; j++) - { - symbol += input_word[j]; - if(input_word[j] == '>') - { - i = j; - break; - } - } - val = alphabet(symbol); - } - else - { - val = static_cast(input_word[i]); - } - if(current_state.size() != 0) - { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) - { - current_state.step(val, u_tolower(val)); - } - else - { - current_state.step(val); - } - } - if(current_state.isFinal(all_finals)) - { + if (current_state.isFinal(all_finals)) { result.clear(); - if(with_delim) { + if (with_delim) { result += '^'; } - if(mark) { + if (mark) { result += '='; } result += current_state.filterFinals(all_finals, alphabet, @@ -2391,9 +2281,9 @@ FSTProcessor::biltransfull(UString const &input_word, bool with_delim) if(current_state.size() == 0) { - if(!symbol.empty() && !result.empty()) + if(alphabet.isTag(*it) && !result.empty()) { - queue.append(symbol); + queue.append(it.string()); } else { @@ -2411,7 +2301,7 @@ FSTProcessor::biltransfull(UString const &input_word, bool with_delim) } } - if(start < (end - 3)) + if(start < (end - 3)) { return "^$"_u; } @@ -2488,44 +2378,10 @@ FSTProcessor::biltrans(UString const &input_word, bool with_delim) bool firstupper = u_isupper(input_word[start_point]); bool uppercase = firstupper && u_isupper(input_word[start_point+1]); - for(unsigned int i = start_point; i <= 
end_point; i++) - { - int val; - UString symbol; - - if(input_word[i] == '\\') - { - i++; - val = static_cast(input_word[i]); - } - else if(input_word[i] == '<') - { - symbol = '<'; - for(unsigned int j = i + 1; j <= end_point; j++) - { - symbol += input_word[j]; - if(input_word[j] == '>') - { - i = j; - break; - } - } - val = alphabet(symbol); - } - else - { - val = static_cast(input_word[i]); - } + for (auto it = symbol_iter(UString_view(input_word).substr(start_point, end_point-start_point+1), &alphabet); it != it.end(); it++) { /* sub-view of input_word itself: UString::substr() would return a temporary that is destroyed before the loop body runs, leaving the iterator's view dangling */ if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) - { - current_state.step(val, u_tolower(val)); - } - else - { - current_state.step(val); - } + current_state.step_case(*it, caseSensitive); } if(current_state.isFinal(all_finals)) { @@ -2544,9 +2400,9 @@ FSTProcessor::biltrans(UString const &input_word, bool with_delim) if(current_state.size() == 0) { - if(!symbol.empty() && !result.empty()) + if(alphabet.isTag(*it) && !result.empty()) { - queue.append(symbol); + queue.append(it.string()); } else { @@ -2775,14 +2631,7 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode) } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) - { - current_state.step(val, u_tolower(val)); - } - else - { - current_state.step(val); - } + current_state.step_case(val, caseSensitive); } if(current_state.isFinal(all_finals)) { @@ -2850,45 +2699,13 @@ FSTProcessor::biltransWithQueue(UString const &input_word, bool with_delim) bool firstupper = u_isupper(input_word[start_point]); bool uppercase = firstupper && u_isupper(input_word[start_point+1]); - for(unsigned int i = start_point; i <= end_point; i++) - { - int val = 0; - UString symbol; - - if(input_word[i] == '\\') - { - i++; - val = input_word[i]; - } - else if(input_word[i] == '<') - { + for (auto it = symbol_iter(UString_view(input_word).substr(start_point, end_point-start_point+1), &alphabet); it != it.end(); it++) { /* view input_word directly; a substr() temporary would not outlive this statement */ + if 
(alphabet.isTag(*it)) { seentags = true; - symbol = '<'; - for(unsigned int j = i + 1; j <= end_point; j++) - { - symbol += input_word[j]; - if(input_word[j] == '>') - { - i = j; - break; - } - } - val = alphabet(symbol); - } - else - { - val = input_word[i]; } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) - { - current_state.step(val, u_tolower(val)); - } - else - { - current_state.step(val); - } + current_state.step_case(*it, caseSensitive); } if(current_state.isFinal(all_finals)) { @@ -2907,9 +2724,9 @@ FSTProcessor::biltransWithQueue(UString const &input_word, bool with_delim) if(current_state.size() == 0) { - if(!symbol.empty() && !result.empty()) + if(alphabet.isTag(*it) && !result.empty()) { - queue.append(symbol); + queue.append(it.string()); } else { @@ -3016,44 +2833,10 @@ FSTProcessor::biltransWithoutQueue(UString const &input_word, bool with_delim) bool firstupper = u_isupper(input_word[start_point]); bool uppercase = firstupper && u_isupper(input_word[start_point+1]); - for(unsigned int i = start_point; i <= end_point; i++) - { - int val; - UString symbol; - - if(input_word[i] == '\\') - { - i++; - val = static_cast(input_word[i]); - } - else if(input_word[i] == '<') - { - symbol = '<'; - for(unsigned int j = i + 1; j <= end_point; j++) - { - symbol += input_word[j]; - if(input_word[j] == '>') - { - i = j; - break; - } - } - val = alphabet(symbol); - } - else - { - val = static_cast(input_word[i]); - } + for (auto it = symbol_iter(UString_view(input_word).substr(start_point, end_point-start_point+1), &alphabet); it != it.end(); it++) { /* sub-view of input_word itself: UString::substr() would return a temporary that is destroyed before the loop body runs, leaving the iterator's view dangling */ if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) - { - current_state.step(val, u_tolower(val)); - } - else - { - current_state.step(val); - } + current_state.step_case(*it, caseSensitive); } if(current_state.isFinal(all_finals)) { @@ -3072,7 +2855,7 @@ FSTProcessor::biltransWithoutQueue(UString const &input_word, bool with_delim) 
if(current_state.size() == 0) { - if(symbol.empty()) + if(!alphabet.isTag(*it)) { // word is not present if(with_delim) @@ -3259,18 +3042,7 @@ FSTProcessor::SAO(InputFile& input, UFILE *output) { if(!isAlphabetic(val) && sf.empty()) { - if(u_isspace(val)) - { - printSpace(val, output); - } - else - { - if(isEscaped(val)) - { - u_fputc('\\', output); - } - u_fputc(val, output); - } + writeChar(val, output, false); } else if(last_incond) { diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index f6778409..56159372 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -464,6 +464,8 @@ class FSTProcessor */ void printSpace(UChar32 const val, UFILE *output); + void writeChar(const UChar32 val, UFILE* output, bool single_blank); + void skipUntil(InputFile& input, UFILE *output, UChar32 const character); static UString removeTags(UString const &str); UString compoundAnalysis(UString str, bool uppercase, bool firstupper); diff --git a/lttoolbox/match_state2.cc b/lttoolbox/match_state2.cc index ddf25503..5a4dbc5d 100644 --- a/lttoolbox/match_state2.cc +++ b/lttoolbox/match_state2.cc @@ -16,6 +16,7 @@ */ #include +#include #include @@ -116,30 +117,18 @@ MatchState2::step(UString_view input, const AlphabetExe& alpha, bool foldcase) { int32_t any_char = alpha(""_u); int32_t any_tag = alpha(""_u); - for (uint64_t i = 0; i < input.size(); i++) { - if (input[i] == '<') { - for (uint64_t j = i+1; j < input.size(); j++) { - if (input[j] == '\\') { - j++; - } else if (input[j] == '>') { - int32_t sym = alpha(input.substr(i, j-i+1)); - if (sym) { - step(sym, any_tag); - } else { - step(any_tag); - } - i = j; - break; - } + for (auto it = symbol_iter(input, &alpha); it != it.end(); it++) { + if (it.string()[0] == '<') { + if (*it) { + step(*it, any_tag); + } else { + step(any_tag); } } else { - if (input[i] == '\\') { - i++; - } - if (foldcase && u_isupper(input[i])) { - step(input[i], u_tolower(input[i]), any_char); + if (foldcase && 
u_isupper(*it)) { + step(*it, u_tolower(*it), any_char); } else { - step(input[i], any_char); + step(*it, any_char); } } } diff --git a/lttoolbox/symbol_iter.cc b/lttoolbox/symbol_iter.cc new file mode 100644 index 00000000..c9e9048b --- /dev/null +++ b/lttoolbox/symbol_iter.cc @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include + +symbol_iter::symbol_iter(UString_view s_, const AlphabetExe* a_) : s(s_), a(a_) +{ + ++(*this); +} + +symbol_iter::symbol_iter(const symbol_iter& other) +{ + i = other.i; + j = other.j; + s = other.s; + a = other.a; + sym = other.sym; +} + +symbol_iter::~symbol_iter() {} + +int32_t +symbol_iter::operator*() const +{ + return sym; +} + +symbol_iter +symbol_iter::operator++(int) +{ + auto temp = *this; + ++(*this); + return temp; +} + +symbol_iter& +symbol_iter::operator++() +{ + if (j >= s.size()) { /* the current symbol was the last one (or s is empty): park in a single well-defined end state instead of indexing past the view */ + i = j = s.size(); sym = 0; return *this; + } + i = j; + j++; + if (s[i] == '\\') { + j++; + sym = (i + 1 < s.size()) ? s[i+1] : 0; /* a trailing lone backslash has nothing to escape */ + } else if (s[i] == '<') { + while (j < s.size() && s[j] != '>') j++; + j++; + sym = (*a)(s.substr(i, j - i)); /* substr takes (pos, count), not (begin, end) */ + } else { + sym = s[i]; + } + return *this; +} + +bool +symbol_iter::operator!=(const symbol_iter& o) const +{ + return (i != o.i) || (j != o.j) || (s != o.s) || (a != o.a) || (sym != o.sym); +} + +bool +symbol_iter::operator==(const symbol_iter& o) const +{ + return (i == o.i) && (j == o.j) && (s == o.s) && (a == o.a) 
&& (sym == o.sym); +} + +symbol_iter +symbol_iter::begin() +{ + return symbol_iter(s, a); +} + +symbol_iter +symbol_iter::end() +{ + symbol_iter ret(s, a); + ret.j = s.size(); + ++ret; + return ret; +} + +UString_view +symbol_iter::string() +{ + return s.substr(i, j - i); /* substr takes (pos, count); the current symbol occupies [i, j) */ +} diff --git a/lttoolbox/symbol_iter.h b/lttoolbox/symbol_iter.h new file mode 100644 index 00000000..eebe6b3b --- /dev/null +++ b/lttoolbox/symbol_iter.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#ifndef __SYMBOL_ITER_H__ +#define __SYMBOL_ITER_H__ + +#include +#include + +class symbol_iter +{ +private: + size_t i = 0; + size_t j = 0; + UString_view s; + const AlphabetExe* a; + int32_t sym = 0; +public: + symbol_iter(UString_view s_, const AlphabetExe* a_); + symbol_iter(const symbol_iter& other); + ~symbol_iter(); + int32_t operator*() const; + symbol_iter operator++(int); + symbol_iter &operator++(); + bool operator!=(const symbol_iter& other) const; + bool operator==(const symbol_iter& other) const; + symbol_iter begin(); + symbol_iter end(); + UString_view string(); +}; + +#endif // __SYMBOL_ITER_H__ From ac01b01000dea7697d3f27648f2a44e075d3412c Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 1 Jul 2022 10:37:16 -0500 Subject: [PATCH 32/35] get tests passing --- lttoolbox/alphabet_exe.cc | 15 ++++++++++++--- lttoolbox/alphabet_exe.h | 4 ++++ lttoolbox/fst_processor.cc | 4 ++-- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/lttoolbox/alphabet_exe.cc b/lttoolbox/alphabet_exe.cc index 30a55258..7e575a0f 100644 --- a/lttoolbox/alphabet_exe.cc +++ b/lttoolbox/alphabet_exe.cc @@ -73,7 +73,9 @@ void* AlphabetExe::init(void* ptr) { mmapping = true; - tag_count = from_le_64(reinterpret_cast(ptr)[0]); + // TODO: why is from_le_64 segfaulting here? 
+ //tag_count = from_le_64(reinterpret_cast(ptr)[0]); + tag_count = reinterpret_cast(ptr)[0]; tags = reinterpret_cast(ptr + sizeof(uint64_t)); for (uint64_t i = 0; i < tag_count; i++) { symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; @@ -98,6 +100,9 @@ AlphabetExe::getSymbol(UString& result, int32_t symbol, bool uppercase) const if (symbol == 0) { return; } else if (symbol < 0) { + if (clearedSymbols.find(symbol) != clearedSymbols.end()) { + return; + } int idx = -symbol-1; if (idx < tag_count) { result.append(sw->get(tags[idx])); @@ -121,8 +126,12 @@ void AlphabetExe::clearSymbol(const int32_t symbol) { if (symbol < 0) { - tags[-symbol-1].start = 0; - tags[-symbol-1].count = 0; + if (mmapping) { + clearedSymbols.insert(symbol); + } else { + tags[-symbol-1].start = 0; + tags[-symbol-1].count = 0; + } } } diff --git a/lttoolbox/alphabet_exe.h b/lttoolbox/alphabet_exe.h index 8282a0ed..8a693b93 100644 --- a/lttoolbox/alphabet_exe.h +++ b/lttoolbox/alphabet_exe.h @@ -20,6 +20,7 @@ #include #include +#include #include class AlphabetExe { @@ -31,6 +32,9 @@ class AlphabetExe { bool mmapping = false; // tags added at runtime - used by apertium-separable std::vector dynamic_symbols; + // tags that should not be printed, such as + // used by clearSymbol() if we're mmapping since we can't edit the data + std::set clearedSymbols; public: AlphabetExe(StringWriter* sw_); ~AlphabetExe(); diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index cd8d5a56..e059d0b1 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -729,7 +729,7 @@ FSTProcessor::writeEscapedPopBlanks(UString const &str, UFILE *output) u_fputc('\\', output); } u_fputc(str[i], output); - if (str[i] == ' ') { + if (str[i] == ' ' && !blankqueue.empty()) { if (blankqueue.front() == " "_u) { blankqueue.pop(); } else { @@ -829,7 +829,7 @@ void FSTProcessor::writeChar(const UChar32 val, UFILE* output, bool single_blank) { if(u_isspace(val)) { - if (single_blank) { + if 
(single_blank && !blankqueue.empty()) { write(blankqueue.front(), output); blankqueue.pop(); } else { From d7199778068f95999d3799d91875f639da309f07 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 1 Jul 2022 10:48:47 -0500 Subject: [PATCH 33/35] python bindings need to know about string_view --- python/setup.py.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py.in b/python/setup.py.in index 3d3bdad2..c0180b18 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -6,7 +6,7 @@ Setup for SWIG Python bindings for lttoolbox from distutils.core import Extension, setup from sys import platform -compile_args = '@CXXFLAGS@'.split() + '@ICU_CFLAGS@'.split() +compile_args = '@CXXFLAGS@'.split() + '@ICU_CFLAGS@'.split() + ['@have_sv@'] link_args = [] if platform == 'darwin': compile_args += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] From 620f51226b82aead7806f35472f9c7f1f8d3f6fd Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 1 Jul 2022 11:30:03 -0500 Subject: [PATCH 34/35] move reading back to file_utils --- lttoolbox/file_utils.cc | 104 +++++++++++++++++++++++++++++++++++++ lttoolbox/file_utils.h | 10 ++++ lttoolbox/fst_processor.cc | 90 +------------------------------- 3 files changed, 116 insertions(+), 88 deletions(-) diff --git a/lttoolbox/file_utils.cc b/lttoolbox/file_utils.cc index 5c92dcef..cc5ff5c5 100644 --- a/lttoolbox/file_utils.cc +++ b/lttoolbox/file_utils.cc @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -174,3 +175,106 @@ readTransducerSet(FILE* input, std::set& letters, } letters = std::set(letters_str.begin(), letters_str.end()); } + +void readTransducerSet(FILE* input, + bool& mmapping, void* mmap_ptr, int& mmap_len, + StringWriter& str_write, + std::set* letters, AlphabetExe& alpha, + std::map& trans) +{ + bool mmap = false; + fpos_t pos; + if (fgetpos(input, &pos) == 0) { + char header[4]{}; + fread_unlocked(header, 1, 4, input); + if (strncmp(header, HEADER_LTTOOLBOX, 4) == 
0) { + auto features = read_le_64(input); + if (features >= LTF_UNKNOWN) { + throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); + } + mmap = features & LTF_MMAP; + } + else { + // Old binary format + fsetpos(input, &pos); + } + } + + if (mmap) { + fgetpos(input, &pos); + rewind(input); + mmapping = mmap_file(input, mmap_ptr, mmap_len); + if (mmapping) { + void* ptr = mmap_ptr + 12; + ptr = str_write.init(ptr); + + if (letters != nullptr) { + StringRef let_loc = reinterpret_cast(ptr)[0]; + std::vector vec; + ustring_to_vec32(str_write.get(let_loc), vec); + letters->insert(vec.begin(), vec.end()); + ptr += sizeof(StringRef); + } + + ptr = alpha.init(ptr); + + uint64_t tr_count = reinterpret_cast(ptr)[0]; + ptr += sizeof(uint64_t); + for (uint64_t i = 0; i < tr_count; i++) { + StringRef tn = reinterpret_cast(ptr)[0]; + ptr += sizeof(StringRef); + UString name = UString{str_write.get(tn)}; + ptr = trans[name].init(ptr); + } + } else { + fsetpos(input, &pos); + + str_write.read(input); + + if (letters != nullptr) { + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + std::vector vec; + ustring_to_vec32(str_write.get(s, c), vec); + letters->insert(vec.begin(), vec.end()); + } + + alpha.read(input, true); + + uint64_t tr_count = read_le_64(input); + for (uint64_t i = 0; i < tr_count; i++) { + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + UString name = UString{str_write.get(s, c)}; + trans[name].read(input); + } + } + } else { + uint64_t len; + + if (letters != nullptr) { + // letters + len = OldBinary::read_int(input); + while(len > 0) { + letters->insert(static_cast(OldBinary::read_int(input))); + len--; + } + } + + // symbols + fgetpos(input, &pos); + alpha.read(input, false); + fsetpos(input, &pos); + Alphabet temp; + temp.read(input); + + len = OldBinary::read_int(input); + + while(len > 0) { + UString name; + OldBinary::read_ustr(input, name); + 
trans[name].read_compressed(input, temp); + len--; + } + } +} diff --git a/lttoolbox/file_utils.h b/lttoolbox/file_utils.h index a2c81285..fae96b24 100644 --- a/lttoolbox/file_utils.h +++ b/lttoolbox/file_utils.h @@ -18,8 +18,11 @@ #ifndef __FILE_UTILS_H__ #include +#include +#include #include #include +#include #include @@ -37,4 +40,11 @@ void readTransducerSet(FILE* input, std::set& letters, Alphabet& alpha, std::map& trans); +// if letters == nullptr, then skip it (e.g. in lrx) +void readTransducerSet(FILE* input, + bool& mmapping, void* mmap_ptr, int& mmap_len, + StringWriter& str_write, + std::set* letters, AlphabetExe& alpha, + std::map& trans); + #endif // __FILE_UTILS_H__ diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index e059d0b1..2baa79f5 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -860,94 +860,8 @@ FSTProcessor::isAlphabetic(UChar32 const c) const void FSTProcessor::load(FILE *input) { - bool mmap = false; - fpos_t pos; - if (fgetpos(input, &pos) == 0) { - char header[4]{}; - fread_unlocked(header, 1, 4, input); - if (strncmp(header, HEADER_LTTOOLBOX, 4) == 0) { - auto features = read_le_64(input); - if (features >= LTF_UNKNOWN) { - throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); - } - mmap = features & LTF_MMAP; - } - else { - // Old binary format - fsetpos(input, &pos); - } - } - - if (mmap) { - fgetpos(input, &pos); - rewind(input); - mmapping = mmap_file(input, mmap_pointer, mmap_len); - if (mmapping) { - void* ptr = mmap_pointer + 12; - ptr = str_write.init(ptr); - - StringRef let_loc = reinterpret_cast(ptr)[0]; - std::vector vec; - ustring_to_vec32(str_write.get(let_loc), vec); - alphabetic_chars.insert(vec.begin(), vec.end()); - ptr += sizeof(StringRef); - - ptr = alphabet.init(ptr); - - uint64_t tr_count = reinterpret_cast(ptr)[0]; - ptr += sizeof(uint64_t); - for (uint64_t i = 0; i < tr_count; i++) { - StringRef tn = 
reinterpret_cast(ptr)[0]; - ptr += sizeof(StringRef); - UString name = UString{str_write.get(tn)}; - ptr = transducers[name].init(ptr); - } - } else { - fsetpos(input, &pos); - - str_write.read(input); - - uint32_t s = read_le_32(input); - uint32_t c = read_le_32(input); - std::vector vec; - ustring_to_vec32(str_write.get(s, c), vec); - alphabetic_chars.insert(vec.begin(), vec.end()); - - alphabet.read(input, true); - - uint64_t tr_count = read_le_64(input); - for (uint64_t i = 0; i < tr_count; i++) { - uint32_t s = read_le_32(input); - uint32_t c = read_le_32(input); - UString name = UString{str_write.get(s, c)}; - transducers[name].read(input); - } - } - } else { - - // letters - uint64_t len = OldBinary::read_int(input); - while(len > 0) { - alphabetic_chars.insert(static_cast(OldBinary::read_int(input))); - len--; - } - - // symbols - fgetpos(input, &pos); - alphabet.read(input, false); - fsetpos(input, &pos); - Alphabet temp; - temp.read(input); - - len = OldBinary::read_int(input); - - while(len > 0) { - UString name; - OldBinary::read_ustr(input, name); - transducers[name].read_compressed(input, temp); - len--; - } - } + readTransducerSet(input, mmapping, mmap_pointer, mmap_len, + str_write, &alphabetic_chars, alphabet, transducers); } void From 01fd739ab7d4fc02caab211acd8ed663203f6f00 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 1 Jul 2022 12:24:04 -0500 Subject: [PATCH 35/35] header-reading util --- lttoolbox/Makefile.am | 2 +- lttoolbox/binary_headers.cc | 22 +++++++++++++++ lttoolbox/binary_headers.h | 5 +++- lttoolbox/file_utils.cc | 53 +++++++++++++------------------------ lttoolbox/transducer.cc | 28 ++++++-------------- 5 files changed, 54 insertions(+), 56 deletions(-) create mode 100644 lttoolbox/binary_headers.cc diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index 067cec6e..d30ac14b 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -5,7 +5,7 @@ h_sources = alphabet.h alphabet_exe.h att_compiler.h 
binary_headers.h buffer.h c pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h symbol_iter.h \ transducer.h transducer_exe.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ ustring.h -cc_sources = alphabet.cc alphabet_exe.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ +cc_sources = alphabet.cc alphabet_exe.cc att_compiler.cc compiler.cc binary_headers.cc compression.cc entry_token.cc \ expander.cc file_utils.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \ match_node.cc match_state.cc match_state2.cc node.cc old_binary.cc pattern_list.cc \ regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc symbol_iter.cc transducer.cc transducer_exe.cc \ diff --git a/lttoolbox/binary_headers.cc b/lttoolbox/binary_headers.cc new file mode 100644 index 00000000..2eab19af --- /dev/null +++ b/lttoolbox/binary_headers.cc @@ -0,0 +1,22 @@ +#include + +#include +#include + +bool +readHeader(FILE* input, const char* expect_header, uint64_t& feats) +{ + feats = 0; + fpos_t pos; + if (fgetpos(input, &pos) == 0) { + char header[4]{}; + auto r = fread_unlocked(header, 1, 4, input); + if (r == 4 && strncmp(header, expect_header, 4) == 0) { + feats = read_le_64(input); + return true; + } else { + fsetpos(input, &pos); + } + } + return false; +} diff --git a/lttoolbox/binary_headers.h b/lttoolbox/binary_headers.h index 878c7bce..da48a6d3 100644 --- a/lttoolbox/binary_headers.h +++ b/lttoolbox/binary_headers.h @@ -2,6 +2,7 @@ #define _LT_BINARY_HEADERS_ #include +#include // Global lttoolbox features constexpr char HEADER_LTTOOLBOX[4]{'L', 'T', 'T', 'B'}; @@ -20,4 +21,6 @@ enum TD_FEATURES : uint64_t { TDF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits }; -#endif \ No newline at end of file +bool readHeader(FILE* input, const char* expect_header, uint64_t& feats); + +#endif diff 
--git a/lttoolbox/file_utils.cc b/lttoolbox/file_utils.cc index cc5ff5c5..37bdc067 100644 --- a/lttoolbox/file_utils.cc +++ b/lttoolbox/file_utils.cc @@ -115,22 +115,13 @@ readTransducerSet(FILE* input, std::set& letters, Alphabet& alpha, std::map& trans) { - fpos_t pos; + uint64_t features; bool mmap = false; - if (fgetpos(input, &pos) == 0) { - char header[4]{}; - auto r = fread_unlocked(header, 1, 4, input); - if (r == 4 && strncmp(header, HEADER_LTTOOLBOX, 4) == 0) { - auto features = read_le_64(input); - if (features >= LTF_UNKNOWN) { - throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); - } - mmap = features & LTF_MMAP; - } - else { - // Old binary format - fsetpos(input, &pos); + if (readHeader(input, HEADER_LTTOOLBOX, features)) { + if (features >= LTF_UNKNOWN) { + throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); } + mmap = features & LTF_MMAP; } UString letters_str; @@ -176,31 +167,24 @@ readTransducerSet(FILE* input, std::set& letters, letters = std::set(letters_str.begin(), letters_str.end()); } -void readTransducerSet(FILE* input, - bool& mmapping, void* mmap_ptr, int& mmap_len, - StringWriter& str_write, - std::set* letters, AlphabetExe& alpha, - std::map& trans) +void +readTransducerSet(FILE* input, + bool& mmapping, void* mmap_ptr, int& mmap_len, + StringWriter& str_write, + std::set* letters, AlphabetExe& alpha, + std::map& trans) { + uint64_t features; bool mmap = false; - fpos_t pos; - if (fgetpos(input, &pos) == 0) { - char header[4]{}; - fread_unlocked(header, 1, 4, input); - if (strncmp(header, HEADER_LTTOOLBOX, 4) == 0) { - auto features = read_le_64(input); - if (features >= LTF_UNKNOWN) { - throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); - } - mmap = features & LTF_MMAP; - } - else { - // Old binary format - fsetpos(input, &pos); - } + if (readHeader(input, HEADER_LTTOOLBOX, 
features)) { + if (features >= LTF_UNKNOWN) { + throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); + } + mmap = features & LTF_MMAP; } if (mmap) { + fpos_t pos; fgetpos(input, &pos); rewind(input); mmapping = mmap_file(input, mmap_ptr, mmap_len); @@ -262,6 +246,7 @@ void readTransducerSet(FILE* input, } // symbols + fpos_t pos; fgetpos(input, &pos); alpha.read(input, false); fsetpos(input, &pos); diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index 68c2ce59..4289f9f0 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -594,22 +594,12 @@ Transducer::read(FILE *input, int const decalage) Transducer new_t; bool read_weights = false; - - fpos_t pos; - if (fgetpos(input, &pos) == 0) { - char header[4]{}; - fread_unlocked(header, 1, 4, input); - if (strncmp(header, HEADER_TRANSDUCER, 4) == 0) { - auto features = read_le(input); - if (features >= TDF_UNKNOWN) { - throw std::runtime_error("Transducer has features that are unknown to this version of lttoolbox - upgrade!"); - } - read_weights = (features & TDF_WEIGHTS); - } - else { - // Old binary format - fsetpos(input, &pos); - } + uint64_t features; + if (readHeader(input, HEADER_TRANSDUCER, features)) { + if (features >= TDF_UNKNOWN) { + throw std::runtime_error("Transducer has features that are unknown to this version of lttoolbox - upgrade!"); + } + read_weights = (features & TDF_WEIGHTS); } new_t.initial = OldBinary::read_int(input, true); @@ -661,10 +651,8 @@ Transducer::read(FILE *input, int const decalage) void Transducer::read_mmap(FILE* in, Alphabet& alpha) { - char header[4]{}; - auto r = fread_unlocked(header, 1, 4, in); - if (r == 4 && strncmp(header, HEADER_TRANSDUCER, 4) == 0) { - auto features = read_le_64(in); + uint64_t features; + if (readHeader(in, HEADER_TRANSDUCER, features)) { if (features >= TDF_UNKNOWN) { throw std::runtime_error("Transducer has features that are unknown to this version of lttoolbox - upgrade!"); 
}