Skip to content

Commit

Permalink
Replace some std::set<int> with a sorted std::vector
Browse files Browse the repository at this point in the history
`Transducer::determinize()` doesn't do very much inserting into sets,
but it does iterate over them and use them as map keys, and both of these
operations are substantially sped up by ensuring that the elements of the
set are contiguous. Thus this commit adds `sorted_vector`, which is a wrapper
around `std::vector` that ensures that inserted elements are sorted
and unique. The results are significant improvements in both runtime
and memory usage for transducer minimization.

Effect on `lt-comp`:

|          | eng           | oci           |
|----------|---------------|---------------|
| orig     | 136 MB 23.9 s |  980 MB 219 s |
| pre-comp | 160 MB 16.8 s | 1120 MB 163 s |
| VecSet   |  99 MB 12.7 s |  800 MB 116 s |
| net diff | -27%   -47%   | -18%    -47%  |
  • Loading branch information
mr-martian committed Jul 14, 2022
1 parent 9111665 commit 3edf7c0
Show file tree
Hide file tree
Showing 3 changed files with 286 additions and 9 deletions.
2 changes: 1 addition & 1 deletion lttoolbox/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \
match_exe.h match_node.h match_state.h my_stdio.h node.h \
pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h \
transducer.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \
ustring.h
ustring.h sorted_vector.hpp
cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \
expander.cc file_utils.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \
match_node.cc match_state.cc node.cc pattern_list.cc \
Expand Down
277 changes: 277 additions & 0 deletions lttoolbox/sorted_vector.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
/*
* Copyright (C) 2022 Apertium
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see <https://www.gnu.org/licenses/>.
*/

#pragma once
#ifndef c6d28b7452ec699b_SORTED_VECTOR_HPP
#define c6d28b7452ec699b_SORTED_VECTOR_HPP
#include <algorithm>
#include <cstddef>
#include <functional>
#include <iterator>
#include <set>
#include <utility>
#include <vector>

namespace detail {
/**
 * Report whether the range [first, last) is in non-descending order
 * with respect to `comp`.
 *
 * An empty range and a single-element range are both considered sorted.
 * The range is unsorted as soon as some element compares (via `comp`)
 * strictly before the element immediately preceding it.
 *
 * @param first  iterator to the first element of the range
 * @param last   iterator one past the final element of the range
 * @param comp   ordering predicate; comp(a, b) is true when a goes before b
 * @return true when no adjacent pair is out of order, false otherwise
 */
template<typename ForwardIt, typename Comp>
bool is_sorted(ForwardIt first, ForwardIt last, Comp comp) {
  if (first == last) {
    return true;
  }
  ForwardIt previous = first;
  for (++first; first != last; ++first) {
    // An element that must precede its predecessor breaks the ordering.
    if (comp(*first, *previous)) {
      return false;
    }
    previous = first;
  }
  return true;
}
}

/**
 * A set-like container stored in a contiguous std::vector.
 *
 * The insert() and erase() members keep the elements ordered by `Comp`
 * and free of duplicates, where two values are duplicates when neither
 * orders before the other under `Comp`. Members that hand out mutable
 * access (non-const begin()/end(), get(), pop_back()) do not re-check
 * that invariant; callers using them are responsible for preserving it.
 *
 * @tparam T     element type
 * @tparam Comp  strict weak ordering used for sorting and equivalence
 */
template<typename T, typename Comp = std::less<T>>
class sorted_vector {
public:
  typedef typename std::vector<T> container;
  typedef typename container::iterator iterator;
  typedef typename container::const_iterator const_iterator;
  typedef typename container::const_reverse_iterator const_reverse_iterator;
  typedef typename container::size_type size_type;
  typedef T value_type;
  typedef T key_type;

  /// Create an empty container.
  sorted_vector() {}

  /// Build from the contents of a std::set. Left implicit (non-explicit)
  /// so that a std::set<T> converts where a sorted_vector is expected.
  sorted_vector(const std::set<T>& o) {
    insert(o.begin(), o.end());
  }

  /**
   * Insert a single value unless an equivalent one is already stored.
   *
   * @return the position of the (new or pre-existing) element and a flag
   *         that is true only when an insertion actually happened
   */
  std::pair<iterator, bool> insert(T t) {
    iterator it = std::lower_bound(elements.begin(), elements.end(), t, comp);
    // lower_bound guarantees !comp(*it, t); the value is absent when we
    // ran off the end or when t orders strictly before *it.
    if (it == elements.end() || comp(t, *it)) {
      it = elements.insert(it, t);
      return std::make_pair(it, true);
    }
    return std::make_pair(it, false);
  }

  /**
   * Insert every value of [b, e), dropping values equivalent to one
   * already present (or to an earlier value of the same range).
   *
   * The range must not point into this container. It is traversed more
   * than once, so `It` has to be at least a forward iterator.
   */
  template<typename It>
  void insert(It b, It e) {
    const size_type d = std::distance(b, e);
    if (d == 0) {
      // Nothing to add; skip the merge/swap round trip entirely.
      return;
    }
    if (d == 1) {
      insert(*b);
      return;
    }

    // Scratch buffer reused across calls (per thread) to avoid reallocating.
    static thread_local container merged;
    merged.resize(0);
    merged.reserve(elements.size() + d);

    if (std::is_sorted(b, e, comp)) {
      std::merge(elements.begin(), elements.end(), b, e, std::back_inserter(merged), comp);
    }
    else {
      static thread_local container sorted;
      sorted.assign(b, e);
      std::sort(sorted.begin(), sorted.end(), comp);
      std::merge(elements.begin(), elements.end(), sorted.begin(), sorted.end(), std::back_inserter(merged), comp);
    }

    // After the swap `merged` holds the previous storage, ready for reuse.
    merged.swap(elements);
    // De-duplicate with the same notion of equivalence that the
    // single-value insert() uses, rather than operator==.
    auto it = std::unique(elements.begin(), elements.end(),
                          [this](const T& lhs, const T& rhs) {
                            return !comp(lhs, rhs) && !comp(rhs, lhs);
                          });
    elements.erase(it, elements.end());
  }

  /// Alias for insert(t); the value is placed in sorted position, not at the end.
  void push_back(T t) {
    insert(t);
  }

  /**
   * Remove the element equivalent to `t`, if there is one.
   * @return true when an element was removed
   */
  bool erase(T t) {
    // Cheap rejection of values outside the stored range.
    if (elements.empty() || comp(elements.back(), t) || comp(t, elements.front())) {
      return false;
    }
    auto it = lower_bound(t);
    if (it != elements.end() && !comp(*it, t) && !comp(t, *it)) {
      elements.erase(it);
      return true;
    }
    return false;
  }

  /// Remove the element at `it` and return the position that follows it.
  const_iterator erase(const_iterator it) {
    size_type o = std::distance<const_iterator>(elements.begin(), it);
    return elements.erase(elements.begin() + o);
  }

  /// Remove every value listed in [b, e). The range must not point into
  /// this container, since each removal shifts the stored elements.
  template<typename It>
  void erase(It b, It e) {
    for (; b != e; ++b) {
      erase(*b);
    }
  }

  /// Locate the element equivalent to `t`; returns end() when absent.
  const_iterator find(T t) const {
    if (elements.empty() || comp(elements.back(), t) || comp(t, elements.front())) {
      return elements.end();
    }
    const_iterator it = lower_bound(t);
    if (it == elements.end() || comp(*it, t) || comp(t, *it)) {
      return elements.end();
    }
    return it;
  }

  /// 1 when an element equivalent to `t` is stored, otherwise 0.
  size_t count(T t) const {
    return (find(t) != end());
  }

  /// Mutable iteration; writing through these iterators can break the ordering.
  iterator begin() {
    return elements.begin();
  }

  iterator end() {
    return elements.end();
  }

  const_iterator begin() const {
    return elements.begin();
  }

  const_iterator end() const {
    return elements.end();
  }

  const_iterator cbegin() const {
    return elements.cbegin();
  }

  const_iterator cend() const {
    return elements.cend();
  }

  const_reverse_iterator rbegin() const {
    return elements.rbegin();
  }

  const_reverse_iterator rend() const {
    return elements.rend();
  }

  /// Copy of the first element; the container must not be empty.
  T front() const {
    return elements.front();
  }

  /// Copy of the last element; the container must not be empty.
  T back() const {
    return elements.back();
  }

  /// First position whose element does not order before `t`.
  iterator lower_bound(T t) {
    return std::lower_bound(elements.begin(), elements.end(), t, comp);
  }

  const_iterator lower_bound(T t) const {
    return std::lower_bound(elements.begin(), elements.end(), t, comp);
  }

  /// First position whose element orders after `t`.
  const_iterator upper_bound(T t) const {
    return std::upper_bound(elements.begin(), elements.end(), t, comp);
  }

  /**
   * Report whether this container and `other` share at least one
   * equivalent element. Both sides use the same `Comp`, which is what
   * makes the simultaneous forward walk valid.
   */
  bool intersects(const sorted_vector& other) const {
    auto ti = begin();
    auto oi = other.begin();
    auto te = end();
    auto oe = other.end();
    while (ti != te && oi != oe) {
      if (comp(*ti, *oi)) {
        ++ti;
      }
      else if (comp(*oi, *ti)) {
        ++oi;
      }
      else {
        // Neither orders before the other: equivalent under Comp.
        return true;
      }
    }
    return false;
  }

  size_type size() const {
    return elements.size();
  }

  size_type capacity() const {
    return elements.capacity();
  }

  bool empty() const {
    return elements.empty();
  }

  /// Replace the contents with the values of [b, e), sorting and
  /// de-duplicating as needed. The range must not point into this container.
  template<typename It>
  void assign(It b, It e) {
    clear();
    insert(b, e);
  }

  /**
   * Replace the contents with [b, e) taken from a std::vector<T>.
   *
   * A range that is already strictly increasing under `Comp` (for example
   * one taken from another sorted_vector) is copied directly. Any other
   * range is sorted and de-duplicated so the invariant still holds.
   */
  void assign(const_iterator b, const_iterator e) {
    const bool strictly_increasing =
      std::adjacent_find(b, e, [this](const T& lhs, const T& rhs) {
        return !comp(lhs, rhs);
      }) == e;
    if (strictly_increasing) {
      elements.assign(b, e);
      return;
    }
    // Copy first so the source stays valid while this container is rebuilt.
    container copy(b, e);
    clear();
    insert(copy.begin(), copy.end());
  }

  void swap(sorted_vector& other) {
    elements.swap(other.elements);
  }

  void clear() {
    elements.clear();
  }

  /// Re-sort the stored elements with the container's comparator, e.g.
  /// after they were modified through get() or mutable iterators.
  /// Duplicates are not removed.
  void sort() {
    std::sort(elements.begin(), elements.end(), comp);
  }

  /// Remove the last (greatest) element; the container must not be empty.
  void pop_back() {
    elements.pop_back();
  }

  /// Direct access to the underlying vector. Changes made through it are
  /// not checked against the sorted/unique invariant.
  container& get() {
    return elements;
  }

  /// Lexicographic ordering of the element sequences under `Comp`, which
  /// allows sorted_vector to serve as a key in ordered containers.
  bool operator<(const sorted_vector& o) const {
    return std::lexicographical_compare(elements.begin(), elements.end(),
                                        o.elements.begin(), o.elements.end(),
                                        comp);
  }

private:
  container elements;
  Comp comp;
};

#endif
16 changes: 8 additions & 8 deletions lttoolbox/transducer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <lttoolbox/my_stdio.h>
#include <lttoolbox/deserialiser.h>
#include <lttoolbox/serialiser.h>
#include <lttoolbox/sorted_vector.hpp>

#include <cstdlib>
#include <iostream>
Expand Down Expand Up @@ -314,16 +315,16 @@ Transducer::isEmptyIntersection(std::set<int> const &s1, std::set<int> const &s2
void
Transducer::determinize(int const epsilon_tag)
{
std::vector<std::set<int> > R(2);
std::vector<std::set<int>> Q_prime;
std::map<std::set<int>, int> Q_prime_inv;
std::vector<sorted_vector<int>> R(2);
std::vector<sorted_vector<int>> Q_prime;
std::map<sorted_vector<int>, int> Q_prime_inv;

std::map<int, std::multimap<int, std::pair<int, double> > > transitions_prime;

// We're almost certainly going to need the closure of (nearly) every
// state, and we're often going to need the closure several times,
// so it's faster to precompute (though it does slow things down a bit).
std::vector<std::set<int>> all_closures;
std::vector<sorted_vector<int>> all_closures;
all_closures.reserve(transitions.size());
for (size_t i = 0; i < transitions.size(); i++) {
all_closures.push_back(closure(i, epsilon_tag));
Expand All @@ -345,7 +346,7 @@ Transducer::determinize(int const epsilon_tag)

int t = 0;

std::set<int> finals_state;
sorted_vector<int> finals_state;
for(auto& it : finals) {
finals_state.insert(it.first);
}
Expand All @@ -357,8 +358,7 @@ Transducer::determinize(int const epsilon_tag)

for(auto& it : R[t])
{
if(!isEmptyIntersection(Q_prime[it], finals_state))
{
if (Q_prime[it].intersects(finals_state)) {
double w = default_weight;
auto it3 = finals.find(it);
if (it3 != finals.end()) {
Expand All @@ -367,7 +367,7 @@ Transducer::determinize(int const epsilon_tag)
finals_prime.insert(std::make_pair(it, w));
}

std::map<std::pair<int, double>, std::set<int> > mymap;
std::map<std::pair<int, double>, sorted_vector<int> > mymap;

for(auto& it2 : Q_prime[it])
{
Expand Down

0 comments on commit 3edf7c0

Please sign in to comment.