From 042b085f2afd1b46c003c36e5d795e9ec16a2338 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Thu, 12 Sep 2024 10:11:45 -0400 Subject: [PATCH] Add filterFinalsArray() and refactor biltrans This allows biltrans code to be cleaner, since it doesn't have to deal with splitting and re-merging the result string, and also causes it to handle tags with slashes more correctly. --- CMakeLists.txt | 2 +- lttoolbox/fst_processor.cc | 160 +++++++++++-------------------------- lttoolbox/fst_processor.h | 6 +- lttoolbox/state.cc | 125 ++++++++++++----------------- lttoolbox/state.h | 14 ++++ tests/data/slash-tags.dix | 15 ++++ tests/lt_proc/__init__.py | 17 ++++ 7 files changed, 147 insertions(+), 192 deletions(-) create mode 100644 tests/data/slash-tags.dix diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c156ae..27cd6ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.12 FATAL_ERROR) cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}) project(lttoolbox - VERSION 3.7.14 + VERSION 3.7.15 LANGUAGES CXX C ) set(VERSION ${PROJECT_VERSION}) diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 643afd6..61f2044 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -1711,8 +1711,7 @@ FSTProcessor::transliteration(InputFile& input, UFILE *output) } bool -FSTProcessor::step_biltrans(UStringView word, UString& result, UString& queue, - bool delim, bool mark) +FSTProcessor::step_biltrans(UStringView word, std::vector& result, UString& queue) { State current_state = initial_state; bool firstupper = u_isupper(word[0]); @@ -1723,13 +1722,11 @@ FSTProcessor::step_biltrans(UStringView word, UString& result, UString& queue, current_state.step(val, beCaseSensitive(current_state)); } if (current_state.isFinal(all_finals)) { - result.clear(); - if (delim) result += '^'; - if (mark) result += '='; - result += current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0).substr(1); + current_state.filterFinalsArray(result, + all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0); } if (current_state.size() == 0) { if (!result.empty()) queue.append(symbol); @@ -1742,7 +1739,7 @@ FSTProcessor::step_biltrans(UStringView word, UString& result, UString& queue, UString FSTProcessor::biltransfull(UStringView input_word, bool with_delim) { - UString result; + std::vector result; unsigned int start_point = 1; unsigned int end_point = input_word.size()-2; UString queue; @@ -1766,7 +1763,7 @@ FSTProcessor::biltransfull(UStringView input_word, bool with_delim) } auto word = input_word.substr(start_point, end_point-start_point); - bool exists = step_biltrans(word, result, queue, with_delim, mark); + bool exists = step_biltrans(word, result, queue); if (!exists) { if (with_delim) return "^@"_u + US(input_word.substr(1)); else return "@"_u + US(input_word); @@ -1778,23 +1775,7 @@ FSTProcessor::biltransfull(UStringView input_word, bool with_delim) } // attach unmatched queue automatically - if(!queue.empty()) - { - UString result_with_queue = compose(result, queue); - if(with_delim) - { - result_with_queue += '$'; - } - return result_with_queue; - } - else - { - if(with_delim) - { - result += '$'; - } - return result; - } + return compose(result, queue, with_delim, mark); } @@ -1803,7 +1784,7 @@ UString FSTProcessor::biltrans(UStringView input_word, bool with_delim) { State current_state = initial_state; - UString result; + std::vector result; unsigned int start_point = 1; unsigned int end_point = input_word.size()-2; UString queue; @@ -1827,7 +1808,7 @@ FSTProcessor::biltrans(UStringView input_word, bool with_delim) } UStringView word = input_word.substr(start_point, end_point-start_point); - bool exists = step_biltrans(word, result, queue, with_delim, mark); + bool exists = step_biltrans(word, result, queue); if (!exists) { if (with_delim) return "^@"_u + US(input_word.substr(1)); else return "@"_u + US(input_word); @@ -1835,47 +1816,24 @@ FSTProcessor::biltrans(UStringView input_word, bool with_delim) // attach unmatched queue automatically - if(!queue.empty()) - { - UString result_with_queue = compose(result, queue); - if(with_delim) - { - result_with_queue += '$'; - } - return result_with_queue; - } - else - { - if(with_delim) - { - result += '$'; - } - return result; - } + return compose(result, queue, with_delim, mark); } UString -FSTProcessor::compose(UStringView lexforms, UStringView queue) const +FSTProcessor::compose(const std::vector& lexforms, UStringView queue, + bool delim, bool mark) const { UString result; - result.reserve(lexforms.size() + 2 * queue.size()); - result += '/'; - - for(unsigned int i = 1; i< lexforms.size(); i++) - { - if(lexforms[i] == '\\') - { - result += '\\'; - i++; - } - else if(lexforms[i] == '/') - { - result.append(queue); - } - result += lexforms[i]; - } - - result += queue; + if (delim) result += '^'; + if (mark) result += '='; + bool first = true; + for (auto& it : lexforms) { + if (!first) result += '/'; + first = false; + result += it; + result += queue; + } + if (delim) result += '$'; return result; } @@ -2060,16 +2018,17 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode) bool seenTags = false; size_t queue_start = 0; - UString result; + std::vector result; for (size_t i = 0; i < symbols.size(); i++) { seenTags = seenTags || alphabet.isTag(symbols[i]); current_state.step_case(symbols[i], beCaseSensitive(current_state)); if (current_state.isFinal(all_finals)) { queue_start = i; - result = current_state.filterFinals(all_finals, alphabet, escaped_chars, - displayWeightsMode, maxAnalyses, - maxWeightClasses, uppercase, - firstupper, 0); + current_state.filterFinalsArray(result, + all_finals, alphabet, escaped_chars, + displayWeightsMode, maxAnalyses, + maxWeightClasses, uppercase, + firstupper, 0); } } // if there are no tags, we only return complete matches @@ -2084,11 +2043,11 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode) } write(source, output); + u_fputc('/', output); if (!result.empty()) { write(compose(result, source.substr(queue_pos)), output); } else { - u_fputc('/', output); u_fputc((mode == gm_all ? '#' : '@'), output); write(source, output); } @@ -2100,7 +2059,8 @@ std::pair FSTProcessor::biltransWithQueue(UStringView input_word, bool with_delim) { State current_state = initial_state; - UString result; + std::vector result; + std::vector temp; unsigned int start_point = 1; unsigned int end_point = input_word.size()-2; UString queue; @@ -2142,17 +2102,10 @@ FSTProcessor::biltransWithQueue(UStringView input_word, bool with_delim) } if(current_state.isFinal(all_finals)) { - result.clear(); - if (with_delim) { - result += '^'; - } - if (mark) { - result += '='; - } - result += current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0).substr(1); + current_state.filterFinalsArray(result, all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0); } if(current_state.size() == 0) @@ -2166,13 +2119,12 @@ FSTProcessor::biltransWithQueue(UStringView input_word, bool with_delim) // word is not present if(with_delim) { - result = "^@"_u + US(input_word.substr(1)); + return {"^@"_u + US(input_word.substr(1)), 0}; } else { - result = "@"_u + US(input_word); + return {"@"_u + US(input_word), 0}; } - return std::pair(result, 0); } } } @@ -2185,43 +2137,25 @@ FSTProcessor::biltransWithQueue(UStringView input_word, bool with_delim) // word is not present if(with_delim) { - result = "^@"_u + US(input_word.substr(1)); + return {"^@"_u + US(input_word.substr(1)), 0}; } else { - result = "@"_u + US(input_word); + return {"@"_u + US(input_word), 0}; } - return {result, 0}; } // attach unmatched queue automatically - - if(!queue.empty()) - { - UString result_with_queue = compose(result, queue); - if(with_delim) - { - result_with_queue += '$'; - } - return {result_with_queue, queue.size()}; - } - else - { - if(with_delim) - { - result += '$'; - } - return {result, 0}; - } + return {compose(result, queue, with_delim, mark), queue.size()}; } UString FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim) { State current_state = initial_state; - UString result; + std::vector result; unsigned int start_point = 1; unsigned int end_point = input_word.size()-2; bool mark = false; @@ -2245,17 +2179,13 @@ FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim) auto word = input_word.substr(start_point, end_point-start_point); UString queue; - bool exists = step_biltrans(word, result, queue, with_delim, mark); + bool exists = step_biltrans(word, result, queue); if (!exists || !queue.empty()) { if (with_delim) return "^@"_u + US(input_word.substr(1)); else return "@"_u + US(input_word); } - if(with_delim) - { - result += '$'; - } - return result; + return compose(result, ""_u, with_delim, mark); } diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index 940789a..edc3969 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -429,9 +429,9 @@ class FSTProcessor void analysis_wrapper_null_flush(InputFile& input, UFILE *output); void generation_wrapper_null_flush(InputFile& input, UFILE *output, GenerationMode mode); - UString compose(UStringView lexforms, UStringView queue) const; - bool step_biltrans(UStringView word, UString& result, UString& queue, - bool delim, bool mark); + UString compose(const std::vector& lexforms, UStringView queue, + bool delim = false, bool mark = false) const; + bool step_biltrans(UStringView word, std::vector& result, UString& queue); void procNodeICX(); void procNodeRCX(); diff --git a/lttoolbox/state.cc b/lttoolbox/state.cc index 39bba34..d09f84f 100644 --- a/lttoolbox/state.cc +++ b/lttoolbox/state.cc @@ -484,93 +484,72 @@ State::NFinals(std::vector> lf, int maxAnalyses, int return result; } - -UString -State::filterFinals(std::map const &finals, - Alphabet const &alphabet, - std::set const &escaped_chars, - bool display_weights, int max_analyses, int max_weight_classes, - bool uppercase, bool firstupper, int firstchar) const +void +State::filterFinalsArray(std::vector& result, + std::map const &finals, + Alphabet const &alphabet, + std::set const &escaped_chars, + bool display_weights, + int max_analyses, int max_weight_classes, + bool uppercase, bool firstupper, int firstchar) const { std::vector> response; - - UString result; + UString temp; double cost = 0.0000; - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - if(finals.find(state[i].where) != finals.end()) - { - if(state[i].dirty) - { - result.clear(); - cost = 0.0000; - unsigned int const first_char = result.size() + firstchar; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) - { - if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) - { - result += '\\'; - } - alphabet.getSymbol(result, ((*(state[i].sequence))[j]).first, uppercase); - cost += ((*(state[i].sequence))[j]).second; - } - if(firstupper) - { - if(result[first_char] == '~') - { - // skip post-generation mark - result[first_char+1] = u_toupper(result[first_char+1]); - } - else - { - result[first_char] = u_toupper(result[first_char]); - } - } - } - else - { - result.clear(); - cost = 0.0000; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) - { - if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) - { - result += '\\'; - } - alphabet.getSymbol(result, ((*(state[i].sequence))[j]).first); - cost += ((*(state[i].sequence))[j]).second; - } - } - - // Add the weight of the final state - cost += (*(finals.find(state[i].where))).second; - response.push_back({result, cost}); + for (auto& it : state) { + auto fin = finals.find(it.where); + if (fin == finals.end()) continue; + temp.clear(); + cost = fin->second; + for (auto& step : *(it.sequence)) { + if (escaped_chars.find(step.first) != escaped_chars.end()) temp += '\\'; + alphabet.getSymbol(temp, step.first, it.dirty && uppercase); + cost += step.second; + } + if (it.dirty && firstupper) { + int loc = firstchar; + if (temp[loc] == '~') loc++; // skip post-generation mark + temp[loc] = u_toupper(temp[loc]); } + response.push_back({temp, cost}); } response = NFinals(response, max_analyses, max_weight_classes); result.clear(); - std::set seen; - for(auto it = response.begin(); it != response.end(); it++) - { - if(seen.find(it->first) != seen.end()) { - continue; - } - seen.insert(it->first); - result += '/'; - result += it->first; - if(display_weights) - { - UChar temp[16]{}; + sorted_vector seen; + for (auto& it : response) { + if (!seen.insert(it.first).second) continue; + result.push_back(it.first); + if (display_weights) { + UChar w[16]{}; // if anyone wants a weight of 10000, this will not be enough - u_sprintf(temp, "", it->second); - result += temp; + u_sprintf(w, "", it.second); + result.back() += w; } } +} - return result; +UString +State::filterFinals(std::map const &finals, + Alphabet const &alphabet, + std::set const &escaped_chars, + bool display_weights, int max_analyses, int max_weight_classes, + bool uppercase, bool firstupper, int firstchar) const +{ + std::vector result; + filterFinalsArray(result, finals, alphabet, escaped_chars, display_weights, + max_analyses, max_weight_classes, uppercase, firstupper, + firstchar); + + UString ret; + for (auto& it : result) { + ret += '/'; + ret += it; + } + + return ret; } diff --git a/lttoolbox/state.h b/lttoolbox/state.h index 5d46c52..56a7d34 100644 --- a/lttoolbox/state.h +++ b/lttoolbox/state.h @@ -281,6 +281,20 @@ class State bool firstupper = false, int firstchar = 0) const; + /** + * filterFinals(), but write the results into `result` + */ + void filterFinalsArray(std::vector& result, + std::map const &finals, + Alphabet const &a, + std::set const &escaped_chars, + bool display_weights = false, + int max_analyses = INT_MAX, + int max_weight_classes = INT_MAX, + bool uppercase = false, + bool firstupper = false, + int firstchar = 0) const; + /** * Same as previous one, but the output is adapted to the SAO system * @param finals the set of final nodes diff --git a/tests/data/slash-tags.dix b/tests/data/slash-tags.dix new file mode 100644 index 0000000..3dcb552 --- /dev/null +++ b/tests/data/slash-tags.dix @@ -0,0 +1,15 @@ + + + + + + +
+ +

+ *lobwana1.1 + *lopwana1.1 +

+
+
+
\ No newline at end of file diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py index 6429c2d..0fe9c74 100644 --- a/tests/lt_proc/__init__.py +++ b/tests/lt_proc/__init__.py @@ -479,5 +479,22 @@ class BiltransGarbage(ProcTest): inputs = ['^$'] expectedOutputs = ['^$'] +class SlashesInTags(ProcTest): + procdix = 'data/slash-tags.dix' + procflags = ['-b', '-z'] + procdir = 'lr' + inputs = ['^\\*lobwana1.1<1/2>$', + '^\\*lobwana1.1<3/4>$', + '^\\*lobwana1.1<1/2>$', + '^\\*lobwana1.1<3/4>$', + '^\\*lobwana1.1<1/2>$', + '^\\*lobwana1.1<3/4>$'] + expectedOutputs = ['^\\*lobwana1.1<1/2>/*lopwana1.1<1/2>$', + '^\\*lobwana1.1<3/4>/@\\*lobwana1.1<3/4>$', + '^\\*lobwana1.1<1/2>/*lopwana1.1<1/2>$', + '^\\*lobwana1.1<3/4>/@\\*lobwana1.1<3/4>$', + '^\\*lobwana1.1<1/2>/*lopwana1.1<1/2>$', + '^\\*lobwana1.1<3/4>/@\\*lobwana1.1<3/4>$'] + # These fail on some systems: #from null_flush_invalid_stream_format import *