From 3542261261ae1213c6e038a0ad38f17a10b4e80d Mon Sep 17 00:00:00 2001 From: Tanmai Khanna Date: Wed, 22 Jul 2020 19:18:28 +0530 Subject: [PATCH] Wordbound blank handling in post generation (#102) * Wordbound blanks merge when words merge. * Wordbound blanks apply to all output words when output of postgen rule are more than input words. * No regression for postgeneration without wordbound blanks. * Lots of tests added. --- lttoolbox/fst_processor.cc | 176 ++++++++++++++++++++++++++++++++++--- lttoolbox/fst_processor.h | 46 +++++++++- tests/data/postgen.dix | 98 +++++++++++++++++++++ tests/lt_proc/__init__.py | 54 ++++++++++++ 4 files changed, 363 insertions(+), 11 deletions(-) create mode 100644 tests/data/postgen.dix diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index d462341a..42decd4f 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -274,6 +274,61 @@ FSTProcessor::readWblank(FILE *input) return result; } +bool +FSTProcessor::wblankPostGen(FILE *input, FILE *output) +{ + wstring result = L""; + result += L"[["; + wchar_t c = 0; + + while(!feof(input)) + { + c = static_cast(fgetwc_unlocked(input)); + result += c; + + if(c == L'\\') + { + result += c; + result += static_cast(fgetwc_unlocked(input)); + } + else if(c == L']') + { + c = static_cast(fgetwc_unlocked(input)); + result += c; + + if(c == L']') + { + int resultlen = result.size(); + if(result[resultlen-5] == '[' && result[resultlen-4] == '[' && result[resultlen-3] == '/') //ending blank [[/]] + { + fputws(result.c_str(), output); + break; + } + else + { + c = static_cast(fgetwc_unlocked(input)); + if(c == L'~') + { + wblankqueue.push(result); + return true; + } + else + { + result += c; + } + } + } + } + } + + if(c != L']') + { + streamError(); + } + + return false; +} + int FSTProcessor::readAnalysis(FILE *input) { @@ -425,7 +480,7 @@ FSTProcessor::readTMAnalysis(FILE *input) } int -FSTProcessor::readPostgeneration(FILE *input) +FSTProcessor::readPostgeneration(FILE *input, FILE *output) { if(!input_buffer.isEmpty()) { @@ -434,6 +489,7 @@ FSTProcessor::readPostgeneration(FILE *input) wchar_t val = static_cast(fgetwc_unlocked(input)); int altval = 0; + is_wblank = false; if(feof(input)) { return 0; @@ -451,17 +507,31 @@ FSTProcessor::readPostgeneration(FILE *input) if(val == L'[') { - blankqueue.push(readWblank(input)); + if(collect_wblanks) + { + wblankqueue.push(readWblank(input)); + is_wblank = true; + return static_cast(L' '); + } + else if(wblankPostGen(input, output)) + { + return static_cast(L'~'); + } + else + { + is_wblank = true; + return static_cast(L' '); + } } else { ungetwc_unlocked(val, input); blankqueue.push(readFullBlock(input, L'[', L']')); + + input_buffer.add(static_cast(L' ')); + return static_cast(L' '); } - input_buffer.add(static_cast(L' ')); - return static_cast(L' '); - case L'\\': val = static_cast(fgetwc_unlocked(input)); if(escaped_chars.find(val) == escaped_chars.end()) @@ -732,6 +802,59 @@ FSTProcessor::flushBlanks(FILE *output) } } +void +FSTProcessor::flushWblanks(FILE *output) +{ + while(wblankqueue.size() > 0) + { + fputws_unlocked(wblankqueue.front().c_str(), output); + wblankqueue.pop(); + } +} + +wstring +FSTProcessor::combineWblanks() +{ + wstring final_wblank; + wstring last_wblank = L""; + + while(wblankqueue.size() > 0) + { + if(wblankqueue.front().compare(L"[[/]]") == 0) + { + if(final_wblank.empty()) + { + final_wblank += L"[["; + } + else if(final_wblank.size() > 2) + { + final_wblank += L"; "; + } + + final_wblank += last_wblank.substr(2,last_wblank.size()-4); //add wblank without brackets [[..]] + last_wblank.clear(); + } + else + { + last_wblank = wblankqueue.front(); + } + wblankqueue.pop(); + } + + if(!last_wblank.empty()) + { + wblankqueue.push(last_wblank); + } + + if(!final_wblank.empty()) + { + final_wblank += L"]]"; + need_end_wblank = true; + } + + return final_wblank; +} + void FSTProcessor::calcInitial() { @@ -2093,36 +2216,65 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) } bool skip_mode = true; + collect_wblanks = false; + need_end_wblank = false; State current_state = initial_state; wstring lf = L""; wstring sf = L""; int last = 0; set empty_escaped_chars; - while(wchar_t val = readPostgeneration(input)) + while(wchar_t val = readPostgeneration(input, output)) { if(val == L'~') { skip_mode = false; + collect_wblanks = true; } - if(skip_mode) + if(is_wblank && skip_mode) + { + //do nothing + } + else if(skip_mode) { if(iswspace(val)) { + if(need_end_wblank) + { + fputws_unlocked(L"[[/]]", output); + need_end_wblank = false; + } + printSpace(val, output); } else { + if(!need_end_wblank) + { + flushWblanks(output); + } + if(isEscaped(val)) { fputwc_unlocked(L'\\', output); } fputwc_unlocked(val, output); + + if(need_end_wblank) + { + fputws_unlocked(L"[[/]]", output); + need_end_wblank = false; + } } } else { + if(is_wblank) + { + continue; + } + // test for final states if(current_state.isFinal(all_finals)) { @@ -2199,6 +2351,9 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) } else { + wstring final_wblank = combineWblanks(); + fputws_unlocked(final_wblank.c_str(), output); + if(lf == L"") { unsigned int mark = sf.size(); @@ -2239,11 +2394,12 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) fputwc_unlocked(val, output); } } - + current_state = initial_state; lf = L""; sf = L""; skip_mode = true; + collect_wblanks = false; } } } @@ -2269,7 +2425,7 @@ FSTProcessor::intergeneration(FILE *input, FILE *output) while (true) { - wchar_t val = readPostgeneration(input); + wchar_t val = readPostgeneration(input, output); if (val == L'~') { @@ -2414,7 +2570,7 @@ FSTProcessor::transliteration(FILE *input, FILE *output) wstring sf = L""; int last = 0; - while(wchar_t val = readPostgeneration(input)) + while(wchar_t val = readPostgeneration(input, output)) { if(iswpunct(val) || iswspace(val)) { diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index 32761513..9d1e62e4 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -103,6 +103,11 @@ class FSTProcessor */ queue blankqueue; + /** + * Queue of wordbound blanks, used in reading methods + */ + queue wblankqueue; + /** * Set of characters being considered alphabetics */ @@ -226,6 +231,21 @@ class FSTProcessor * Output no more than 'N' number of weighted analyses */ int maxAnalyses; + + /** + * True if a wblank block ([[..]]xyz[[/]]) was just read + */ + bool is_wblank; + + /** + * True if skip_mode is false and need to collect wblanks + */ + bool collect_wblanks; + + /** + * True if a wblank has been processed for postgen and we need an ending wblank + */ + bool need_end_wblank; /** * Output no more than 'N' best weight classes @@ -257,6 +277,14 @@ class FSTProcessor * @param input the stream being read */ wstring readWblank(FILE *input); + + /** + * Reads a wordbound blank (opening blank to closing blank) from the stream input -> [[...]]xyz[[/]] + * @param input the stream being read + * @param output the stream to write on + * @return true if the word enclosed by the wordbound blank has a ~ for postgeneration activation + */ + bool wblankPostGen(FILE *input, FILE *output); /** * Returns true if the character code is identified as alphabetic @@ -282,6 +310,7 @@ class FSTProcessor /** * Read text from stream (decomposition version) * @param input the stream to read + * @param output the stream to write on * @return the next symbol in the stream */ int readDecomposition(FILE *input, FILE *output); @@ -289,13 +318,15 @@ class FSTProcessor /** * Read text from stream (postgeneration version) * @param input the stream to read + * @param output the stream to write on * @return the next symbol in the stream */ - int readPostgeneration(FILE *input); + int readPostgeneration(FILE *input, FILE *output); /** * Read text from stream (generation version) * @param input the stream to read + * @param output the stream being written to * @return the next symbol in the stream */ int readGeneration(FILE *input, FILE *output); @@ -303,6 +334,7 @@ class FSTProcessor /** * Read text from stream (biltrans version) * @param input the stream to read + * @param output the stream to write on * @return the queue of 0-symbols, and the next symbol in the stream */ pair readBilingual(FILE *input, FILE *output); @@ -319,6 +351,18 @@ class FSTProcessor * @param output stream to write blanks */ void flushBlanks(FILE *output); + + /** + * Flush all the wordbound blanks remaining in the current process + * @param output stream to write blanks + */ + void flushWblanks(FILE *output); + + /** + * Combine wordbound blanks in the queue and return them + * @return final wblank string + */ + wstring combineWblanks(); /** * Calculate the initial state of parsing diff --git a/tests/data/postgen.dix b/tests/data/postgen.dix new file mode 100644 index 00000000..1bea34b0 --- /dev/null +++ b/tests/data/postgen.dix @@ -0,0 +1,98 @@ + + + + + + + + + + + + +

+ el + l +

+
+ +

+ el + l +

+
+
+ + + +

+ la + sela +

+
+ +

+ las + selas +

+
+ +

+ lo + selo +

+
+ +

+ los + selos +

+
+
+ + +
+ +
+ + +

+ de + de +

+ +
+ + +

+ oho + uho +

+ + + +

+ le + +

+ + + + +

+ les + lepetest +

+ + + +

+ lespes + lespestest +

+ + +
+ +
+ diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py index 5455c485..a4e07095 100644 --- a/tests/lt_proc/__init__.py +++ b/tests/lt_proc/__init__.py @@ -139,5 +139,59 @@ class WordboundBlankAnalysisTest(unittest.TestCase, ProcTest): "[[t:b:456123; t:i:90hfbn]]^legge/legge$ [[t:s:xyz789]]^opp/opp$ ^opp/opp$ [[t:b:abc124]]^x/*x$ ^opp/opp$^./.$", ] +class PostgenerationBasicTest(unittest.TestCase, ProcTest): + procdix = "data/postgen.dix" + procflags = ["-p", "-z"] + inputs = [ "xyz ejemplo ~o ho nombre.", + "xyz ~le la pelota.", + "El perro ~de el amigo.", + "abc ~les testword"] + expectedOutputs = [ "xyz ejemplo u ho nombre.", + "xyz se la pelota.", + "El perro del amigo.", + "abc le pe test testword"] + +class PostgenerationWordboundBlankTest(unittest.TestCase, ProcTest): + procdix = "data/postgen.dix" + procflags = ["-p", "-z"] + inputs = [ "xyz ejemplo [[t:i:123456]]~o[[/]] [[t:b:abc123; t:i:123456]]ho[[/]] [[t:b:iopmnb]]nombre[[/]].", + "xyz ejemplo [[t:b:poim230]]~o[[/]] ho [[t:i:mnbj203]]nombre[[/]].", + "xyz ejemplo ~o [[t:b:abc123; t:i:123456]]ho[[/]] [[t:b:iopmnb]]nombre[[/]].", + "xyz ejemplo ~o [[t:b:abc123; t:i:123456]]ho[[/]] ~le la [[t:b:iopmnb]]nombre[[/]].", + "xyz ejemplo [[t:i:1235gb]]~o[[/]] [[t:b:abc123; t:i:123456]]ho[[/]] [[t:b:i4x56fb]]~le[[/]] la nombre.", + "xyz [[t:i:123456]]~le[[/]] [[t:b:123gfv]]la[[/]] pelota.", + "xyz ~le [[t:b:123gfv]]la[[/]] pelota.", + "xyz ejemplo ~o [[t:b:abc123; t:i:123456]]ho[[/]] ~le [[t:b:io1245b]]la[[/]] [[t:b:iopmnb]]nombre[[/]].", + "[[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]amigo[[/]][]", + "[[t:b:h5lVhA]]El[[/]] [[t:b:Z9eiLA; t:i:4_tPUA]]perro[[/]] [[t:b:Z9eiLA; t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:npAFwg]]amigo[[/]][]", + "[[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] [[t:i:4_tPUA]]~de[[/]] el [[t:i:wSM6RQ]]amigo[[/]][]", + "[[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] ~de [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]amigo[[/]][]", + "[[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] ~de el [[t:i:wSM6RQ]]amigo[[/]][]", + "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:b:abc123; t:i:123456]]testword[[/]]", + "[[t:b:Z9eiLA]]abc[[/]] ~les [[t:b:abc123; t:i:123456]]testword[[/]]", + "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[[t:b:Z9eiLA]]abc[[/]] ~les [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]"] + + expectedOutputs = [ "xyz ejemplo [[t:i:123456; t:b:abc123; t:i:123456]]u ho[[/]] [[t:b:iopmnb]]nombre[[/]].", + "xyz ejemplo [[t:b:poim230]]u ho[[/]] [[t:i:mnbj203]]nombre[[/]].", + "xyz ejemplo [[t:b:abc123; t:i:123456]]u ho[[/]] [[t:b:iopmnb]]nombre[[/]].", + "xyz ejemplo [[t:b:abc123; t:i:123456]]u ho[[/]] se la [[t:b:iopmnb]]nombre[[/]].", + "xyz ejemplo [[t:i:1235gb; t:b:abc123; t:i:123456]]u ho[[/]] [[t:b:i4x56fb]]se la[[/]] nombre.", + "xyz [[t:i:123456; t:b:123gfv]]se la[[/]] pelota.", + "xyz [[t:b:123gfv]]se la[[/]] pelota.", + "xyz ejemplo [[t:b:abc123; t:i:123456]]u ho[[/]] [[t:b:io1245b]]se la[[/]] [[t:b:iopmnb]]nombre[[/]].", + "[[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]amigo[[/]][]", + "[[t:b:h5lVhA]]El[[/]] [[t:b:Z9eiLA; t:i:4_tPUA]]perro[[/]] [[t:b:Z9eiLA; t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:npAFwg]]amigo[[/]][]", + "[[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] [[t:i:4_tPUA]]del[[/]] [[t:i:wSM6RQ]]amigo[[/]][]", + "[[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] [[t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]amigo[[/]][]", + "[[t:b:Z9eiLA]]El[[/]] [[t:s:8AjRFw]]perro[[/]] del [[t:i:wSM6RQ]]amigo[[/]][]", + "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]le pe test[[/]] [[t:b:abc123; t:i:123456]]testword[[/]]", + "[[t:b:Z9eiLA]]abc[[/]] le pe test [[t:b:abc123; t:i:123456]]testword[[/]]", + "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]le pe test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456; t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[[t:b:Z9eiLA]]abc[[/]] [[t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]"] # These fail on some systems: #from null_flush_invalid_stream_format import *