From 2571d1aac054322a6cc5f14fcb0f87a512114b62 Mon Sep 17 00:00:00 2001 From: Kevin Brubeck Unhammer Date: Fri, 11 Jun 2021 14:56:08 +0200 Subject: [PATCH] =?UTF-8?q?Fix=20#107=20=E2=80=93=20postgenerator=20fails?= =?UTF-8?q?=20when=20~=20in=20middle=20of=20wblanked=20word?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There was already support for [[w:text:foo]]~zzy[[/]] but not for [[w:text:foo]]xy~zzy[[/]]. This change should keep the old feature of moving wblank-start after the post-generated part of [[w:text:foo]]~zzy[[/]] (so if there was a rule turning ~z into Z we get Z[[w:text:foo]]zy[[/]]), but if the wake-up-mark ~ is seen later in the wblank, it stays surrounded, so [[w:text:foo]]xy~zzy[[/]] turns into [[w:text:foo]]xyZzy[[/]]. + tests for https://github.com/apertium/lttoolbox/issues/107 --- lttoolbox/fst_processor.cc | 30 +++++++++++++++++++----------- tests/data/postgen.dix | 2 ++ tests/lt_proc/__init__.py | 8 ++++++-- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index a676a6d9..d46ba019 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -280,11 +280,28 @@ FSTProcessor::wblankPostGen(FILE *input, FILE *output) wstring result = L""; result += L"[["; wchar_t c = 0; + bool in_content = false; while(!feof(input)) { c = static_cast<wchar_t>(fgetwc_unlocked(input)); - result += c; + + if(in_content && c == L'~') + { + if(result[result.size()-1] == L']') { + // We just saw the end of a wblank, may want to merge + wblankqueue.push(result); + } + else { + // wake-up-mark happened some characters into the wblanked word + fputws(result.c_str(), output); + } + return true; + } + else + { + result += c; + } if(c == L'\\') { @@ -305,16 +322,7 @@ FSTProcessor::wblankPostGen(FILE *input, FILE *output) } else { - c = static_cast<wchar_t>(fgetwc_unlocked(input)); - if(c == L'~') - { - wblankqueue.push(result); - return true; - } - else - { -
result += c; - } + in_content = true; // Assumption: No nested wblanks, always balanced } } } diff --git a/tests/data/postgen.dix b/tests/data/postgen.dix index 1bea34b0..6f90d270 100644 --- a/tests/data/postgen.dix +++ b/tests/data/postgen.dix @@ -92,6 +92,8 @@

+

sssss

+ diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py index ffc9236b..e46b8ad2 100644 --- a/tests/lt_proc/__init__.py +++ b/tests/lt_proc/__init__.py @@ -173,7 +173,9 @@ class PostgenerationWordboundBlankTest(ProcTest): "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", - "[[t:b:Z9eiLA]]abc[[/]] ~les [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]"] + "[[t:b:Z9eiLA]]abc[[/]] ~les [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[[t:text:NaNaNa]]pla~ssar[[/]]", + "[[t:text:NaNaNa]]pla~sssar[[/]]"] expectedOutputs = [ "xyz ejemplo [[t:i:123456; t:b:abc123; t:i:123456]]u ho[[/]] [[t:b:iopmnb]]nombre[[/]].", "xyz ejemplo [[t:b:poim230]]u ho[[/]] [[t:i:mnbj203]]nombre[[/]].", @@ -193,7 +195,9 @@ class PostgenerationWordboundBlankTest(ProcTest): "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]le pe test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456; t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", - "[[t:b:Z9eiLA]]abc[[/]] [[t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]"] + "[[t:b:Z9eiLA]]abc[[/]] [[t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[[t:text:NaNaNa]]plassar[[/]]", + "[[t:text:NaNaNa]]plassar[[/]]"] class PostgenerationWordboundBlankEscapingTest(ProcTest):