From 2571d1aac054322a6cc5f14fcb0f87a512114b62 Mon Sep 17 00:00:00 2001
From: Kevin Brubeck Unhammer
Date: Fri, 11 Jun 2021 14:56:08 +0200
Subject: [PATCH] =?UTF-8?q?Fix=20#107=20=E2=80=93=20postgenerator=20fails?=
=?UTF-8?q?=20when=20~=20in=20middle=20of=20wblanked=20word?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
There was already support for [[w:text:foo]]~zzy[[/]], but not for
[[w:text:foo]]xy~zzy[[/]].
This change keeps the old behaviour of moving the wblank start to after
the post-generated part of [[w:text:foo]]~zzy[[/]] (so if there is a
rule turning ~z into Z, we get Z[[w:text:foo]]zy[[/]]). However, if the
wake-up mark ~ appears some characters into the wblanked word, the
post-generated text stays inside the wblank, so
[[w:text:foo]]xy~zzy[[/]] turns into [[w:text:foo]]xyZzy[[/]].
Also adds tests for https://github.com/apertium/lttoolbox/issues/107
---
lttoolbox/fst_processor.cc | 30 +++++++++++++++++++-----------
tests/data/postgen.dix | 2 ++
tests/lt_proc/__init__.py | 8 ++++++--
3 files changed, 27 insertions(+), 13 deletions(-)
diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc
index a676a6d9..d46ba019 100644
--- a/lttoolbox/fst_processor.cc
+++ b/lttoolbox/fst_processor.cc
@@ -280,11 +280,28 @@ FSTProcessor::wblankPostGen(FILE *input, FILE *output)
wstring result = L"";
result += L"[[";
wchar_t c = 0;
+ bool in_content = false;
while(!feof(input))
{
c = static_cast(fgetwc_unlocked(input));
- result += c;
+
+ if(in_content && c == L'~')
+ {
+ if(result[result.size()-1] == L']') {
+ // We just saw the end of a wblank, may want to merge
+ wblankqueue.push(result);
+ }
+ else {
+ // wake-up-mark happened some characters into the wblanked word
+ fputws(result.c_str(), output);
+ }
+ return true;
+ }
+ else
+ {
+ result += c;
+ }
if(c == L'\\')
{
@@ -305,16 +322,7 @@ FSTProcessor::wblankPostGen(FILE *input, FILE *output)
}
else
{
- c = static_cast(fgetwc_unlocked(input));
- if(c == L'~')
- {
- wblankqueue.push(result);
- return true;
- }
- else
- {
- result += c;
- }
+ in_content = true; // Assumption: No nested wblanks, always balanced
}
}
}
diff --git a/tests/data/postgen.dix b/tests/data/postgen.dix
index 1bea34b0..6f90d270 100644
--- a/tests/data/postgen.dix
+++ b/tests/data/postgen.dix
@@ -92,6 +92,8 @@
+ sssss
+
diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py
index ffc9236b..e46b8ad2 100644
--- a/tests/lt_proc/__init__.py
+++ b/tests/lt_proc/__init__.py
@@ -173,7 +173,9 @@ class PostgenerationWordboundBlankTest(ProcTest):
"[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]",
"[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]",
"[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]",
- "[[t:b:Z9eiLA]]abc[[/]] ~les [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]"]
+ "[[t:b:Z9eiLA]]abc[[/]] ~les [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]",
+ "[[t:text:NaNaNa]]pla~ssar[[/]]",
+ "[[t:text:NaNaNa]]pla~sssar[[/]]"]
expectedOutputs = [ "xyz ejemplo [[t:i:123456; t:b:abc123; t:i:123456]]u ho[[/]] [[t:b:iopmnb]]nombre[[/]].",
"xyz ejemplo [[t:b:poim230]]u ho[[/]] [[t:i:mnbj203]]nombre[[/]].",
@@ -193,7 +195,9 @@ class PostgenerationWordboundBlankTest(ProcTest):
"[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]le pe test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]",
"[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]",
"[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456; t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]",
- "[[t:b:Z9eiLA]]abc[[/]] [[t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]"]
+ "[[t:b:Z9eiLA]]abc[[/]] [[t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]",
+ "[[t:text:NaNaNa]]plassar[[/]]",
+ "[[t:text:NaNaNa]]plassar[[/]]"]
class PostgenerationWordboundBlankEscapingTest(ProcTest):