Skip to content

Commit

Permalink
Fix #107 – postgenerator fails when ~ in middle of wblanked word
Browse files Browse the repository at this point in the history
There was already support for [[w:text:foo]]~zzy[[/]] but not for
[[w:text:foo]]xy~zzy[[/]]

This change should keep the old behaviour of moving the wblank-start
after the post-generated part of [[w:text:foo]]~zzy[[/]] (so if there
is a rule turning ~z into Z, we get Z[[w:text:foo]]zy[[/]]). However,
if the wake-up-mark ~ is seen later in the wblanked word, the result
stays surrounded by the wblank, so [[w:text:foo]]xy~zzy[[/]] turns
into [[w:text:foo]]xyZzy[[/]].

+ tests for #107
+ necessary changes for combineWblanks (it can now get called when
  there are no opening wblanks in the queue)
  • Loading branch information
unhammer committed Jun 11, 2021
1 parent 51b0651 commit bd3837f
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 22 deletions.
54 changes: 34 additions & 20 deletions lttoolbox/fst_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -284,11 +284,28 @@ FSTProcessor::wblankPostGen(InputFile& input, UFILE *output)
{
UString result = WBLANK_START;
UChar32 c = 0;
bool in_content = false;

while(!input.eof())
{
c = input.get();
result += c;

if(in_content && c == '~')
{
if(result[result.size()-1]==']') {
// We just saw the end of a wblank, may want to merge
wblankqueue.push(result);
}
else {
// wake-up-mark happened some characters into the wblanked word
write(result, output);
}
return true;
}
else
{
result += c;
}

if(c == '\\')
{
Expand All @@ -309,16 +326,7 @@ FSTProcessor::wblankPostGen(InputFile& input, UFILE *output)
}
else
{
c = input.get();
if(c == '~')
{
wblankqueue.push(result);
return true;
}
else
{
result += c;
}
in_content = true; // Assumption: No nested wblanks, always balanced
}
}
}
Expand Down Expand Up @@ -788,25 +796,32 @@ FSTProcessor::combineWblanks()
{
UString final_wblank;
UString last_wblank;
bool seen_wblank = false;

while(wblankqueue.size() > 0)
{
if(wblankqueue.front().compare(WBLANK_FINAL) == 0)
{
if(final_wblank.empty())
{
final_wblank += WBLANK_START;
if(seen_wblank) {
if(final_wblank.empty())
{
final_wblank += WBLANK_START;
}
else if(final_wblank.size() > 2)
{
final_wblank += "; "_u;
}

final_wblank.append(last_wblank, 2, last_wblank.size()-4); //add wblank without brackets [[..]]
}
else if(final_wblank.size() > 2)
{
final_wblank += "; "_u;
else {
need_end_wblank = true;
}

final_wblank.append(last_wblank, 2, last_wblank.size()-4); //add wblank without brackets [[..]]
last_wblank.clear();
}
else
{
seen_wblank = true;
last_wblank = wblankqueue.front();
}
wblankqueue.pop();
Expand All @@ -822,7 +837,6 @@ FSTProcessor::combineWblanks()
final_wblank += WBLANK_END;
need_end_wblank = true;
}

return final_wblank;
}

Expand Down
8 changes: 8 additions & 0 deletions lttoolbox/fst_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,14 @@ class FSTProcessor

/**
* Combine wordbound blanks in the queue and return them
*
* May pop from 'wblankqueue' and set 'need_end_wblank' to true.
*
* If 'wblankqueue' (see its declaration) is empty, we get an empty
* string; otherwise we return a combination of the opening wblanks
* in the queue, separated by "; ". If there is only a closing
* wblank, we just set 'need_end_wblank'.
*
* @return final wblank string
*/
UString combineWblanks();
Expand Down
2 changes: 2 additions & 0 deletions tests/data/postgen.dix
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@
</p>
</e>

<e><p><l><a/>sss</l><r>ss</r></p></e>

</section>

</dictionary>
Expand Down
12 changes: 10 additions & 2 deletions tests/lt_proc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,11 @@ class PostgenerationWordboundBlankTest(ProcTest):
"[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]",
"[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]",
"[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]",
"[[t:b:Z9eiLA]]abc[[/]] ~les [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]"]
"[[t:b:Z9eiLA]]abc[[/]] ~les [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]",
"[[t:text:NaNaNa]]pla~ss[[/]]",
"[[t:text:NaNaNa]]pla~sss[[/]]",
"[[t:text:NaNaNa]]pla~ssar[[/]]",
"[[t:text:NaNaNa]]pla~sssar[[/]]"]

expectedOutputs = [ "xyz ejemplo [[t:i:123456; t:b:abc123; t:i:123456]]u ho[[/]] [[t:b:iopmnb]]nombre[[/]].",
"xyz ejemplo [[t:b:poim230]]u ho[[/]] [[t:i:mnbj203]]nombre[[/]].",
Expand All @@ -193,7 +197,11 @@ class PostgenerationWordboundBlankTest(ProcTest):
"[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]le pe test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]",
"[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]",
"[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456; t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]",
"[[t:b:Z9eiLA]]abc[[/]] [[t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]"]
"[[t:b:Z9eiLA]]abc[[/]] [[t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]",
"[[t:text:NaNaNa]]plass[[/]]",
"[[t:text:NaNaNa]]plass[[/]]",
"[[t:text:NaNaNa]]plassar[[/]]",
"[[t:text:NaNaNa]]plassar[[/]]"]


class PostgenerationWordboundBlankEscapingTest(ProcTest):
Expand Down

0 comments on commit bd3837f

Please sign in to comment.