Skip to content

Commit

Permalink
Wordbound blank handling in post generation (#102)
Browse files Browse the repository at this point in the history
* Wordbound blanks merge when words merge.

* Wordbound blanks apply to all output words when the output of a postgen rule has more words than its input.

* No regression for postgeneration without wordbound blanks.

* Lots of tests added.
  • Loading branch information
khannatanmai committed Jul 22, 2020
1 parent 1230351 commit 3542261
Show file tree
Hide file tree
Showing 4 changed files with 363 additions and 11 deletions.
176 changes: 166 additions & 10 deletions lttoolbox/fst_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,61 @@ FSTProcessor::readWblank(FILE *input)
return result;
}

bool
FSTProcessor::wblankPostGen(FILE *input, FILE *output)
{
wstring result = L"";
result += L"[[";
wchar_t c = 0;

while(!feof(input))
{
c = static_cast<wchar_t>(fgetwc_unlocked(input));
result += c;

if(c == L'\\')
{
result += c;
result += static_cast<wchar_t>(fgetwc_unlocked(input));
}
else if(c == L']')
{
c = static_cast<wchar_t>(fgetwc_unlocked(input));
result += c;

if(c == L']')
{
int resultlen = result.size();
if(result[resultlen-5] == '[' && result[resultlen-4] == '[' && result[resultlen-3] == '/') //ending blank [[/]]
{
fputws(result.c_str(), output);
break;
}
else
{
c = static_cast<wchar_t>(fgetwc_unlocked(input));
if(c == L'~')
{
wblankqueue.push(result);
return true;
}
else
{
result += c;
}
}
}
}
}

if(c != L']')
{
streamError();
}

return false;
}

int
FSTProcessor::readAnalysis(FILE *input)
{
Expand Down Expand Up @@ -425,7 +480,7 @@ FSTProcessor::readTMAnalysis(FILE *input)
}

int
FSTProcessor::readPostgeneration(FILE *input)
FSTProcessor::readPostgeneration(FILE *input, FILE *output)
{
if(!input_buffer.isEmpty())
{
Expand All @@ -434,6 +489,7 @@ FSTProcessor::readPostgeneration(FILE *input)

wchar_t val = static_cast<wchar_t>(fgetwc_unlocked(input));
int altval = 0;
is_wblank = false;
if(feof(input))
{
return 0;
Expand All @@ -451,17 +507,31 @@ FSTProcessor::readPostgeneration(FILE *input)

if(val == L'[')
{
blankqueue.push(readWblank(input));
if(collect_wblanks)
{
wblankqueue.push(readWblank(input));
is_wblank = true;
return static_cast<int>(L' ');
}
else if(wblankPostGen(input, output))
{
return static_cast<int>(L'~');
}
else
{
is_wblank = true;
return static_cast<int>(L' ');
}
}
else
{
ungetwc_unlocked(val, input);
blankqueue.push(readFullBlock(input, L'[', L']'));

input_buffer.add(static_cast<int>(L' '));
return static_cast<int>(L' ');
}

input_buffer.add(static_cast<int>(L' '));
return static_cast<int>(L' ');

case L'\\':
val = static_cast<wchar_t>(fgetwc_unlocked(input));
if(escaped_chars.find(val) == escaped_chars.end())
Expand Down Expand Up @@ -732,6 +802,59 @@ FSTProcessor::flushBlanks(FILE *output)
}
}

/**
 * Writes every wordbound blank held in wblankqueue to output, front first,
 * removing each one from the queue as it is written.
 *
 * @param output stream to write blanks
 */
void
FSTProcessor::flushWblanks(FILE *output)
{
  for(; !wblankqueue.empty(); wblankqueue.pop())
  {
    fputws_unlocked(wblankqueue.front().c_str(), output);
  }
}

/**
 * Drains wblankqueue and merges every opening blank that is followed by a
 * closing "[[/]]" into a single blank: the contents (without their [[ ]]
 * brackets) are joined with "; " and wrapped in one pair of [[ ]].
 *
 * An opening blank still waiting for its closing blank when the queue runs
 * out is pushed back onto wblankqueue. need_end_wblank is set to true
 * whenever a non-empty combined blank is returned.
 *
 * @return the combined wordbound blank, or an empty string if no complete
 *         opening/closing pair was queued
 */
wstring
FSTProcessor::combineWblanks()
{
  wstring final_wblank;
  wstring last_wblank;

  while(!wblankqueue.empty())
  {
    const wstring &current = wblankqueue.front();

    if(current == L"[[/]]")
    {
      // A closing blank with no pending opening blank (malformed input)
      // has nothing to contribute. Skipping it avoids calling substr()
      // past the end of an empty string, which throws std::out_of_range.
      if(last_wblank.size() >= 4)
      {
        if(final_wblank.empty())
        {
          final_wblank += L"[[";
        }
        else if(final_wblank.size() > 2)
        {
          // something besides the opening brackets is already present
          final_wblank += L"; ";
        }

        // add wblank without brackets [[..]]
        final_wblank.append(last_wblank, 2, last_wblank.size() - 4);
        last_wblank.clear();
      }
    }
    else
    {
      last_wblank = current;
    }

    wblankqueue.pop();
  }

  if(!last_wblank.empty())
  {
    // opening blank whose closing blank has not been read yet
    wblankqueue.push(last_wblank);
  }

  if(!final_wblank.empty())
  {
    final_wblank += L"]]";
    need_end_wblank = true;
  }

  return final_wblank;
}

void
FSTProcessor::calcInitial()
{
Expand Down Expand Up @@ -2093,36 +2216,65 @@ FSTProcessor::postgeneration(FILE *input, FILE *output)
}

bool skip_mode = true;
collect_wblanks = false;
need_end_wblank = false;
State current_state = initial_state;
wstring lf = L"";
wstring sf = L"";
int last = 0;
set<wchar_t> empty_escaped_chars;

while(wchar_t val = readPostgeneration(input))
while(wchar_t val = readPostgeneration(input, output))
{
if(val == L'~')
{
skip_mode = false;
collect_wblanks = true;
}

if(skip_mode)
if(is_wblank && skip_mode)
{
//do nothing
}
else if(skip_mode)
{
if(iswspace(val))
{
if(need_end_wblank)
{
fputws_unlocked(L"[[/]]", output);
need_end_wblank = false;
}

printSpace(val, output);
}
else
{
if(!need_end_wblank)
{
flushWblanks(output);
}

if(isEscaped(val))
{
fputwc_unlocked(L'\\', output);
}
fputwc_unlocked(val, output);

if(need_end_wblank)
{
fputws_unlocked(L"[[/]]", output);
need_end_wblank = false;
}
}
}
else
{
if(is_wblank)
{
continue;
}

// test for final states
if(current_state.isFinal(all_finals))
{
Expand Down Expand Up @@ -2199,6 +2351,9 @@ FSTProcessor::postgeneration(FILE *input, FILE *output)
}
else
{
wstring final_wblank = combineWblanks();
fputws_unlocked(final_wblank.c_str(), output);

if(lf == L"")
{
unsigned int mark = sf.size();
Expand Down Expand Up @@ -2239,11 +2394,12 @@ FSTProcessor::postgeneration(FILE *input, FILE *output)
fputwc_unlocked(val, output);
}
}

current_state = initial_state;
lf = L"";
sf = L"";
skip_mode = true;
collect_wblanks = false;
}
}
}
Expand All @@ -2269,7 +2425,7 @@ FSTProcessor::intergeneration(FILE *input, FILE *output)

while (true)
{
wchar_t val = readPostgeneration(input);
wchar_t val = readPostgeneration(input, output);

if (val == L'~')
{
Expand Down Expand Up @@ -2414,7 +2570,7 @@ FSTProcessor::transliteration(FILE *input, FILE *output)
wstring sf = L"";
int last = 0;

while(wchar_t val = readPostgeneration(input))
while(wchar_t val = readPostgeneration(input, output))
{
if(iswpunct(val) || iswspace(val))
{
Expand Down
46 changes: 45 additions & 1 deletion lttoolbox/fst_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,11 @@ class FSTProcessor
*/
queue<wstring> blankqueue;

/**
* Queue of wordbound blanks, used in reading methods
*/
queue<wstring> wblankqueue;

/**
* Set of characters being considered alphabetics
*/
Expand Down Expand Up @@ -226,6 +231,21 @@ class FSTProcessor
* Output no more than 'N' number of weighted analyses
*/
int maxAnalyses;

/**
* True if a wblank block ([[..]]xyz[[/]]) was just read
*/
bool is_wblank;

/**
* True if skip_mode is false and wordbound blanks need to be collected
*/
bool collect_wblanks;

/**
* True if a wblank has been processed for postgen and we need an ending wblank
*/
bool need_end_wblank;

/**
* Output no more than 'N' best weight classes
Expand Down Expand Up @@ -257,6 +277,14 @@ class FSTProcessor
* @param input the stream being read
*/
wstring readWblank(FILE *input);

/**
* Reads a wordbound blank (opening blank to closing blank) from the stream input -> [[...]]xyz[[/]]
* @param input the stream being read
* @param output the stream to write on
* @return true if the word enclosed by the wordbound blank has a ~ for postgeneration activation
*/
bool wblankPostGen(FILE *input, FILE *output);

/**
* Returns true if the character code is identified as alphabetic
Expand All @@ -282,27 +310,31 @@ class FSTProcessor
/**
* Read text from stream (decomposition version)
* @param input the stream to read
* @param output the stream to write on
* @return the next symbol in the stream
*/
int readDecomposition(FILE *input, FILE *output);

/**
* Read text from stream (postgeneration version)
* @param input the stream to read
* @param output the stream to write on
* @return the next symbol in the stream
*/
int readPostgeneration(FILE *input);
int readPostgeneration(FILE *input, FILE *output);

/**
* Read text from stream (generation version)
* @param input the stream to read
* @param output the stream being written to
* @return the next symbol in the stream
*/
int readGeneration(FILE *input, FILE *output);

/**
* Read text from stream (biltrans version)
* @param input the stream to read
* @param output the stream to write on
* @return the queue of 0-symbols, and the next symbol in the stream
*/
pair<wstring, int> readBilingual(FILE *input, FILE *output);
Expand All @@ -319,6 +351,18 @@ class FSTProcessor
* @param output stream to write blanks
*/
void flushBlanks(FILE *output);

/**
* Flush all the wordbound blanks remaining in the current process
* @param output stream to write blanks
*/
void flushWblanks(FILE *output);

/**
* Combine wordbound blanks in the queue and return them
* @return final wblank string
*/
wstring combineWblanks();

/**
* Calculate the initial state of parsing
Expand Down
Loading

0 comments on commit 3542261

Please sign in to comment.