Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wordbound blank handling in post generation #102

Merged
merged 5 commits into from
Jul 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
176 changes: 166 additions & 10 deletions lttoolbox/fst_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,61 @@ FSTProcessor::readWblank(FILE *input)
return result;
}

/**
 * Reads a wordbound blank, and the text that follows it, for postgeneration.
 *
 * The text read is accumulated in a buffer seeded with "[[" (presumably the
 * caller has already consumed those two characters -- confirm against
 * readPostgeneration). Reading stops in one of two ways:
 *  - a "]]" is immediately followed by '~': the accumulated text is pushed on
 *    wblankqueue and true is returned (the '~' itself is consumed, not stored);
 *  - the accumulated text ends in the closing blank "[[/]]": everything read
 *    so far is written to output and false is returned.
 * If the input ends first, streamError() is called.
 *
 * @param input the stream being read
 * @param output the stream to write on
 * @return true if a '~' directly follows the "]]" of a wordbound blank
 */
bool
FSTProcessor::wblankPostGen(FILE *input, FILE *output)
{
  wstring result = L"[[";
  wchar_t c = 0;

  while(!feof(input))
  {
    c = static_cast<wchar_t>(fgetwc_unlocked(input));
    result += c;

    if(c == L'\\')
    {
      // The backslash was already appended above; only the escaped character
      // is added here so that "\x" is reproduced as "\x" and not as "\\x".
      result += static_cast<wchar_t>(fgetwc_unlocked(input));
    }
    else if(c == L']')
    {
      c = static_cast<wchar_t>(fgetwc_unlocked(input));
      result += c;

      if(c == L']')
      {
        // The buffer now ends in "]]". Only look at the five-character suffix
        // when there are at least five characters, otherwise a short blank
        // such as "[[]]" would index before the start of the string.
        const size_t resultlen = result.size();
        const bool ends_with_closing_blank =
          resultlen >= 5 && result.compare(resultlen - 5, 5, L"[[/]]") == 0;

        if(ends_with_closing_blank) //ending blank [[/]]
        {
          fputws(result.c_str(), output);
          break;
        }
        else
        {
          c = static_cast<wchar_t>(fgetwc_unlocked(input));
          if(c == L'~')
          {
            wblankqueue.push(result);
            return true;
          }
          else
          {
            result += c;
          }
        }
      }
    }
  }

  // Leaving the loop through the break above leaves c == ']'; any other value
  // means the input ended before a closing blank was found.
  if(c != L']')
  {
    streamError();
  }

  return false;
}

int
FSTProcessor::readAnalysis(FILE *input)
{
Expand Down Expand Up @@ -425,7 +480,7 @@ FSTProcessor::readTMAnalysis(FILE *input)
}

int
FSTProcessor::readPostgeneration(FILE *input)
FSTProcessor::readPostgeneration(FILE *input, FILE *output)
{
if(!input_buffer.isEmpty())
{
Expand All @@ -434,6 +489,7 @@ FSTProcessor::readPostgeneration(FILE *input)

wchar_t val = static_cast<wchar_t>(fgetwc_unlocked(input));
int altval = 0;
is_wblank = false;
if(feof(input))
{
return 0;
Expand All @@ -451,17 +507,31 @@ FSTProcessor::readPostgeneration(FILE *input)

if(val == L'[')
{
blankqueue.push(readWblank(input));
if(collect_wblanks)
{
wblankqueue.push(readWblank(input));
is_wblank = true;
return static_cast<int>(L' ');
}
else if(wblankPostGen(input, output))
{
return static_cast<int>(L'~');
}
else
{
is_wblank = true;
return static_cast<int>(L' ');
}
}
else
{
ungetwc_unlocked(val, input);
blankqueue.push(readFullBlock(input, L'[', L']'));

input_buffer.add(static_cast<int>(L' '));
return static_cast<int>(L' ');
}

input_buffer.add(static_cast<int>(L' '));
return static_cast<int>(L' ');

case L'\\':
val = static_cast<wchar_t>(fgetwc_unlocked(input));
if(escaped_chars.find(val) == escaped_chars.end())
Expand Down Expand Up @@ -732,6 +802,59 @@ FSTProcessor::flushBlanks(FILE *output)
}
}

void
FSTProcessor::flushWblanks(FILE *output)
{
while(wblankqueue.size() > 0)
{
fputws_unlocked(wblankqueue.front().c_str(), output);
wblankqueue.pop();
}
}

/**
 * Drains wblankqueue and merges the wordbound blanks that were closed by a
 * "[[/]]" entry into a single blank.
 *
 * Each "[[/]]" entry closes the entry queued just before it; the contents of
 * that entry (without its surrounding "[[" and "]]") are appended to the
 * combined blank, separated by "; ". A "[[/]]" with no entry before it (at
 * the front of the queue, or directly after another "[[/]]") has nothing to
 * close and is dropped instead of being passed to substr(), which would throw
 * std::out_of_range on the empty string.
 *
 * If the last entry in the queue was never closed it is pushed back so that
 * it stays queued. When the combined blank is non-empty, need_end_wblank is
 * set to true.
 *
 * @return the combined blank "[[a; b; ...]]", or an empty string if no
 *         queued blank was closed
 */
wstring
FSTProcessor::combineWblanks()
{
  wstring final_wblank;
  wstring last_wblank;

  while(!wblankqueue.empty())
  {
    if(wblankqueue.front() == L"[[/]]")
    {
      // Need at least "[[" + "]]" to be able to strip the brackets.
      if(last_wblank.size() >= 4)
      {
        if(final_wblank.empty())
        {
          final_wblank += L"[[";
        }
        else if(final_wblank.size() > 2)
        {
          // Something other than the opening "[[" is already present.
          final_wblank += L"; ";
        }

        final_wblank += last_wblank.substr(2, last_wblank.size() - 4); //add wblank without brackets [[..]]
      }
      last_wblank.clear();
    }
    else
    {
      last_wblank = wblankqueue.front();
    }
    wblankqueue.pop();
  }

  if(!last_wblank.empty())
  {
    // Still open: keep it queued for later.
    wblankqueue.push(last_wblank);
  }

  if(!final_wblank.empty())
  {
    final_wblank += L"]]";
    need_end_wblank = true;
  }

  return final_wblank;
}

void
FSTProcessor::calcInitial()
{
Expand Down Expand Up @@ -2093,36 +2216,65 @@ FSTProcessor::postgeneration(FILE *input, FILE *output)
}

bool skip_mode = true;
collect_wblanks = false;
need_end_wblank = false;
State current_state = initial_state;
wstring lf = L"";
wstring sf = L"";
int last = 0;
set<wchar_t> empty_escaped_chars;

while(wchar_t val = readPostgeneration(input))
while(wchar_t val = readPostgeneration(input, output))
{
if(val == L'~')
{
skip_mode = false;
collect_wblanks = true;
}

if(skip_mode)
if(is_wblank && skip_mode)
{
//do nothing
}
else if(skip_mode)
{
if(iswspace(val))
{
if(need_end_wblank)
{
fputws_unlocked(L"[[/]]", output);
need_end_wblank = false;
}

printSpace(val, output);
}
else
{
if(!need_end_wblank)
{
flushWblanks(output);
}

if(isEscaped(val))
{
fputwc_unlocked(L'\\', output);
}
fputwc_unlocked(val, output);

if(need_end_wblank)
{
fputws_unlocked(L"[[/]]", output);
need_end_wblank = false;
}
}
}
else
{
if(is_wblank)
{
continue;
}

// test for final states
if(current_state.isFinal(all_finals))
{
Expand Down Expand Up @@ -2199,6 +2351,9 @@ FSTProcessor::postgeneration(FILE *input, FILE *output)
}
else
{
wstring final_wblank = combineWblanks();
fputws_unlocked(final_wblank.c_str(), output);

if(lf == L"")
{
unsigned int mark = sf.size();
Expand Down Expand Up @@ -2239,11 +2394,12 @@ FSTProcessor::postgeneration(FILE *input, FILE *output)
fputwc_unlocked(val, output);
}
}

current_state = initial_state;
lf = L"";
sf = L"";
skip_mode = true;
collect_wblanks = false;
}
}
}
Expand All @@ -2269,7 +2425,7 @@ FSTProcessor::intergeneration(FILE *input, FILE *output)

while (true)
{
wchar_t val = readPostgeneration(input);
wchar_t val = readPostgeneration(input, output);

if (val == L'~')
{
Expand Down Expand Up @@ -2414,7 +2570,7 @@ FSTProcessor::transliteration(FILE *input, FILE *output)
wstring sf = L"";
int last = 0;

while(wchar_t val = readPostgeneration(input))
while(wchar_t val = readPostgeneration(input, output))
{
if(iswpunct(val) || iswspace(val))
{
Expand Down
46 changes: 45 additions & 1 deletion lttoolbox/fst_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,11 @@ class FSTProcessor
*/
queue<wstring> blankqueue;

/**
* Queue of wordbound blanks, used in reading methods
*/
queue<wstring> wblankqueue;

/**
* Set of characters being considered alphabetics
*/
Expand Down Expand Up @@ -226,6 +231,21 @@ class FSTProcessor
* Output no more than 'N' number of weighted analyses
*/
int maxAnalyses;

/**
* True if a wblank block ([[..]]xyz[[/]]) was just read
*/
bool is_wblank;

/**
* True if skip_mode is false and wblanks need to be collected
*/
bool collect_wblanks;

/**
* True if a wblank has been processed for postgen and we need an ending wblank
*/
bool need_end_wblank;

/**
* Output no more than 'N' best weight classes
Expand Down Expand Up @@ -257,6 +277,14 @@ class FSTProcessor
* @param input the stream being read
*/
wstring readWblank(FILE *input);

/**
* Reads a wordbound blank (opening blank to closing blank) from the stream input -> [[...]]xyz[[/]]
* @param input the stream being read
* @param output the stream to write on
* @return true if the word enclosed by the wordbound blank has a ~ for postgeneration activation
*/
bool wblankPostGen(FILE *input, FILE *output);

/**
* Returns true if the character code is identified as alphabetic
Expand All @@ -282,27 +310,31 @@ class FSTProcessor
/**
* Read text from stream (decomposition version)
* @param input the stream to read
* @param output the stream to write on
* @return the next symbol in the stream
*/
int readDecomposition(FILE *input, FILE *output);

/**
* Read text from stream (postgeneration version)
* @param input the stream to read
* @param output the stream to write on
* @return the next symbol in the stream
*/
int readPostgeneration(FILE *input);
int readPostgeneration(FILE *input, FILE *output);

/**
* Read text from stream (generation version)
* @param input the stream to read
* @param output the stream being written to
* @return the next symbol in the stream
*/
int readGeneration(FILE *input, FILE *output);

/**
* Read text from stream (biltrans version)
* @param input the stream to read
* @param output the stream to write on
* @return the queue of 0-symbols, and the next symbol in the stream
*/
pair<wstring, int> readBilingual(FILE *input, FILE *output);
Expand All @@ -319,6 +351,18 @@ class FSTProcessor
* @param output stream to write blanks
*/
void flushBlanks(FILE *output);

/**
* Flush all the wordbound blanks remaining in the current process
* @param output stream to write blanks
*/
void flushWblanks(FILE *output);

/**
* Combine wordbound blanks in the queue and return them
* @return final wblank string
*/
wstring combineWblanks();

/**
* Calculate the initial state of parsing
Expand Down
Loading