Skip to content

Commit

Permalink
Fix #129 – don't drop 32-bit chars down to 16-bit in postgen
Browse files Browse the repository at this point in the history
  • Loading branch information
unhammer committed Feb 23, 2022
1 parent 76623b6 commit fcc9f0f
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 6 deletions.
10 changes: 5 additions & 5 deletions lttoolbox/fst_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ FSTProcessor::readTMAnalysis(InputFile& input)
return val;
}

int
int32_t
FSTProcessor::readPostgeneration(InputFile& input, UFILE *output)
{
if(!input_buffer.isEmpty())
Expand Down Expand Up @@ -903,7 +903,7 @@ FSTProcessor::lastBlank(UString const &str)
}

void
FSTProcessor::printSpace(UChar const val, UFILE *output)
FSTProcessor::printSpace(UChar32 const val, UFILE *output)
{
if(blankqueue.size() > 0)
{
Expand Down Expand Up @@ -1803,7 +1803,7 @@ FSTProcessor::postgeneration(InputFile& input, UFILE *output)
int last = 0;
set<UChar32> empty_escaped_chars;

while(UChar val = readPostgeneration(input, output))
while(UChar32 val = readPostgeneration(input, output))
{
if(val == '~')
{
Expand Down Expand Up @@ -2027,7 +2027,7 @@ FSTProcessor::intergeneration(InputFile& input, UFILE *output)

while (true)
{
UChar val = readPostgeneration(input, output);
UChar32 val = readPostgeneration(input, output);

if (val == '~')
{
Expand Down Expand Up @@ -2165,7 +2165,7 @@ FSTProcessor::transliteration(InputFile& input, UFILE *output)
UString sf;
int last = 0;

while(UChar val = readPostgeneration(input, output))
while(UChar32 val = readPostgeneration(input, output))
{
if(u_ispunct(val) || u_isspace(val))
{
Expand Down
2 changes: 1 addition & 1 deletion lttoolbox/fst_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,7 @@ class FSTProcessor
* @param val the space character to use if no blank queue
* @param output stream where the word is written
*/
void printSpace(UChar const val, UFILE *output);
void printSpace(UChar32 const val, UFILE *output);

void skipUntil(InputFile& input, UFILE *output, UChar32 const character);
static UString removeTags(UString const &str);
Expand Down
7 changes: 7 additions & 0 deletions tests/lt_proc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,5 +254,12 @@ class AlphabeticMultibyteTest(ProcTest):
expectedOutputs = ["^𝜊/*𝜊$"]


# Checks that a character outside the Basic Multilingual Plane passes through
# the processor unchanged (the expected output equals the input).
# NOTE(review): '-p' presumably selects postgeneration mode and '-z' null-flush
# handling in lt-proc — confirm against the lt-proc manual.
class AlphabeticMultibyteTestPost(ProcTest):
procdix = "data/minimal-mono.dix"
inputs = ["𝜊"] # code point >65535: four bytes in UTF-8, two 16-bit units (a surrogate pair) in UTF-16; isAlphabetic
procflags = ['-z', '-p']
expectedOutputs = ["𝜊"]


# These fail on some systems:
#from null_flush_invalid_stream_format import *

0 comments on commit fcc9f0f

Please sign in to comment.