diff --git a/src/FoLiA-2text.cxx b/src/FoLiA-2text.cxx index 563bc55..3fbc9d6 100644 --- a/src/FoLiA-2text.cxx +++ b/src/FoLiA-2text.cxx @@ -48,7 +48,9 @@ void usage( const string& name ){ cerr << "\t FoLiA-2text will produce a text from a FoLiA file, " << endl; cerr << "\t or a whole directory of FoLiA files " << endl; cerr << "\t-c OR --class='name'\t use 'name' as the folia class for nodes. (default is 'current')" << endl; - cerr << "\t--retaintok\t retain tokenization. Default is attempt to remove." << endl; + cerr << "\t--retaintok\t Retain tokenization. Default is attempt to remove." << endl; + cerr << "\t--restore-formatting \tAttempt to restore the original formatting." << endl; + cerr << "\t\t\t Will insert (soft-)hyphens and such." << endl; cerr << "\t-t 'threads' or\n\t--threads='threads' Number of threads to run on." << endl; cerr << "\t\t\t If 'threads' has the value \"max\", the number of threads is set to a" << endl; cerr << "\t\t\t reasonable value. (OMP_NUM_TREADS - 2)" << endl; @@ -70,7 +72,7 @@ UnicodeString handle_token_tag( const folia::FoliaElement *d, int main( int argc, char *argv[] ){ TiCC::CL_Options opts( "hVvpe:t:o:c:", "class:,help,version,retaintok,threads:," - // "hyphens," + "restore-formatting," "honour-tags,correction-handling:" ); try { opts.init(argc,argv); @@ -99,7 +101,7 @@ int main( int argc, char *argv[] ){ } opts.extract( 'o', outputPrefix ); bool retaintok = opts.extract( "retaintok" ); - // bool add_hyphens = opts.extract( "hyphens" ); + bool restore = opts.extract( "restore-formatting" ); bool honour_tags = opts.extract( "honour-tags" ); CORRECTION_HANDLING ch = CORRECTION_HANDLING::CURRENT; string handling; @@ -210,9 +212,9 @@ int main( int argc, char *argv[] ){ if ( retaintok ){ tp.set( folia::TEXT_FLAGS::RETAIN ); } - // if ( add_hyphens ){ - // tp.set( folia::TEXT_FLAGS::ADD_FORMATTING ); - // } + if ( restore ){ + tp.set( folia::TEXT_FLAGS::ADD_FORMATTING ); + } tp.set_correction_handling( ch ); tp.set_debug( 
verbosity > 0 ); if ( honour_tags ){ diff --git a/src/FoLiA-abby.cxx b/src/FoLiA-abby.cxx index 85cc9b3..b5960ca 100644 --- a/src/FoLiA-abby.cxx +++ b/src/FoLiA-abby.cxx @@ -362,14 +362,13 @@ void update_formatting_info( formatting_info& line_font, struct line_info { line_info(): - _line(0), - _spaces(0) + _line(0) {}; UnicodeString _value; formatting_info _fi; xmlNode *_line; UnicodeString _hyph; - int _spaces; + UnicodeString _spaces; }; void process_line( xmlNode *block, @@ -410,7 +409,7 @@ void process_line( xmlNode *block, UnicodeString tmp = uresult; tmp.trim(); if ( tmp.isEmpty() ){ - li._spaces = uresult.length(); + li._spaces = uresult; } line_parts.push_back( li ); } @@ -499,14 +498,19 @@ folia::TextMarkupStyle* make_style_content( const formatting_info& info, return content; } -void add_hspace( folia::FoliaElement *content ){ +void add_hspace( folia::FoliaElement *content, + const UnicodeString& value ){ //! insert a node to a FoliaElement /*! \param content the node to connect to */ folia::KWargs args; args["class"] = "space"; - content->add_child<folia::TextMarkupHSpace>( args ); + folia::FoliaElement *hs = content->add_child<folia::TextMarkupHSpace>( args ); + if ( !value.isEmpty() ){ + folia::XmlText *te = hs->add_child<folia::XmlText>(); + te->setvalue( TiCC::UnicodeToUTF8(value) ); + } } void add_value( folia::FoliaElement *content, @@ -516,20 +520,38 @@ void add_value( folia::FoliaElement *content, \param content the Folia to extend \param value the Unicode string to add this fuction will replace leading and trailing spaces by nodes - */ + */ if ( !value.isEmpty() ){ - bool begin_space = u_isspace( value[0] ); - bool end_space = u_isspace( value[value.length()-1] ); + UnicodeString start_spaces; + for ( int i=0;i < value.length(); ++i ){ + if ( u_isspace(value[i] ) ){ + start_spaces += value[i]; + } + else { + break; + } + } + UnicodeString end_spaces; + for ( int i=value.length()-1; i>0; --i ){ + if ( u_isspace(value[i] ) ){ + end_spaces = value[i] + end_spaces; + } + else { + break; + } + } + bool begin_space 
= !start_spaces.isEmpty(); + bool end_space = !end_spaces.isEmpty(); UnicodeString out = value; out.trim(); if ( begin_space ){ // represent ALL leading spaces as 1 TextMarkupHSpace - add_hspace( content ); + add_hspace( content, start_spaces ); } content->add_child<folia::XmlText>( TiCC::UnicodeToUTF8(out) ); if ( end_space ){ // represent ALL trailing spaces as 1 TextMarkupHSpace - add_hspace( content ); + add_hspace( content, end_spaces ); } } } @@ -643,21 +665,21 @@ bool process_paragraph( folia::Paragraph *paragraph, else { // a 'true' hyphen: add the value + // cerr << "HYPH= '" << it._hyph << "'" << endl; - folia::KWargs args; + add_value( content, value ); + folia::Hyphbreak *hb = content->add_child<folia::Hyphbreak>(); if ( it._hyph == "¬" || ( it._hyph == "-" && &it == &line_parts.back() ) ){ - args["class"] = TiCC::UnicodeToUTF8(it._hyph); + folia::XmlText *e = hb->add_child<folia::XmlText>(); + e->setvalue( TiCC::UnicodeToUTF8(it._hyph) ); previous_hyphen = true; } - add_value( content, value ); - content->add_child<folia::Hyphbreak>(args); // cerr << "content now: " << content << endl; no_break = true; } } - else if ( it._spaces > 0 ){ - add_hspace( content ); + else if ( !it._spaces.isEmpty() ){ + add_hspace( content, it._spaces ); } else { add_value( content, value ); diff --git a/src/FoLiA-txt.cxx b/src/FoLiA-txt.cxx index 6ee7263..4b49aa5 100644 --- a/src/FoLiA-txt.cxx +++ b/src/FoLiA-txt.cxx @@ -282,9 +282,9 @@ int main( int argc, char *argv[] ){ bool add_space = true; if ( !hyp.isEmpty() ){ // add an extra HyphBreak to the stack - folia::KWargs args; - args["class"] = TiCC::UnicodeToUTF8(hyp); - FoliaElement *hb = new folia::Hyphbreak(args,d); + FoliaElement *hb = new folia::Hyphbreak(); + XmlText *e = hb->add_child<XmlText>(); // create partial text + e->setvalue( TiCC::UnicodeToUTF8(hyp) ); par_stack.push_back( hb ); add_space = false; }