From f538b60d31049ec66199dc3c1c6c214f65c7ee9f Mon Sep 17 00:00:00 2001 From: Maarten van Gompel Date: Thu, 10 Dec 2020 19:40:12 +0100 Subject: [PATCH] working on a fix for problems with leading/trailing whitespace (proycon/folia#88) --- include/libfolia/folia_impl.h | 4 +++ src/folia_impl.cxx | 58 ++++++++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/include/libfolia/folia_impl.h b/include/libfolia/folia_impl.h index adf4f143..dc078411 100644 --- a/include/libfolia/folia_impl.h +++ b/include/libfolia/folia_impl.h @@ -2929,6 +2929,10 @@ namespace folia { std::string VersionName(); std::string Version(); + UnicodeString trim_space( const UnicodeString& in ); + UnicodeString ltrim( const UnicodeString& in ); + UnicodeString rtrim( const UnicodeString& in ); + } // namespace folia #endif diff --git a/src/folia_impl.cxx b/src/folia_impl.cxx index 586baf64..b9cc8dbd 100644 --- a/src/folia_impl.cxx +++ b/src/folia_impl.cxx @@ -1581,14 +1581,26 @@ namespace folia { return ""; } UnicodeString result; + int i = 0; for ( const auto& d : _data ){ - if ( d->printable() ){ + if (d->isinstance( XmlText_t)) { + if ((i == 0) && (i == (int) _data.size() -1)) { + result += rtrim(ltrim(d->text( cls ))); + } else if (i == 0) { + result += ltrim(d->text( cls )); + } else if (i == (int) _data.size() - 1) { + result += rtrim(d->text( cls )); + } else { + result += d->text( cls ); + } + } else if ( d->printable() ){ if ( !result.isEmpty() ){ const string& delim = d->get_delimiter( retaintok ); result += TiCC::UnicodeFromUTF8(delim); } - result += d->text( cls ); + result += d->text( cls ); } + i++; } #ifdef DEBUG_TEXT cerr << "TEXT(" << cls << ") on a textcontainer :" << xmltag() @@ -1751,20 +1763,20 @@ namespace folia { * \return an UnicodeString with all leading and trailing spaces removed. * Other 'whitespace' characters like newline and tab are retained! */ - UnicodeString cmp = " "; + const char16_t space = 0x0020; // cerr << "in = '" << in << "'" << endl; UnicodeString out; int i = 0; for( ; i < in.length(); ++i ){ // cerr << "start: bekijk:" << UnicodeString(in[i]) << endl; - if ( in[i] != cmp[0] ){ + if ( in[i] != space ){ break; } } int j = in.length()-1; for( ; j >= 0; --j ){ // cerr << "end: bekijk:" << UnicodeString(in[j]) << endl; - if ( in[j] != cmp[0] ){ + if ( in[j] != space ){ break; } } @@ -1779,6 +1791,42 @@ namespace folia { return out; } + UnicodeString ltrim( const UnicodeString& in ){ + /// remove leading whitespace (including newlines and tabs) + int begin = in.length(); + for (int i = 0; i < in.length(); i++) { + if ((in[i] != 0x0020) && (in[i] != 0x0009) && (in[i] != 0x000a) && (in[i] != 0x000d)) { + begin = i; + break; + } + } + if (begin == 0) { + return in; + } else if (begin == in.length()) { + return ""; + } else { + return UnicodeString(in, begin, in.length() - begin); + } + } + + UnicodeString rtrim( const UnicodeString& in ){ + /// remove trailing whitespace (including newlines and tabs) + int end = -1; + for (int i = in.length() - 1; i >= 0; i--) { + if ((in[i] != 0x0020) && (in[i] != 0x0009) && (in[i] != 0x000a) && (in[i] != 0x000d)) { + end = i; + break; + } + } + if (end == in.length()) { + return in; + } else if (end == -1) { + return ""; + } else { + return UnicodeString(in, 0, end+1); + } + } + bool check_end( const UnicodeString& us, bool& only ){ /// check for newline characters at the end /*!