diff --git a/core/src/core_icu.h b/core/src/core_icu.h index edfc2098024..20b13b04567 100644 --- a/core/src/core_icu.h +++ b/core/src/core_icu.h @@ -33,6 +33,10 @@ #include "unicode/utypes.h" #include "unicode/unistr.h" #include "unicode/normalizer2.h" +#include "unicode/uniset.h" +#include "unicode/usetiter.h" +#include "unicode/regex.h" +#include "unicode/utext.h" #include "keyman_core.h" #include "debuglog.h" diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 42953c604e7..8e37011bda4 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -1,5 +1,3 @@ -// TEMP -#define KMN_NO_ICU 0 /* Copyright: © SIL International. Description: This is an implementation of the LDML keyboard spec 3.0. @@ -439,20 +437,14 @@ reorder_group::apply(std::u32string &str) const { } transform_entry::transform_entry(const transform_entry &other) - : fFrom(other.fFrom), fTo(other.fTo), fFromPattern(nullptr), fMapFromStrId(other.fMapFromStrId), + : fFrom(other.fFrom), fTo(other.fTo), fFromPattern(other.fFromPattern), fMapFromStrId(other.fMapFromStrId), fMapToStrId(other.fMapToStrId), fMapFromList(other.fMapFromList), fMapToList(other.fMapToList), normalization_disabled(other.normalization_disabled) { - if (other.fFromPattern) { - // clone pattern - fFromPattern.reset(other.fFromPattern->clone()); - } } transform_entry::transform_entry(const std::u32string &from, const std::u32string &to) - : fFrom(from), fTo(to), fFromPattern(nullptr), fMapFromStrId(), fMapToStrId(), fMapFromList(), fMapToList(), normalization_disabled(false) { + : fFrom(from), fTo(to), fFromPattern(from), fMapFromStrId(), fMapToStrId(), fMapFromList(), fMapToList(), normalization_disabled(false) { assert(!fFrom.empty()); - - init(); } transform_entry::transform_entry( @@ -463,7 +455,7 @@ transform_entry::transform_entry( const kmx::kmx_plus &kplus, bool &valid, bool norm_disabled) - : fFrom(from), fTo(to), fFromPattern(nullptr), fMapFromStrId(mapFrom), fMapToStrId(mapTo), normalization_disabled(norm_disabled) { + : fFrom(from), fTo(to), fFromPattern(), fMapFromStrId(mapFrom), fMapToStrId(mapTo), normalization_disabled(norm_disabled) { if (!valid) return; // exit early assert(!fFrom.empty()); // TODO-LDML: should not happen? @@ -471,7 +463,12 @@ transform_entry::transform_entry( assert(kplus.strs != nullptr); assert(kplus.vars != nullptr); assert(kplus.elem != nullptr); - if(!init()) { + std::u32string from2 = fFrom; + if (!normalization_disabled) { + // normalize, including markers, for regex + normalize_nfd_markers(from2, regex_sentinel); + } + if (!fFromPattern.init(from2)) { valid = false; } @@ -508,160 +505,17 @@ transform_entry::transform_entry( } } -bool -transform_entry::init() { - if (fFrom.empty()) { - return false; - } - // TODO-LDML: if we have mapFrom, may need to do other processing. - std::u32string from2 = fFrom; - if (!normalization_disabled) { - // normalize, including markers, for regex - normalize_nfd_markers(from2, regex_sentinel); - } - std::u16string patstr = km::core::kmx::u32string_to_u16string(from2); - UErrorCode status = U_ZERO_ERROR; - /* const */ icu::UnicodeString patustr = icu::UnicodeString(patstr.data(), (int32_t)patstr.length()); - // add '$' to match to end - patustr.append(u'$'); // TODO-LDML: may need to escape some markers. Marker #91 will look like a `[` to the pattern - fFromPattern.reset(icu::RegexPattern::compile(patustr, 0, status)); - return (UASSERT_SUCCESS(status)); -} - size_t transform_entry::apply(const std::u32string &input, std::u32string &output) const { - assert(fFromPattern); - // TODO-LDML: Really? can't go from u32 to UnicodeString? - // TODO-LDML: Also, we could cache the u16 string at the transformGroup level or higher. - UErrorCode status = U_ZERO_ERROR; - const std::u16string matchstr = km::core::kmx::u32string_to_u16string(input); - icu::UnicodeString matchustr = icu::UnicodeString(matchstr.data(), (int32_t)matchstr.length()); - // TODO-LDML: create a new Matcher every time. These could be cached and reset. - std::unique_ptr matcher(fFromPattern->matcher(matchustr, status)); - if (!UASSERT_SUCCESS(status)) { - return 0; // TODO-LDML: return error - } - - if (!matcher->find(status)) { // i.e. matches somewhere, in this case at end of str - return 0; // no match - } - - // TODO-LDML: this is UTF-16 len, not UTF-32 len!! - // TODO-LDML: if we had an underlying UText this would be simpler. - int32_t matchStart = matcher->start(status); - int32_t matchEnd = matcher->end(status); - if (!UASSERT_SUCCESS(status)) { - return 0; // TODO-LDML: return error - } - // extract.. - const icu::UnicodeString substr = matchustr.tempSubStringBetween(matchStart, matchEnd); - // preflight to UTF-32 to get length - UErrorCode substrStatus = U_ZERO_ERROR; // throwaway status - // we need the UTF-32 matchLen for our return. - auto matchLen = substr.toUTF32(nullptr, 0, substrStatus); - - // should have matched something. - assert(matchLen > 0); - - // now, do the replace. - - /** this is the 'to' or other replacement string.*/ - icu::UnicodeString rustr; - if (fMapFromStrId == 0) { - // Normal case: not a map. - // This replace will apply $1, $2 etc. - // Convert the fTo into u16 TODO-LDML (we could cache this?) - const std::u16string rstr = km::core::kmx::u32string_to_u16string(fTo); - rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length()); - } else { - // Set map case: mapping from/to - - // we actually need the group(1) string here. - // this is only the content in parenthesis () - icu::UnicodeString group1 = matcher->group(1, status); - if (!UASSERT_SUCCESS(status)) { - // TODO-LDML: could be a malformed from pattern - return 0; // TODO-LDML: return error - } - // now, how long is group1 in UTF-32, hmm? - UErrorCode preflightStatus = U_ZERO_ERROR; // throwaway status - auto group1Len = group1.toUTF32(nullptr, 0, preflightStatus); - char32_t *s = new char32_t[group1Len + 1]; - assert(s != nullptr); // TODO-LDML: OOM - // convert - group1.toUTF32((UChar32 *)s, group1Len + 1, status); - if (!UASSERT_SUCCESS(status)) { - return 0; // TODO-LDML: memory issue - } - std::u32string match32(s, group1Len); // taken from just group1 - // clean up buffer - delete [] s; - - // Now we're ready to do the actual mapping. - - // 1., we need to find the index in the source set. - auto matchIndex = findIndexFrom(match32); - assert(matchIndex != -1L); // TODO-LDML: not matching shouldn't happen, the regex wouldn't have matched. - // we already asserted on load that the from and to sets have the same cardinality. - - // 2. get the target string, convert to utf-16 - // we use the same matchIndex that was just found - const std::u16string rstr = km::core::kmx::u32string_to_u16string(fMapToList.at(matchIndex)); - - // 3. update the UnicodeString for replacement - rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length()); - // and we return to the regular code flow. - } - // here we replace the match output. No normalization, yet. - icu::UnicodeString entireOutput = matcher->replaceFirst(rustr, status); - if (!UASSERT_SUCCESS(status)) { - // TODO-LDML: could fail here due to bad input (syntax err) - return 0; - } - // entireOutput includes all of 'input', but modified. Need to substring it. - icu::UnicodeString outu = entireOutput.tempSubString(matchStart); - - // Special case if there's no output, save some allocs - if (outu.length() == 0) { - output.clear(); - } else { - // TODO-LDML: All we are trying to do is to extract the output string. Probably too many steps. - UErrorCode preflightStatus = U_ZERO_ERROR; - // calculate how big the buffer is - auto out32len = outu.toUTF32(nullptr, 0, preflightStatus); // preflightStatus will be an err, because we know the buffer overruns zero bytes - // allocate - std::unique_ptr s(new char32_t[out32len + 1]); - assert(s); - if (!s) { - return 0; // TODO-LDML: allocation failed - } - // convert - outu.toUTF32((UChar32 *)(s.get()), out32len + 1, status); - if (!UASSERT_SUCCESS(status)) { - return 0; // TODO-LDML: memory issue - } - output.assign(s.get(), out32len); - // NOW do a marker-safe normalize - if (!normalization_disabled && !normalize_nfd_markers(output)) { + auto result = fFromPattern.apply(input, output, fTo, fMapFromList, fMapToList); + // NOW do a marker-safe normalize + if (result != 0 && !output.empty() && !normalization_disabled) { + if (!normalize_nfd_markers(output)) { DebugLog("normalize_nfd_markers(output) failed"); - return 0; // TODO-LDML: normalization failed. - } - } - return matchLen; -} - -int32_t transform_entry::findIndexFrom(const std::u32string &match) const { - return findIndex(match, fMapFromList); -} - -int32_t transform_entry::findIndex(const std::u32string &match, const std::deque list) { - int32_t index = 0; - for(auto e = list.begin(); e < list.end(); e++, index++) { - if (match == *e) { - return index; + return 0; // TODO-LDML: normalization failed. } } - return -1; // not found + return result; } any_group::any_group(const transform_group &g) : type(any_group_type::transform), transform(g), reorder() { diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index 1b56a762468..90dd5f5e43f 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -16,11 +16,7 @@ #include #include "debuglog.h" -#include "core_icu.h" -#include "unicode/uniset.h" -#include "unicode/usetiter.h" -#include "unicode/regex.h" -#include "unicode/utext.h" +#include "util_regex.hpp" namespace km { namespace core { @@ -111,14 +107,12 @@ class transform_entry { private: const std::u32string fFrom; const std::u32string fTo; - std::unique_ptr fFromPattern; + km::core::util::km_regex fFromPattern; const KMX_DWORD fMapFromStrId; const KMX_DWORD fMapToStrId; std::deque fMapFromList; std::deque fMapToList; - /** Internal function to setup pattern string @returns true on success */ - bool init(); bool normalization_disabled; /** @returns the index of the item in the fMapFromList list, or -1 */ int32_t findIndexFrom(const std::u32string &match) const; diff --git a/core/src/meson.build b/core/src/meson.build index a4160842eba..b7b1ae78f5e 100644 --- a/core/src/meson.build +++ b/core/src/meson.build @@ -83,6 +83,7 @@ kmx_files = files( 'km_core_processevent_api.cpp', 'jsonpp.cpp', 'util_normalize.cpp', + 'util_regex.cpp', 'core_icu.cpp', 'ldml/ldml_processor.cpp', 'ldml/ldml_transforms.cpp', diff --git a/core/src/util_regex.cpp b/core/src/util_regex.cpp new file mode 100644 index 00000000000..f6c08019974 --- /dev/null +++ b/core/src/util_regex.cpp @@ -0,0 +1,223 @@ +/* + Copyright: © SIL International. + Description: Core Regex Utilities - abstract out ICU dependencies + Create Date: 5 Jun 2024 + Authors: Steven R. Loomis +*/ + +#include "util_regex.hpp" + +#include "core_icu.h" +#include "kmx/kmx_xstring.h" + + +namespace km { +namespace core { +namespace util { + +/** find the */ +int32_t km_regex::findIndex(const std::u32string &match, const std::deque &list) { + int32_t index = 0; + for(auto e = list.begin(); e < list.end(); e++, index++) { + if (match == *e) { + return index; + } + } + return -1; // not found +} + +km_regex::km_regex() +#if KMN_NO_ICU +#else + : fPattern(nullptr) +#endif +{ + +} + + +km_regex::km_regex(const km_regex& other) +#if KMN_NO_ICU +#else + : fPattern(nullptr) +#endif +{ +#if KMN_NO_ICU + +#else + if (other.fPattern) { + // clone pattern + fPattern.reset(other.fPattern->clone()); + } +#endif +} + +km_regex::km_regex(const std::u32string &pattern) +#if KMN_NO_ICU +#else + : fPattern(nullptr) +#endif +{ + init(pattern); +} + +km_regex::~km_regex() { + +} + +bool km_regex::valid() const { +#if KMN_NO_ICU +#error todo +#else + // valid if fPattern is present. + return !!fPattern; +#endif +} + +bool km_regex::init(const std::u32string &pattern) { +#if KMN_NO_ICU +#error todo +#else + if (pattern.empty()) { + return false; + } + // TODO-LDML: if we have mapFrom, may need to do other processing. + std::u16string patstr = km::core::kmx::u32string_to_u16string(pattern); + UErrorCode status = U_ZERO_ERROR; + /* const */ icu::UnicodeString patustr = icu::UnicodeString(patstr.data(), (int32_t)patstr.length()); + // add '$' to match to end + patustr.append(u'$'); // TODO-LDML: may need to escape some markers. Marker #91 will look like a `[` to the pattern + fPattern.reset(icu::RegexPattern::compile(patustr, 0, status)); + return (UASSERT_SUCCESS(status)); +#endif +} + +size_t km_regex::apply(const std::u32string &input, std::u32string &output, + const std::u32string &to, + const std::deque &fromList, + const std::deque &toList ) const { +#if KMN_NO_ICU +#error TODO +#else + assert(fPattern); + // TODO-LDML: Really? can't go from u32 to UnicodeString? + // TODO-LDML: Also, we could cache the u16 string at the transformGroup level or higher. + UErrorCode status = U_ZERO_ERROR; + const std::u16string matchstr = km::core::kmx::u32string_to_u16string(input); + icu::UnicodeString matchustr = icu::UnicodeString(matchstr.data(), (int32_t)matchstr.length()); + // TODO-LDML: create a new Matcher every time. These could be cached and reset. + std::unique_ptr matcher(fPattern->matcher(matchustr, status)); + if (!UASSERT_SUCCESS(status)) { + return 0; // TODO-LDML: return error + } + + if (!matcher->find(status)) { // i.e. matches somewhere, in this case at end of str + return 0; // no match + } + + // TODO-LDML: this is UTF-16 len, not UTF-32 len!! + // TODO-LDML: if we had an underlying UText this would be simpler. + int32_t matchStart = matcher->start(status); + int32_t matchEnd = matcher->end(status); + if (!UASSERT_SUCCESS(status)) { + return 0; // TODO-LDML: return error + } + // extract.. + const icu::UnicodeString substr = matchustr.tempSubStringBetween(matchStart, matchEnd); + // preflight to UTF-32 to get length + UErrorCode substrStatus = U_ZERO_ERROR; // throwaway status + // we need the UTF-32 matchLen for our return. + auto matchLen = substr.toUTF32(nullptr, 0, substrStatus); + + // should have matched something. + assert(matchLen > 0); + + + // now, do the replace. + + /** this is the 'to' or other replacement string.*/ + icu::UnicodeString rustr; + if (fromList.empty()) { + // Normal case: not a map. + // This replace will apply $1, $2 etc. + // Convert the fTo into u16 TODO-LDML (we could cache this?) + const std::u16string rstr = km::core::kmx::u32string_to_u16string(to); + rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length()); + } else { + // Set map case: mapping from/to + + // we actually need the group(1) string here. + // this is only the content in parenthesis () + icu::UnicodeString group1 = matcher->group(1, status); + if (!UASSERT_SUCCESS(status)) { + // TODO-LDML: could be a malformed from pattern + return 0; // TODO-LDML: return error + } + // now, how long is group1 in UTF-32, hmm? + UErrorCode preflightStatus = U_ZERO_ERROR; // throwaway status + auto group1Len = group1.toUTF32(nullptr, 0, preflightStatus); + char32_t *s = new char32_t[group1Len + 1]; + assert(s != nullptr); // TODO-LDML: OOM + // convert + group1.toUTF32((UChar32 *)s, group1Len + 1, status); + if (!UASSERT_SUCCESS(status)) { + return 0; // TODO-LDML: memory issue + } + std::u32string match32(s, group1Len); // taken from just group1 + // clean up buffer + delete [] s; + + // Now we're ready to do the actual mapping. + + // 1., we need to find the index in the source set. + auto matchIndex = findIndex(match32, fromList); + assert(matchIndex != -1L); // TODO-LDML: not matching shouldn't happen, the regex wouldn't have matched. + // we already asserted on load that the from and to sets have the same cardinality. + + // 2. get the target string, convert to utf-16 + // we use the same matchIndex that was just found + const std::u16string rstr = km::core::kmx::u32string_to_u16string(toList.at(matchIndex)); + + // 3. update the UnicodeString for replacement + rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length()); + // and we return to the regular code flow. + } + // here we replace the match output. No normalization, yet. + icu::UnicodeString entireOutput = matcher->replaceFirst(rustr, status); + if (!UASSERT_SUCCESS(status)) { + // TODO-LDML: could fail here due to bad input (syntax err) + return 0; + } + // entireOutput includes all of 'input', but modified. Need to substring it. + icu::UnicodeString outu = entireOutput.tempSubString(matchStart); + + // Special case if there's no output, save some allocs + if (outu.length() == 0) { + output.clear(); + } else { + // TODO-LDML: All we are trying to do is to extract the output string. Probably too many steps. + UErrorCode preflightStatus = U_ZERO_ERROR; + // calculate how big the buffer is + auto out32len = outu.toUTF32(nullptr, 0, preflightStatus); // preflightStatus will be an err, because we know the buffer overruns zero bytes + // allocate + std::unique_ptr s(new char32_t[out32len + 1]); + assert(s); + if (!s) { + return 0; // TODO-LDML: allocation failed + } + // convert + outu.toUTF32((UChar32 *)(s.get()), out32len + 1, status); + if (!UASSERT_SUCCESS(status)) { + return 0; // TODO-LDML: memory issue + } + output.assign(s.get(), out32len); + } + return matchLen; + +#endif +} + + +} +} +} diff --git a/core/src/util_regex.hpp b/core/src/util_regex.hpp new file mode 100644 index 00000000000..8d155815b78 --- /dev/null +++ b/core/src/util_regex.hpp @@ -0,0 +1,48 @@ +/* + Copyright: © SIL International. + Description: Normalization and Regex utilities + Create Date: 23 May 2024 + Authors: Steven R. Loomis +*/ + +#pragma once + +#include "core_icu.h" +#include "keyman_core.h" +#include +#include + +namespace km { +namespace core { +namespace util { + +class km_regex { +public: + km_regex(); + km_regex(const km_regex &other); + km_regex(const std::u32string &pattern); + ~km_regex(); + bool init(const std::u32string &pattern); + + size_t apply( + const std::u32string &input, + std::u32string &output, + const std::u32string &to, + const std::deque &fromList, + const std::deque &toList) const; + + bool valid() const; +private: +#if KMN_NO_ICU + void *stuff; +#else + std::unique_ptr fPattern; +#endif +// utility functions + public: + static int32_t findIndex(const std::u32string &match, const std::deque &list); +}; + +} // namespace util +} // namespace core +} // namespace km diff --git a/core/tests/unit/ldml/test_transforms.cpp b/core/tests/unit/ldml/test_transforms.cpp index 94c53f38452..bf95ba31156 100644 --- a/core/tests/unit/ldml/test_transforms.cpp +++ b/core/tests/unit/ldml/test_transforms.cpp @@ -1,5 +1,6 @@ #include "../../../src/ldml/ldml_markers.hpp" #include "../../../src/ldml/ldml_transforms.hpp" +#include "../../../src/util_regex.hpp" #include "kmx/kmx_plus.h" #include "kmx/kmx_xstring.h" #include "test_color.h" @@ -624,16 +625,16 @@ test_map() { std::cout << __FILE__ << ":" << __LINE__ << " transform_entry::findIndex" << std::endl; { std::deque list; - assert_equal(transform_entry::findIndex(U"Does Not Exist", list), -1); + assert_equal(km::core::util::km_regex::findIndex(U"Does Not Exist", list), -1); list.emplace_back(U"0th"); list.emplace_back(U"First"); list.emplace_back(U"Second"); - assert_equal(transform_entry::findIndex(U"First", list), 1); - assert_equal(transform_entry::findIndex(U"0th", list), 0); - assert_equal(transform_entry::findIndex(U"Second", list), 2); - assert_equal(transform_entry::findIndex(U"Nowhere", list), -1); + assert_equal(km::core::util::km_regex::findIndex(U"First", list), 1); + assert_equal(km::core::util::km_regex::findIndex(U"0th", list), 0); + assert_equal(km::core::util::km_regex::findIndex(U"Second", list), 2); + assert_equal(km::core::util::km_regex::findIndex(U"Nowhere", list), -1); } return EXIT_SUCCESS;