Skip to content

Commit

Permalink
refactor(core): devolve regex to js for wasm
Browse files Browse the repository at this point in the history
- new module, core/src/util_regex.hpp
- no wasm implementation yet

Fixes: #9467
  • Loading branch information
srl295 committed Jun 5, 2024
1 parent f156a72 commit 55a025c
Show file tree
Hide file tree
Showing 7 changed files with 299 additions and 174 deletions.
4 changes: 4 additions & 0 deletions core/src/core_icu.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/normalizer2.h"
#include "unicode/uniset.h"
#include "unicode/usetiter.h"
#include "unicode/regex.h"
#include "unicode/utext.h"

#include "keyman_core.h"
#include "debuglog.h"
Expand Down
176 changes: 15 additions & 161 deletions core/src/ldml/ldml_transforms.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
// TEMP
#define KMN_NO_ICU 0
/*
Copyright: © SIL International.
Description: This is an implementation of the LDML keyboard spec 3.0.
Expand Down Expand Up @@ -439,20 +437,14 @@ reorder_group::apply(std::u32string &str) const {
}

transform_entry::transform_entry(const transform_entry &other)
: fFrom(other.fFrom), fTo(other.fTo), fFromPattern(nullptr), fMapFromStrId(other.fMapFromStrId),
: fFrom(other.fFrom), fTo(other.fTo), fFromPattern(other.fFromPattern), fMapFromStrId(other.fMapFromStrId),
fMapToStrId(other.fMapToStrId), fMapFromList(other.fMapFromList), fMapToList(other.fMapToList),
normalization_disabled(other.normalization_disabled) {
if (other.fFromPattern) {
// clone pattern
fFromPattern.reset(other.fFromPattern->clone());
}
}

transform_entry::transform_entry(const std::u32string &from, const std::u32string &to)
: fFrom(from), fTo(to), fFromPattern(nullptr), fMapFromStrId(), fMapToStrId(), fMapFromList(), fMapToList(), normalization_disabled(false) {
: fFrom(from), fTo(to), fFromPattern(from), fMapFromStrId(), fMapToStrId(), fMapFromList(), fMapToList(), normalization_disabled(false) {
assert(!fFrom.empty());

init();
}

transform_entry::transform_entry(
Expand All @@ -463,15 +455,20 @@ transform_entry::transform_entry(
const kmx::kmx_plus &kplus,
bool &valid,
bool norm_disabled)
: fFrom(from), fTo(to), fFromPattern(nullptr), fMapFromStrId(mapFrom), fMapToStrId(mapTo), normalization_disabled(norm_disabled) {
: fFrom(from), fTo(to), fFromPattern(), fMapFromStrId(mapFrom), fMapToStrId(mapTo), normalization_disabled(norm_disabled) {
if (!valid)
return; // exit early
assert(!fFrom.empty()); // TODO-LDML: should not happen?
assert((fMapFromStrId == 0) == (fMapToStrId == 0)); // we have both or we have neither.
assert(kplus.strs != nullptr);
assert(kplus.vars != nullptr);
assert(kplus.elem != nullptr);
if(!init()) {
std::u32string from2 = fFrom;
if (!normalization_disabled) {
// normalize, including markers, for regex
normalize_nfd_markers(from2, regex_sentinel);
}
if (!fFromPattern.init(from2)) {
valid = false;
}

Expand Down Expand Up @@ -508,160 +505,17 @@ transform_entry::transform_entry(
}
}

bool
transform_entry::init() {
if (fFrom.empty()) {
return false;
}
// TODO-LDML: if we have mapFrom, may need to do other processing.
std::u32string from2 = fFrom;
if (!normalization_disabled) {
// normalize, including markers, for regex
normalize_nfd_markers(from2, regex_sentinel);
}
std::u16string patstr = km::core::kmx::u32string_to_u16string(from2);
UErrorCode status = U_ZERO_ERROR;
/* const */ icu::UnicodeString patustr = icu::UnicodeString(patstr.data(), (int32_t)patstr.length());
// add '$' to match to end
patustr.append(u'$'); // TODO-LDML: may need to escape some markers. Marker #91 will look like a `[` to the pattern
fFromPattern.reset(icu::RegexPattern::compile(patustr, 0, status));
return (UASSERT_SUCCESS(status));
}

size_t
transform_entry::apply(const std::u32string &input, std::u32string &output) const {
assert(fFromPattern);
// TODO-LDML: Really? can't go from u32 to UnicodeString?
// TODO-LDML: Also, we could cache the u16 string at the transformGroup level or higher.
UErrorCode status = U_ZERO_ERROR;
const std::u16string matchstr = km::core::kmx::u32string_to_u16string(input);
icu::UnicodeString matchustr = icu::UnicodeString(matchstr.data(), (int32_t)matchstr.length());
// TODO-LDML: create a new Matcher every time. These could be cached and reset.
std::unique_ptr<icu::RegexMatcher> matcher(fFromPattern->matcher(matchustr, status));
if (!UASSERT_SUCCESS(status)) {
return 0; // TODO-LDML: return error
}

if (!matcher->find(status)) { // i.e. matches somewhere, in this case at end of str
return 0; // no match
}

// TODO-LDML: this is UTF-16 len, not UTF-32 len!!
// TODO-LDML: if we had an underlying UText this would be simpler.
int32_t matchStart = matcher->start(status);
int32_t matchEnd = matcher->end(status);
if (!UASSERT_SUCCESS(status)) {
return 0; // TODO-LDML: return error
}
// extract..
const icu::UnicodeString substr = matchustr.tempSubStringBetween(matchStart, matchEnd);
// preflight to UTF-32 to get length
UErrorCode substrStatus = U_ZERO_ERROR; // throwaway status
// we need the UTF-32 matchLen for our return.
auto matchLen = substr.toUTF32(nullptr, 0, substrStatus);

// should have matched something.
assert(matchLen > 0);

// now, do the replace.

/** this is the 'to' or other replacement string.*/
icu::UnicodeString rustr;
if (fMapFromStrId == 0) {
// Normal case: not a map.
// This replace will apply $1, $2 etc.
// Convert the fTo into u16 TODO-LDML (we could cache this?)
const std::u16string rstr = km::core::kmx::u32string_to_u16string(fTo);
rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length());
} else {
// Set map case: mapping from/to

// we actually need the group(1) string here.
// this is only the content in parenthesis ()
icu::UnicodeString group1 = matcher->group(1, status);
if (!UASSERT_SUCCESS(status)) {
// TODO-LDML: could be a malformed from pattern
return 0; // TODO-LDML: return error
}
// now, how long is group1 in UTF-32, hmm?
UErrorCode preflightStatus = U_ZERO_ERROR; // throwaway status
auto group1Len = group1.toUTF32(nullptr, 0, preflightStatus);
char32_t *s = new char32_t[group1Len + 1];
assert(s != nullptr); // TODO-LDML: OOM
// convert
group1.toUTF32((UChar32 *)s, group1Len + 1, status);
if (!UASSERT_SUCCESS(status)) {
return 0; // TODO-LDML: memory issue
}
std::u32string match32(s, group1Len); // taken from just group1
// clean up buffer
delete [] s;

// Now we're ready to do the actual mapping.

// 1., we need to find the index in the source set.
auto matchIndex = findIndexFrom(match32);
assert(matchIndex != -1L); // TODO-LDML: not matching shouldn't happen, the regex wouldn't have matched.
// we already asserted on load that the from and to sets have the same cardinality.

// 2. get the target string, convert to utf-16
// we use the same matchIndex that was just found
const std::u16string rstr = km::core::kmx::u32string_to_u16string(fMapToList.at(matchIndex));

// 3. update the UnicodeString for replacement
rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length());
// and we return to the regular code flow.
}
// here we replace the match output. No normalization, yet.
icu::UnicodeString entireOutput = matcher->replaceFirst(rustr, status);
if (!UASSERT_SUCCESS(status)) {
// TODO-LDML: could fail here due to bad input (syntax err)
return 0;
}
// entireOutput includes all of 'input', but modified. Need to substring it.
icu::UnicodeString outu = entireOutput.tempSubString(matchStart);

// Special case if there's no output, save some allocs
if (outu.length() == 0) {
output.clear();
} else {
// TODO-LDML: All we are trying to do is to extract the output string. Probably too many steps.
UErrorCode preflightStatus = U_ZERO_ERROR;
// calculate how big the buffer is
auto out32len = outu.toUTF32(nullptr, 0, preflightStatus); // preflightStatus will be an err, because we know the buffer overruns zero bytes
// allocate
std::unique_ptr<char32_t[]> s(new char32_t[out32len + 1]);
assert(s);
if (!s) {
return 0; // TODO-LDML: allocation failed
}
// convert
outu.toUTF32((UChar32 *)(s.get()), out32len + 1, status);
if (!UASSERT_SUCCESS(status)) {
return 0; // TODO-LDML: memory issue
}
output.assign(s.get(), out32len);
// NOW do a marker-safe normalize
if (!normalization_disabled && !normalize_nfd_markers(output)) {
auto result = fFromPattern.apply(input, output, fTo, fMapFromList, fMapToList);
// NOW do a marker-safe normalize
if (result != 0 && !output.empty() && !normalization_disabled) {
if (!normalize_nfd_markers(output)) {
DebugLog("normalize_nfd_markers(output) failed");
return 0; // TODO-LDML: normalization failed.
}
}
return matchLen;
}

/**
 * Look up `match` in this entry's fMapFromList.
 * Thin wrapper: forwards `match` and fMapFromList to findIndex() and
 * returns its result unchanged.
 * @param match the string to search for
 * @returns the index of the item in fMapFromList, or -1 (per the
 *          declaration's doc comment in the header)
 */
int32_t transform_entry::findIndexFrom(const std::u32string &match) const {
return findIndex(match, fMapFromList);
}

int32_t transform_entry::findIndex(const std::u32string &match, const std::deque<std::u32string> list) {
int32_t index = 0;
for(auto e = list.begin(); e < list.end(); e++, index++) {
if (match == *e) {
return index;
return 0; // TODO-LDML: normalization failed.
}
}
return -1; // not found
return result;
}

any_group::any_group(const transform_group &g) : type(any_group_type::transform), transform(g), reorder() {
Expand Down
10 changes: 2 additions & 8 deletions core/src/ldml/ldml_transforms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,7 @@
#include <utility>
#include "debuglog.h"

#include "core_icu.h"
#include "unicode/uniset.h"
#include "unicode/usetiter.h"
#include "unicode/regex.h"
#include "unicode/utext.h"
#include "util_regex.hpp"

namespace km {
namespace core {
Expand Down Expand Up @@ -111,14 +107,12 @@ class transform_entry {
private:
const std::u32string fFrom;
const std::u32string fTo;
std::unique_ptr<icu::RegexPattern> fFromPattern;
km::core::util::km_regex fFromPattern;

const KMX_DWORD fMapFromStrId;
const KMX_DWORD fMapToStrId;
std::deque<std::u32string> fMapFromList;
std::deque<std::u32string> fMapToList;
/** Internal function to setup pattern string @returns true on success */
bool init();
bool normalization_disabled;
/** @returns the index of the item in the fMapFromList list, or -1 */
int32_t findIndexFrom(const std::u32string &match) const;
Expand Down
1 change: 1 addition & 0 deletions core/src/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ kmx_files = files(
'km_core_processevent_api.cpp',
'jsonpp.cpp',
'util_normalize.cpp',
'util_regex.cpp',
'core_icu.cpp',
'ldml/ldml_processor.cpp',
'ldml/ldml_transforms.cpp',
Expand Down
Loading

0 comments on commit 55a025c

Please sign in to comment.