Skip to content

Commit

Permalink
feat(core): speed up NFD boundary table
Browse files Browse the repository at this point in the history
- use run-length encoding (RLE), thanks @mcdurdin
- much smaller table and faster lookup

Fixes: #9467
  • Loading branch information
srl295 committed Jun 4, 2024
1 parent 7fbeea8 commit e967386
Show file tree
Hide file tree
Showing 3 changed files with 231 additions and 947 deletions.
12 changes: 7 additions & 5 deletions core/src/util_normalize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,12 +211,14 @@ bool is_nfd(const std::u32string& str) {
bool has_nfd_boundary_before(km_core_usv cp) {
#ifdef __EMSCRIPTEN__
// it's a negative table. entries in the table mean returning false. non-entries return true.
for (int i=0;;i++) {
auto t = km_noBoundaryBefore[i];
if (t == 0) return true;
if (t > cp) return true;
if (t == cp) return false;
for (auto i=0;i<(km_noBoundaryBefore_entries*2);i+=2) {
auto start = km_noBoundaryBefore[i+0];
if (start > cp) return true;
auto count = km_noBoundaryBefore[i+1];
auto limit = start+count;
if (cp >= start && cp < limit) return false;
}
return true; // fallthrough
#else
UErrorCode status = U_ZERO_ERROR;
auto nfd = getNFD(status);
Expand Down
47 changes: 32 additions & 15 deletions core/tools/norm_unicode_update.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include <assert.h>

Expand All @@ -23,32 +24,48 @@ write_nfd_table(const char *NFD_FILE) {
f << "#pragma once" << std::endl;
f << "#define KM_HASBOUNDARYBEFORE_UNICODE_VERSION \"" << U_UNICODE_VERSION << "\"" << std::endl;
f << "#define KM_HASBOUNDARYBEFORE_ICU_VERSION \"" << U_ICU_VERSION << "\"" << std::endl;
f << "static char32_t km_noBoundaryBefore[] = {" << std::endl;
// we're going to need an NFD normalizer
UErrorCode status = U_ZERO_ERROR;
const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status);
assert(U_SUCCESS(status));

std::vector<km_core_usv> noBoundary;

for (km_core_usv ch = 0; ch < 0x10FFFF; ch++) {
bool bb = nfd->hasBoundaryBefore(ch);
assert(!(ch == 0 && !bb)); // assert that we can use U+0000 as a terminator

// TODO: This test may be better in test_unicode
// icu::UnicodeString s;
// s.append((UChar32)ch);
// bool lccc = nfd->isNormalized(s, status) && u_getCombiningClass(ch) == 0;
// assert(U_SUCCESS(status));
// if (bb != lccc) {
// printf("0x%04x - bb=%s but lccc=%s\n", (unsigned int)ch, bb ? "y" : "n", lccc ? "y" : "n");
// }
// assert(bb == lccc);
if (bb) continue; //only emit nonboundary
// char key[10];
// snprintf(key, 10, "%04X", (unsigned int)ch);
f << "\t0x" << std::hex << ch << "," << std::endl;
noBoundary.push_back(ch);
}

std::vector<std::pair<km_core_usv,std::size_t>> runs; // start,len

km_core_usv first = 0;
km_core_usv last = 0;
for(auto i = noBoundary.begin(); i <= noBoundary.end(); i++) {
if (first == 0) {
first = last = *i;
} else {
last++;
if(i == noBoundary.end() || *i != last) {
// end of a run
runs.emplace_back(first, last - first);
if (i != noBoundary.end()) {
// setup for next
first = last = *i;
}
}
}
}
f << "#define km_noBoundaryBefore_entries " << runs.size() << "\n";

f << "static char32_t km_noBoundaryBefore[km_noBoundaryBefore_entries * 2 ] = {" << std::endl;

for (auto i = runs.begin(); i < runs.end(); i++) {
f << "\t0x" << std::hex << i->first << std::dec << ",\t " << i->second << ", // ...0x" << std::hex << (i->first+i->second-1) << std::endl;
}

// termination
f << "\t0x" << std::hex << 0 << "," << std::endl;
f << "};" << std::endl;
return 0;
}
Expand Down
Loading

0 comments on commit e967386

Please sign in to comment.