forked from rust-lang/rust
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Rollup merge of rust-lang#70486 - Mark-Simulacrum:unicode-shrink, r=d…
…tolnay Shrink Unicode tables (even more) This shrinks the Unicode tables further, building upon the wins in rust-lang#68232 (the previous counts differ due to an interim Unicode version update, see rust-lang#69929. The new data structure is slower by around 3x, on the benchmark of looking up every Unicode scalar value in each data set sequentially in every data set included. Note that for ASCII, the exposed functions on `char` optimize with direct branches, so ASCII will retain the same performance regardless of internal optimizations (or the reverse). Also, note that the size reduction due to the skip list (from where the performance losses come) is around 40%, and, as a result, I believe the performance loss is acceptable, as the routines are still quite fast. Anywhere where this is hot, should probably be using a custom data structure anyway (e.g., a raw bitset) or something optimized for frequently seen values, etc. This PR updates both the bitset data structure, and introduces a new data structure similar to a skip list. For more details, see the [main.rs] of the table generator, which describes both. The commits mostly work individually and document size wins. As before, this is tested on all valid chars to have the same results as nightly (and the canonical Unicode data sets), happily, no bugs were found. 
[main.rs]: https://github.com/rust-lang/rust/blob/fb4a715e18b/src/tools/unicode-table-generator/src/main.rs

| Set             | Previous |   New | % of old | Codepoints | Ranges |
|-----------------|---------:|------:|---------:|-----------:|-------:|
| Alphabetic      |     3055 |  1599 |      52% |     132875 |    695 |
| Case Ignorable  |     2136 |   949 |      44% |       2413 |    410 |
| Cased           |      934 |   359 |      38% |       4286 |    141 |
| Cc              |       43 |     9 |      20% |         65 |      2 |
| Grapheme Extend |     1774 |   813 |      46% |       1979 |    344 |
| Lowercase       |      985 |   867 |      88% |       2344 |    652 |
| N               |     1266 |   419 |      33% |       1781 |    133 |
| Uppercase       |      934 |   777 |      83% |       1911 |    643 |
| White_Space     |      140 |    37 |      26% |         25 |     10 |
| Total           |    11267 |  5829 |      51% |          - |      - |
- Loading branch information
Showing
7 changed files
with
1,152 additions
and
652 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
#[inline(always)]
fn bitset_search<
    const N: usize,
    const CHUNK_SIZE: usize,
    const N1: usize,
    const CANONICAL: usize,
    const CANONICALIZED: usize,
>(
    needle: u32,
    chunk_idx_map: &[u8; N],
    bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
    bitset_canonical: &[u64; CANONICAL],
    bitset_canonicalized: &[(u8, u8); CANONICALIZED],
) -> bool {
    // Locate the 64-bit word of the conceptual bitset that would hold the
    // bit for `needle`, going through the two-level chunk index.
    let word_idx = (needle / 64) as usize;
    let map_idx = word_idx / CHUNK_SIZE;
    let piece = word_idx % CHUNK_SIZE;

    // A needle past the end of the chunk map lies beyond every encoded word,
    // so it cannot be a member of the set.
    let chunk = match chunk_idx_map.get(map_idx) {
        Some(&c) => c as usize,
        None => return false,
    };
    let canonical_idx = bitset_chunk_idx[chunk][piece] as usize;

    // Indices past the end of `bitset_canonical` denote words that are
    // derived from a canonical word via a one-byte mapping:
    // bit 7 = shift (vs. rotate), bit 6 = invert, low 6 bits = amount.
    let word = match bitset_canonical.get(canonical_idx) {
        Some(&w) => w,
        None => {
            let (source, mapping) =
                bitset_canonicalized[canonical_idx - bitset_canonical.len()];
            let mut w = bitset_canonical[source as usize];
            // Inversion is applied before the shift/rotate.
            if mapping & (1 << 6) != 0 {
                w = !w;
            }
            let amount = u32::from(mapping & ((1 << 6) - 1));
            if mapping & (1 << 7) != 0 {
                // Logical right shift by `amount`.
                w >> amount
            } else {
                w.rotate_left(amount)
            }
        }
    };
    word & (1u64 << (needle % 64)) != 0
}
|
||
/// Extracts the prefix-sum field — the low 21 bits — from a packed
/// short-offset-run header.
fn decode_prefix_sum(short_offset_run_header: u32) -> u32 {
    const PREFIX_SUM_BITS: u32 = 21;
    short_offset_run_header & ((1u32 << PREFIX_SUM_BITS) - 1)
}
|
||
/// Extracts the length/offset-index field — the high 11 bits — from a packed
/// short-offset-run header.
fn decode_length(short_offset_run_header: u32) -> usize {
    const PREFIX_SUM_BITS: u32 = 21;
    (short_offset_run_header >> PREFIX_SUM_BITS) as usize
}
|
||
/// Membership test against a skip-list-like encoding of a Unicode set.
///
/// Each `short_offset_runs` header packs two fields (see `decode_prefix_sum`
/// and `decode_length`): the low 21 bits are a prefix sum of run lengths, the
/// high 11 bits are a starting index into `offsets`. Membership is decided by
/// the parity of the final offset index — NOTE(review): this relies on the
/// generator emitting runs that alternate out-of-set/in-set; confirm against
/// the table generator's main.rs.
#[inline(always)]
fn skip_search<const SOR: usize, const OFFSETS: usize>(
    needle: u32,
    short_offset_runs: &[u32; SOR],
    offsets: &[u8; OFFSETS],
) -> bool {
    // Shifting both sides left by 11 discards the 11-bit length field, so the
    // binary search compares only the 21-bit prefix sums (a needle of at most
    // char::MAX fits in 21 bits).
    //
    // Note that this *cannot* be past the end of the array, as the last
    // element is greater than std::char::MAX (the largest possible needle).
    //
    // So, we cannot have found it (i.e. Ok(idx) + 1 != length) and the correct
    // location cannot be past it, so Err(idx) != length either.
    //
    // This means that we can avoid bounds checking for the accesses below, too.
    let last_idx =
        match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header << 11) {
            Ok(idx) => idx + 1,
            Err(idx) => idx,
        };

    // Start of this run's slice of `offsets`; its length is delimited by the
    // next header's start index (or the end of `offsets` for the last run).
    let mut offset_idx = decode_length(short_offset_runs[last_idx]);
    let length = if let Some(next) = short_offset_runs.get(last_idx + 1) {
        decode_length(*next) - offset_idx
    } else {
        offsets.len() - offset_idx
    };
    // Prefix sum of everything before this run (0 if this is the first run).
    let prev =
        last_idx.checked_sub(1).map(|prev| decode_prefix_sum(short_offset_runs[prev])).unwrap_or(0);

    // Walk the run's offsets, accumulating lengths until we pass the needle's
    // position within the run; the index where we stop decides membership.
    let total = needle - prev;
    let mut prefix_sum = 0;
    for _ in 0..(length - 1) {
        let offset = offsets[offset_idx];
        prefix_sum += offset as u32;
        if prefix_sum > total {
            break;
        }
        offset_idx += 1;
    }
    offset_idx % 2 == 1
}
Oops, something went wrong.