Skip to content

Rollup of 5 pull requests #70499

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Mar 28, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
903f67d
Avoid re-fetching Unicode data
Mark-Simulacrum Mar 19, 2020
7c4baed
Dynamically choose best chunk size
Mark-Simulacrum Mar 19, 2020
580a634
Generate tests for Unicode property data
Mark-Simulacrum Mar 19, 2020
6c7691a
Pre-pop zero chunks before mapping LAST_CHUNK_MAP
Mark-Simulacrum Mar 20, 2020
b0e121d
Shrink bitset words through functional mapping
Mark-Simulacrum Mar 21, 2020
7b29b70
Add a right shift mapping
Mark-Simulacrum Mar 21, 2020
5f71d98
Deduplicate test and primary range_search definitions
Mark-Simulacrum Mar 21, 2020
233ab2f
Push the byte of LAST_CHUNK_MAP into the array
Mark-Simulacrum Mar 21, 2020
a7ec6f8
Arrange for zero to be canonical
Mark-Simulacrum Mar 21, 2020
af243d4
Avoid relying on const parameters to function
Mark-Simulacrum Mar 21, 2020
33b9e6f
Add richer printing
Mark-Simulacrum Mar 24, 2020
95870e2
Add long error explanation for E0703
Polkaverse Mar 26, 2020
e5f4dad
Refactor code
Polkaverse Mar 26, 2020
6a744ea
Create output dir in rustdoc markdown render
TimotheeGerber Mar 26, 2020
28fe986
fix suggested changes
Polkaverse Mar 27, 2020
0d90612
Refactor changes
Polkaverse Mar 27, 2020
c09b5a3
Refactor changes
Polkaverse Mar 27, 2020
9c1ceec
Add skip list based implementation for smaller encoding
Mark-Simulacrum Mar 26, 2020
b6bc906
Remove separate encoding for a single nonzero-mapping byte
Mark-Simulacrum Mar 27, 2020
ad679a7
Update the documentation comment
Mark-Simulacrum Mar 27, 2020
6d886af
Fix rustdoc.css CSS tab-size property
wtfsck Mar 28, 2020
dc8a985
Replace last mention of IRC with Discord
bkaestner Mar 28, 2020
1f13089
Rollup merge of #70418 - PankajChaudhary5:master, r=Dylan-DPC
Dylan-DPC Mar 28, 2020
bbd3634
Rollup merge of #70448 - TimotheeGerber:rustdoc-create-output-dir, r=…
Dylan-DPC Mar 28, 2020
7f1e626
Rollup merge of #70486 - Mark-Simulacrum:unicode-shrink, r=dtolnay
Dylan-DPC Mar 28, 2020
f611193
Rollup merge of #70493 - 0xd4d:rustdoc-tab-size, r=GuillaumeGomez
Dylan-DPC Mar 28, 2020
e3ccd5b
Rollup merge of #70495 - bkaestner:master, r=Mark-Simulacrum
Dylan-DPC Mar 28, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 0 additions & 25 deletions src/libcore/unicode/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,28 +32,3 @@ pub use unicode_data::lowercase::lookup as Lowercase;
pub use unicode_data::n::lookup as N;
pub use unicode_data::uppercase::lookup as Uppercase;
pub use unicode_data::white_space::lookup as White_Space;

/// Tests whether `needle` is a member of a chunked, deduplicated bitset.
///
/// The code point is first reduced to the position of its 64-bit word
/// (`needle / 64`). Words are grouped sixteen to a chunk:
///
/// * `chunk_idx_map` maps a chunk position to a row of `bitset_chunk_idx`.
/// * Chunk positions past the end of `chunk_idx_map` are not in the set,
///   except for the single position `last_chunk_idx`, which uses the row
///   `last_chunk_mapping`.
/// * The selected row of `bitset_chunk_idx` maps the word's position within
///   its chunk to an index into `bitset`.
///
/// Returns `true` when bit `needle % 64` of the selected word is set.
#[inline(always)]
fn range_search<const N: usize, const N1: usize, const N2: usize>(
    needle: u32,
    chunk_idx_map: &[u8; N],
    (last_chunk_idx, last_chunk_mapping): (u16, u8),
    bitset_chunk_idx: &[[u8; 16]; N1],
    bitset: &[u64; N2],
) -> bool {
    let word_pos = (needle / 64) as usize;
    let map_pos = word_pos / 16;

    // `get` yields `None` exactly when `map_pos >= N`.
    let row = match chunk_idx_map.get(map_pos) {
        Some(&mapped) => mapped,
        None if map_pos == last_chunk_idx as usize => last_chunk_mapping,
        None => return false,
    };

    let word_idx = bitset_chunk_idx[row as usize][word_pos % 16];
    let word = bitset[word_idx as usize];
    let bit = (needle % 64) as u64;
    (word >> bit) & 1 == 1
}
957 changes: 443 additions & 514 deletions src/libcore/unicode/unicode_data.rs

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/librustc_error_codes/error_codes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ E0698: include_str!("./error_codes/E0698.md"),
E0699: include_str!("./error_codes/E0699.md"),
E0700: include_str!("./error_codes/E0700.md"),
E0701: include_str!("./error_codes/E0701.md"),
E0703: include_str!("./error_codes/E0703.md"),
E0704: include_str!("./error_codes/E0704.md"),
E0705: include_str!("./error_codes/E0705.md"),
E0706: include_str!("./error_codes/E0706.md"),
Expand Down Expand Up @@ -603,7 +604,6 @@ E0751: include_str!("./error_codes/E0751.md"),
// E0694, // an unknown tool name found in scoped attributes
E0696, // `continue` pointing to a labeled block
// E0702, // replaced with a generic attribute input check
E0703, // invalid ABI
// E0707, // multiple elided lifetimes used in arguments of `async fn`
E0708, // `async` non-`move` closures with parameters are not currently
// supported
Expand Down
17 changes: 17 additions & 0 deletions src/librustc_error_codes/error_codes/E0703.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Invalid ABI (Application Binary Interface) used in the code.

Erroneous code example:

```compile_fail,E0703
extern "invalid" fn foo() {} // error!
# fn main() {}
```

Only a small set of predefined ABIs (such as `Rust`, `C`, `system`, etc.) can
be used in Rust. Verify that the ABI you specified is one of them. For example,
you can replace the invalid ABI with `Rust`:

```
extern "Rust" fn foo() {} // ok!
# fn main() { }
```
4 changes: 2 additions & 2 deletions src/librustdoc/html/static/rustdoc.css
Original file line number Diff line number Diff line change
Expand Up @@ -1082,8 +1082,8 @@ h3 > .collapse-toggle, h4 > .collapse-toggle {

pre.rust {
position: relative;
tab-width: 4;
-moz-tab-width: 4;
tab-size: 4;
-moz-tab-size: 4;
}

.search-failed {
Expand Down
7 changes: 6 additions & 1 deletion src/librustdoc/markdown.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::fs::File;
use std::fs::{create_dir_all, File};
use std::io::prelude::*;
use std::path::PathBuf;

Expand Down Expand Up @@ -40,6 +40,11 @@ pub fn render(
diag: &rustc_errors::Handler,
edition: Edition,
) -> i32 {
if let Err(e) = create_dir_all(&options.output) {
diag.struct_err(&format!("{}: {}", options.output.display(), e)).emit();
return 4;
}

let mut output = options.output;
output.push(input.file_name().unwrap());
output.set_extension("html");
Expand Down
4 changes: 3 additions & 1 deletion src/libstd/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@
//! pull-requests for your suggested changes.
//!
//! Contributions are appreciated! If you see a part of the docs that can be
//! improved, submit a PR, or chat with us first on irc.mozilla.org #rust-docs.
//! improved, submit a PR, or chat with us first on [Discord][rust-discord]
//! #docs.
//!
//! # A Tour of The Rust Standard Library
//!
Expand Down Expand Up @@ -194,6 +195,7 @@
//! [multithreading]: thread/index.html
//! [other]: #what-is-in-the-standard-library-documentation
//! [primitive types]: ../book/ch03-02-data-types.html
//! [rust-discord]: https://discord.gg/rust-lang

#![stable(feature = "rust1", since = "1.0.0")]
#![doc(
Expand Down
1 change: 1 addition & 0 deletions src/test/ui/codemap_tests/unicode.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ LL | extern "路濫狼á́́" fn foo() {}

error: aborting due to previous error

For more information about this error, try `rustc --explain E0703`.
1 change: 1 addition & 0 deletions src/test/ui/parser/issue-8537.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ LL | "invalid-ab_isize"

error: aborting due to previous error

For more information about this error, try `rustc --explain E0703`.
186 changes: 182 additions & 4 deletions src/tools/unicode-table-generator/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,83 @@
//! This implements the core logic of the compression scheme used to compactly
//! encode Unicode properties.
//!
//! We have two primary goals with the encoding: we want to be compact, because
//! these tables often end up in ~every Rust program (especially the
//! grapheme_extend table, used for str debugging), including those for embedded
//! targets (where space is important). We also want to be relatively fast,
//! though this is more of a nice to have rather than a key design constraint.
//! It is expected that libraries/applications which are performance-sensitive
//! to Unicode property lookups are extremely rare, and those that care may find
//! the tradeoff of the raw bitsets worth it. For most applications, a
//! relatively fast but much smaller (and as such less cache-impacting, etc.)
//! data set is likely preferable.
//!
//! We have two separate encoding schemes: a skiplist-like approach, and a
//! compressed bitset. The datasets we consider mostly use the skiplist (it's
//! smaller) but the lowercase and uppercase sets are sufficiently sparse for
//! the bitset to be worthwhile -- for those sets the bitset is a 2x size win.
//! Since the bitset is also faster, this seems an obvious choice. (As a
//! historical note, the bitset was also the prior implementation, so its
//! relative complexity had already been paid).
//!
//! ## The bitset
//!
//! The primary idea is that we 'flatten' the Unicode ranges into an enormous
//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
//! over 17 thousand 64-bit words (roughly 136 kilobytes) of data per character
//! set -- way too much for our purposes.
//!
//! First, the raw bitset (one bit for every valid `char`, from 0 to 0x10FFFF,
//! not skipping the small 'gap') is associated into words (u64) and
//! deduplicated. On random data, this would be useless; on our data, this is
//! incredibly beneficial -- our data sets have (far) less than 256 unique
//! words.
//!
//! This gives us an array that maps `u8 -> word`; the current algorithm does
//! not handle the case of more than 256 unique words, but we are still
//! relatively far from reaching that limit.
//!
//! With that scheme, we now have a single byte for every 64 codepoints.
//!
//! We further chunk these by some constant N (between 1 and 64 per group,
//! dynamically chosen for smallest size), and again deduplicate and store in an
//! array (u8 -> [u8; N]).
//!
//! The bytes of this array map into the words from the bitset above, but we
//! apply another trick here: some of these words are similar enough that they
//! can be represented by some function of another word. The particular
//! functions chosen are rotation, inversion, and shifting (right).
//!
//! ## The skiplist
//!
//! The skip list arose out of the desire for an even smaller encoding than the
//! bitset -- and was the answer to the question "what is the smallest
//! representation we can imagine?". However, it is not necessarily the
//! smallest, and if you have a better proposal, please do suggest it!
//!
//! This is a relatively straightforward encoding. First, we break up all the
//! ranges in the input data into offsets from each other, essentially a gap
//! encoding. In practice, most gaps are small -- less than u8::MAX -- so we
//! store those directly. We make use of the larger gaps (which are nicely
//! interspersed already) throughout the dataset to index this data set.
//!
//! In particular, each run of small gaps (terminating in a large gap) is
//! indexed in a separate dataset. That data set stores an index into the
//! primary offset list and a prefix sum of that offset list. These are packed
//! into a single u32 (11 bits for the offset, 21 bits for the prefix sum).
//!
//! Lookup proceeds via a binary search in the index and then a straightforward
//! linear scan (adding up the offsets) until we reach the needle, and then the
//! index of that offset is utilized as the answer to whether we're in the set
//! or not.

use std::collections::{BTreeMap, HashMap};
use std::ops::Range;
use ucd_parse::Codepoints;

mod case_mapping;
mod raw_emitter;
mod skiplist;
mod unicode_download;

use raw_emitter::{emit_codepoints, RawEmitter};
Expand Down Expand Up @@ -152,9 +226,17 @@ fn main() {
std::process::exit(1);
});

// Optional test path, which is a Rust source file testing that the unicode
// property lookups are correct.
let test_path = std::env::args().nth(2);

let unicode_data = load_data();
let ranges_by_property = &unicode_data.ranges;

if let Some(path) = test_path {
std::fs::write(&path, generate_tests(&write_location, &ranges_by_property)).unwrap();
}

let mut total_bytes = 0;
let mut modules = Vec::new();
for (property, ranges) in ranges_by_property {
Expand All @@ -163,7 +245,16 @@ fn main() {
emit_codepoints(&mut emitter, &ranges);

modules.push((property.to_lowercase().to_string(), emitter.file));
println!("{:15}: {} bytes, {} codepoints", property, emitter.bytes_used, datapoints,);
println!(
"{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}",
property,
emitter.bytes_used,
datapoints,
ranges.len(),
ranges.first().unwrap().start,
ranges.last().unwrap().end,
emitter.desc,
);
total_bytes += emitter.bytes_used;
}

Expand All @@ -173,7 +264,10 @@ fn main() {
"///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n",
);

table_file.push_str("use super::range_search;\n\n");
// Include the range search function
table_file.push('\n');
table_file.push_str(include_str!("range_search.rs"));
table_file.push('\n');

table_file.push_str(&version());

Expand Down Expand Up @@ -236,26 +330,110 @@ fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
out
}

/// Builds the source text of a Rust program that checks the generated Unicode
/// property lookups against the raw range data.
///
/// `data_path` is written verbatim into a `#[path = "..."]` attribute so the
/// program loads the generated tables as its `unicode_data` module. `ranges`
/// pairs each property name with the code point ranges that have that
/// property.
///
/// For every property the emitted `main` calls a `<property>_true` and a
/// `<property>_false` function (both defined inside `main`), whose bodies are
/// produced by `generate_asserts` from the code points that are inside and
/// outside the property's ranges respectively.
fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String {
    let mut s = String::new();
    s.push_str("#![allow(incomplete_features, unused)]\n");
    s.push_str("#![feature(const_generics)]\n\n");
    s.push_str("\n#[allow(unused)]\nuse std::hint;\n");
    s.push_str(&format!("#[path = \"{}\"]\n", data_path));
    s.push_str("mod unicode_data;\n\n");

    s.push_str("\nfn main() {\n");

    for (property, ranges) in ranges {
        let module = property.to_lowercase();

        s.push_str(&format!(r#" println!("Testing {}");"#, property));
        s.push('\n');
        s.push_str(&format!(" {}_true();\n", module));
        s.push_str(&format!(" {}_false();\n", module));

        // The upper bound is inclusive so that `char::MAX` (U+10FFFF) itself
        // is checked; values `from_u32` rejects are not valid `char`s and are
        // skipped.
        let (is_true, is_false): (Vec<u32>, Vec<u32>) = (0..=(std::char::MAX as u32))
            .filter(|&ch_num| std::char::from_u32(ch_num).is_some())
            .partition(|ch_num| ranges.iter().any(|r| r.contains(ch_num)));

        s.push_str(&format!(" fn {}_true() {{\n", module));
        generate_asserts(&mut s, property, &is_true, true);
        s.push_str(" }\n\n");
        s.push_str(&format!(" fn {}_false() {{\n", module));
        generate_asserts(&mut s, property, &is_false, false);
        s.push_str(" }\n\n");
    }

    s.push_str("}");
    s
}

/// Appends assertions about `property` for every code point in `points` to `s`.
///
/// The points are first grouped into ranges by `ranges_from_set`. A range of
/// exactly one code point becomes a single `assert!` on the `char` itself,
/// with the numeric value as the failure message. A longer range becomes a
/// `for` loop over the numeric range that asserts on each value in turn.
///
/// When `truthy` is `false` each lookup is negated with `!`, i.e. the emitted
/// code asserts that the lookup returns `false`.
fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) {
    let negation = if truthy { "" } else { "!" };
    let module = property.to_lowercase();

    for span in ranges_from_set(points) {
        if span.start + 1 != span.end {
            // More than one code point: emit a loop over the whole range.
            let header = format!(" for chn in {:?}u32 {{\n", span);
            let check = format!(
                " assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n",
                negation, module,
            );
            s.push_str(&header);
            s.push_str(&check);
            s.push_str(" }\n");
            continue;
        }

        // Exactly one code point: emit a direct assertion on that `char`.
        let ch = std::char::from_u32(span.start).unwrap();
        let check = format!(
            " assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n",
            negation, module, ch, span.start,
        );
        s.push_str(&check);
    }
}

/// Converts a list of code points into ranges.
///
/// Every element `e` of `set` is first turned into the one-element range
/// `e..e + 1`; the resulting list is then handed to `merge_ranges` and
/// returned.
fn ranges_from_set(set: &[u32]) -> Vec<Range<u32>> {
    let mut singletons: Vec<Range<u32>> = Vec::with_capacity(set.len());
    for &point in set {
        singletons.push(point..(point + 1));
    }
    merge_ranges(&mut singletons);
    singletons
}

/// Coalesces touching ranges in place.
///
/// Two neighbouring ranges are joined whenever the first one ends exactly
/// where the second one starts (`a..b` followed by `b..c` becomes `a..c`).
/// Chains of any length collapse into a single range, and an empty input is
/// left empty. Ranges are only compared with their immediate neighbour; the
/// input is not sorted by this function.
///
/// # Panics
///
/// Panics if, after merging, some range does not start strictly after the end
/// of the range before it.
fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
    // A single left-to-right pass is enough: each incoming range either
    // extends the most recently emitted range or starts a new one.
    let mut merged: Vec<Range<u32>> = Vec::with_capacity(ranges.len());
    for range in ranges.drain(..) {
        match merged.last_mut() {
            Some(prev) if prev.end == range.start => prev.end = range.end,
            _ => merged.push(range),
        }
    }
    *ranges = merged;

    // Sanity check: every range must begin strictly after its predecessor
    // ended, i.e. there is a gap between any two consecutive ranges.
    let mut last_end = None;
    for range in ranges.iter() {
        if let Some(last) = last_end {
            assert!(range.start > last, "{:?}", range);
        }
        last_end = Some(range.end);
    }
}
Loading