Skip to content

Commit 8a683ff

Browse files
hsivonen, Manishearth, valenting
authored
Merge idna-v1x to main (#990)
* Adjust Punycode overflow checks * The change made in 1.0.0 incorrectly assumed that the input length limit removed the need to do overflow check when decoding. Now the internal-caller length limit is taken as a permission to skip overflow checks only when encoding. * The RFC gives overflow checking pre-flight math for languages that don't have checked math. Since Rust does, the code now uses checked_add and checked_mul instead of pre-flight when overflow checks are performed. * Remove no_std category (crates.io doesn't support it, and it is now rejected), use keywords instead * Add benches that use the main idna 1.0 entry point in idna and url * Put the Unicode back end behind an adapter crate * Split fastest ASCII fast path from the rest * Bench hyphen in a domain that is otherwise lower-case ASCII * Adjust MSRV * Add README remarks about alternative Unicode back ends * Change the idna_adapter dependency to crates.io * Address clippy lints * Increment version number of idna to 1.0.3 * Test MSRV with idna unicode-rs back end and test ICU4X back end with 1.67 * Prepare url crate for publication with idna 1.0.3 (#987) --------- Co-authored-by: Manish Goregaokar <manishsmail@gmail.com> Co-authored-by: Valentin Gosu <1454649+valenting@users.noreply.github.com>
1 parent 08a3268 commit 8a683ff

22 files changed

+8539
-30453
lines changed

.github/workflows/main.yml

+9-4
Original file line number | Diff line number | Diff line change
@@ -15,12 +15,12 @@ jobs:
1515
strategy:
1616
matrix:
1717
os: [ubuntu-latest, macos-latest, windows-latest]
18-
rust: [1.56.0, stable, beta, nightly]
18+
rust: [1.57.0, 1.67.0, stable, beta, nightly]
1919
exclude:
2020
- os: macos-latest
21-
rust: 1.56.0
21+
rust: 1.67.0
2222
- os: windows-latest
23-
rust: 1.56.0
23+
rust: 1.67.0
2424
- os: macos-latest
2525
rust: beta
2626
- os: windows-latest
@@ -37,6 +37,10 @@ jobs:
3737
toolchain: ${{ matrix.rust }}
3838
# Add toolchain for no_std tests
3939
- run: rustup toolchain install nightly
40+
- name: Downgrade idna_adapter on Rust 1.57.0
41+
if: |
42+
matrix.rust == '1.57.0'
43+
run: cargo update -p idna_adapter --precise 1.1.0
4044
- name: Add `aarch64-unknown-none` toolchain for `no_std` tests
4145
if: |
4246
matrix.os == 'ubuntu-latest' &&
@@ -54,7 +58,8 @@ jobs:
5458
- name: Run debugger_visualizer tests
5559
if: |
5660
matrix.os == 'windows-latest' &&
57-
matrix.rust != '1.56.0'
61+
matrix.rust != '1.57.0' &&
62+
matrix.rust != '1.67.0'
5863
run: cargo test --test debugger_visualizer --features "url/debugger_visualizer,url_debug_tests/debugger_visualizer" -- --test-threads=1 || echo "debugger test failed"
5964
continue-on-error: true # Fails on GH actions, but not locally.
6065
- name: Test `no_std` support

README.md

+4
Original file line number | Diff line number | Diff line change
@@ -12,3 +12,7 @@ URL library for Rust, based on the [URL Standard](https://url.spec.whatwg.org/).
1212
[Documentation](https://docs.rs/url)
1313

1414
Please see [UPGRADING.md](https://github.com/servo/rust-url/blob/main/UPGRADING.md) if you are upgrading from a previous version.
15+
16+
## Alternative Unicode back ends
17+
18+
`url` depends on the `idna` crate. By default, `idna` uses [ICU4X](https://github.com/unicode-org/icu4x/) as its Unicode back end. If you wish to opt for different tradeoffs between correctness, run-time performance, binary size, compile time, and MSRV, please see the [README of the latest version of the `idna_adapter` crate](https://docs.rs/crate/idna_adapter/latest) for how to opt into a different Unicode back end.

idna/Cargo.toml

+12-7
Original file line number | Diff line number | Diff line change
@@ -1,22 +1,23 @@
11
[package]
22
name = "idna"
3-
version = "0.5.0"
3+
version = "1.0.3"
44
authors = ["The rust-url developers"]
55
description = "IDNA (Internationalizing Domain Names in Applications) and Punycode."
6-
categories = ["no_std"]
6+
keywords = ["no_std", "web", "http"]
77
repository = "https://github.com/servo/rust-url/"
88
license = "MIT OR Apache-2.0"
99
autotests = false
1010
edition = "2018"
11-
rust-version = "1.51"
11+
rust-version = "1.57" # For panic in const context
1212

1313
[lib]
1414
doctest = false
1515

1616
[features]
17-
default = ["std"]
18-
std = ["alloc", "unicode-bidi/std", "unicode-normalization/std"]
17+
default = ["std", "compiled_data"]
18+
std = ["alloc"]
1919
alloc = []
20+
compiled_data = ["idna_adapter/compiled_data"]
2021

2122
[[test]]
2223
name = "tests"
@@ -25,15 +26,19 @@ harness = false
2526
[[test]]
2627
name = "unit"
2728

29+
[[test]]
30+
name = "unitbis"
31+
2832
[dev-dependencies]
2933
assert_matches = "1.3"
3034
bencher = "0.1"
3135
tester = "0.9"
3236
serde_json = "1.0"
3337

3438
[dependencies]
35-
unicode-bidi = { version = "0.3.10", default-features = false, features = ["hardcoded-data"] }
36-
unicode-normalization = { version = "0.1.22", default-features = false }
39+
utf8_iter = "1.0.4"
40+
smallvec = { version = "1.13.1", features = ["const_generics"]}
41+
idna_adapter = "1"
3742

3843
[[bench]]
3944
name = "all"

idna/README.md

+42
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,42 @@
1+
# `idna`
2+
3+
IDNA library for Rust implementing [UTS 46: Unicode IDNA Compatibility Processing](https://www.unicode.org/reports/tr46/) as parametrized by the [WHATWG URL Standard](https://url.spec.whatwg.org/#idna).
4+
5+
## What it does
6+
7+
* An implementation of UTS 46 is provided, with configurable ASCII deny list (e.g. STD3 or WHATWG rules).
8+
* A callback mechanism is provided for pluggable logic for deciding if a label is deemed potentially too misleading to render as Unicode in a user interface.
9+
* Errors are marked as U+FFFD REPLACEMENT CHARACTERs in Unicode output so that locations of errors may be illustrated to the user.
10+
11+
## What it does not do
12+
13+
* There is no default/sample policy provided for the callback mechanism mentioned above.
14+
* Only UTS 46 is implemented: There is no API to request strictly IDNA 2008 only or strictly IDNA 2003 only.
15+
* There is no API for categorizing errors beyond there being an error.
16+
* Checks that are configurable in UTS 46 but that the WHATWG URL Standard always sets a particular way (regardless of the _beStrict_ flag in the URL Standard) cannot be configured (with the exception of the old deprecated API supporting transitional processing).
17+
18+
## Usage
19+
20+
Apps that need to prepare a hostname for usage in protocols are likely to only need the top-level function `domain_to_ascii_cow` with `AsciiDenyList::URL` as the second argument. Note that this rejects IPv6 addresses, so before this, you need to check if the first byte of the input is `b'['` and, if it is, treat the input as an IPv6 address instead.
21+
22+
Apps that need to display host names to the user should use `uts46::Uts46::to_user_interface`. The _ToUnicode_ operation is rarely appropriate for direct application usage.
23+
24+
## Cargo features
25+
26+
* `alloc` - For future proofing. Currently always required. Currently, the crate internals may allocate on the heap, but for typical inputs they do not (apart from the output `String` when applicable).
27+
* `compiled_data` - For future proofing. Currently always required. (Passed through to ICU4X.)
28+
* `std` - Adds `impl std::error::Error for Errors {}` (and implies `alloc`).
29+
* By default, all of the above are enabled.
30+
31+
## Alternative Unicode back ends
32+
33+
By default, `idna` uses [ICU4X](https://github.com/unicode-org/icu4x/) as its Unicode back end. If you wish to opt for different tradeoffs between correctness, run-time performance, binary size, compile time, and MSRV, please see the [README of the latest version of the `idna_adapter` crate](https://docs.rs/crate/idna_adapter/latest) for how to opt into a different Unicode back end.
34+
35+
## Breaking changes since 0.5.0
36+
37+
* Stricter IDNA 2008 restrictions are no longer supported. Attempting to enable them panics immediately. UTS 46 allows all the names that IDNA 2008 allows, and when transitional processing is disabled, they resolve the same way. There are additional names that IDNA 2008 disallows but UTS 46 maps to names that IDNA 2008 allows (notably, input is mapped to fold-case output). UTS 46 also allows symbols that were allowed in IDNA 2003 as well as newer symbols that are allowed according to the same principle. (Earlier versions of this crate allowed rejecting such symbols. Rejecting characters that UTS 46 maps to IDNA 2008-permitted characters wasn't supported in earlier versions, either.)
38+
* `domain_to_ascii_strict` now performs the _CheckHyphens_ check (matching previous documentation).
39+
* The ContextJ rules are now implemented and always enabled, even when using the old deprecated API, so input that fails those rules is rejected.
40+
* The `Idna::to_ascii_inner` method has been removed. It didn't make sense as a public method, since callers were unable to figure out if there were errors. (A GitHub search found no callers for this method.)
41+
* Punycode labels whose decoding does not yield any non-ASCII characters are now treated as being in error.
42+
* When turning off default cargo features, the cargo feature `compiled_data` needs to be explicitly enabled.

idna/benches/all.rs

+56
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
#![allow(deprecated)]
2+
13
#[macro_use]
24
extern crate bencher;
35
extern crate idna;
@@ -47,6 +49,51 @@ fn to_ascii_merged(bench: &mut Bencher) {
4749
bench.iter(|| config.to_ascii(black_box(encoded)));
4850
}
4951

52+
fn to_ascii_cow_plain(bench: &mut Bencher) {
53+
let encoded = "example.com".as_bytes();
54+
bench.iter(|| idna::domain_to_ascii_cow(black_box(encoded), idna::AsciiDenyList::URL));
55+
}
56+
57+
fn to_ascii_cow_hyphen(bench: &mut Bencher) {
58+
let encoded = "hyphenated-example.com".as_bytes();
59+
bench.iter(|| idna::domain_to_ascii_cow(black_box(encoded), idna::AsciiDenyList::URL));
60+
}
61+
62+
fn to_ascii_cow_leading_digit(bench: &mut Bencher) {
63+
let encoded = "1test.example".as_bytes();
64+
bench.iter(|| idna::domain_to_ascii_cow(black_box(encoded), idna::AsciiDenyList::URL));
65+
}
66+
67+
fn to_ascii_cow_unicode_mixed(bench: &mut Bencher) {
68+
let encoded = "مثال.example".as_bytes();
69+
bench.iter(|| idna::domain_to_ascii_cow(black_box(encoded), idna::AsciiDenyList::URL));
70+
}
71+
72+
fn to_ascii_cow_punycode_mixed(bench: &mut Bencher) {
73+
let encoded = "xn--mgbh0fb.example".as_bytes();
74+
bench.iter(|| idna::domain_to_ascii_cow(black_box(encoded), idna::AsciiDenyList::URL));
75+
}
76+
77+
fn to_ascii_cow_unicode_ltr(bench: &mut Bencher) {
78+
let encoded = "නම.උදාහරණ".as_bytes();
79+
bench.iter(|| idna::domain_to_ascii_cow(black_box(encoded), idna::AsciiDenyList::URL));
80+
}
81+
82+
fn to_ascii_cow_punycode_ltr(bench: &mut Bencher) {
83+
let encoded = "xn--r0co.xn--ozc8dl2c3bxd".as_bytes();
84+
bench.iter(|| idna::domain_to_ascii_cow(black_box(encoded), idna::AsciiDenyList::URL));
85+
}
86+
87+
fn to_ascii_cow_unicode_rtl(bench: &mut Bencher) {
88+
let encoded = "الاسم.مثال".as_bytes();
89+
bench.iter(|| idna::domain_to_ascii_cow(black_box(encoded), idna::AsciiDenyList::URL));
90+
}
91+
92+
fn to_ascii_cow_punycode_rtl(bench: &mut Bencher) {
93+
let encoded = "xn--mgba0b1dh.xn--mgbh0fb".as_bytes();
94+
bench.iter(|| idna::domain_to_ascii_cow(black_box(encoded), idna::AsciiDenyList::URL));
95+
}
96+
5097
benchmark_group!(
5198
benches,
5299
to_unicode_puny_label,
@@ -56,5 +103,14 @@ benchmark_group!(
56103
to_ascii_already_puny_label,
57104
to_ascii_simple,
58105
to_ascii_merged,
106+
to_ascii_cow_plain,
107+
to_ascii_cow_hyphen,
108+
to_ascii_cow_leading_digit,
109+
to_ascii_cow_unicode_mixed,
110+
to_ascii_cow_punycode_mixed,
111+
to_ascii_cow_unicode_ltr,
112+
to_ascii_cow_punycode_ltr,
113+
to_ascii_cow_unicode_rtl,
114+
to_ascii_cow_punycode_rtl,
59115
);
60116
benchmark_main!(benches);

0 commit comments

Comments
 (0)