Skip to content

Commit 613a5c9

Browse files
committed May 7, 2023
Auto merge of rust-lang#111222 - scottmcm:constify-is_ascii, r=thomcc
Constify `[u8]::is_ascii` (unstably). UTF-8 checking in `const fn` was stabilized back in 1.63 (rust-lang#97367), but apparently somehow ASCII checking was never const-ified, despite being simpler. New constness-tracking issue for `is_ascii`: rust-lang#111090. I noticed this while working on `ascii::Char`: rust-lang#110998.
2 parents 0dddad0 + c8c5a58 commit 613a5c9

File tree

6 files changed

+96
-15
lines changed

6 files changed

+96
-15
lines changed
 

‎library/core/src/array/ascii.rs

+14-1
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,23 @@ use crate::ascii;
44
impl<const N: usize> [u8; N] {
55
/// Converts this array of bytes into an array of ASCII characters,
66
/// or returns `None` if any of the characters is non-ASCII.
7+
///
8+
/// # Examples
9+
///
10+
/// ```
11+
/// #![feature(ascii_char)]
12+
/// #![feature(const_option)]
13+
///
14+
/// const HEX_DIGITS: [std::ascii::Char; 16] =
15+
/// *b"0123456789abcdef".as_ascii().unwrap();
16+
///
17+
/// assert_eq!(HEX_DIGITS[1].as_str(), "1");
18+
/// assert_eq!(HEX_DIGITS[10].as_str(), "a");
19+
/// ```
720
#[unstable(feature = "ascii_char", issue = "110998")]
821
#[must_use]
922
#[inline]
10-
pub fn as_ascii(&self) -> Option<&[ascii::Char; N]> {
23+
pub const fn as_ascii(&self) -> Option<&[ascii::Char; N]> {
1124
if self.is_ascii() {
1225
// SAFETY: Just checked that it's ASCII
1326
Some(unsafe { self.as_ascii_unchecked() })

‎library/core/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@
150150
#![feature(const_slice_from_raw_parts_mut)]
151151
#![feature(const_slice_from_ref)]
152152
#![feature(const_slice_index)]
153+
#![feature(const_slice_is_ascii)]
153154
#![feature(const_slice_ptr_len)]
154155
#![feature(const_slice_split_at_mut)]
155156
#![feature(const_str_from_utf8_unchecked_mut)]

‎library/core/src/slice/ascii.rs

+39-12
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,10 @@ use crate::ops;
1010
impl [u8] {
1111
/// Checks if all bytes in this slice are within the ASCII range.
1212
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
13+
#[rustc_const_unstable(feature = "const_slice_is_ascii", issue = "111090")]
1314
#[must_use]
1415
#[inline]
15-
pub fn is_ascii(&self) -> bool {
16+
pub const fn is_ascii(&self) -> bool {
1617
is_ascii(self)
1718
}
1819

@@ -21,7 +22,7 @@ impl [u8] {
2122
#[unstable(feature = "ascii_char", issue = "110998")]
2223
#[must_use]
2324
#[inline]
24-
pub fn as_ascii(&self) -> Option<&[ascii::Char]> {
25+
pub const fn as_ascii(&self) -> Option<&[ascii::Char]> {
2526
if self.is_ascii() {
2627
// SAFETY: Just checked that it's ASCII
2728
Some(unsafe { self.as_ascii_unchecked() })
@@ -262,11 +263,29 @@ impl<'a> fmt::Debug for EscapeAscii<'a> {
262263
/// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
263264
/// from `../str/mod.rs`, which does something similar for utf8 validation.
264265
#[inline]
265-
fn contains_nonascii(v: usize) -> bool {
266+
const fn contains_nonascii(v: usize) -> bool {
266267
const NONASCII_MASK: usize = usize::repeat_u8(0x80);
267268
(NONASCII_MASK & v) != 0
268269
}
269270

271+
/// ASCII test *without* the chunk-at-a-time optimizations.
272+
///
273+
/// This is carefully structured to produce nice small code -- it's smaller in
274+
/// `-O` than what the "obvious" ways produce under `-C opt-level=s`. If you
275+
/// touch it, be sure to run (and update if needed) the assembly test.
276+
#[unstable(feature = "str_internals", issue = "none")]
277+
#[doc(hidden)]
278+
#[inline]
279+
pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool {
280+
while let [rest @ .., last] = bytes {
281+
if !last.is_ascii() {
282+
break;
283+
}
284+
bytes = rest;
285+
}
286+
bytes.is_empty()
287+
}
288+
270289
/// Optimized ASCII test that will use usize-at-a-time operations instead of
271290
/// byte-at-a-time operations (when possible).
272291
///
@@ -280,7 +299,7 @@ fn contains_nonascii(v: usize) -> bool {
280299
/// If any of these loads produces something for which `contains_nonascii`
281300
/// (above) returns true, then we know the answer is false.
282301
#[inline]
283-
fn is_ascii(s: &[u8]) -> bool {
302+
const fn is_ascii(s: &[u8]) -> bool {
284303
const USIZE_SIZE: usize = mem::size_of::<usize>();
285304

286305
let len = s.len();
@@ -292,7 +311,7 @@ fn is_ascii(s: &[u8]) -> bool {
292311
// We also do this for architectures where `size_of::<usize>()` isn't
293312
// sufficient alignment for `usize`, because it's a weird edge case.
294313
if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < mem::align_of::<usize>() {
295-
return s.iter().all(|b| b.is_ascii());
314+
return is_ascii_simple(s);
296315
}
297316

298317
// We always read the first word unaligned, which means `align_offset` is
@@ -321,18 +340,26 @@ fn is_ascii(s: &[u8]) -> bool {
321340
// Paranoia check about alignment, since we're about to do a bunch of
322341
// aligned loads. In practice this should be impossible barring a bug in
323342
// `align_offset` though.
324-
debug_assert_eq!(word_ptr.addr() % mem::align_of::<usize>(), 0);
343+
// While this method is allowed to spuriously fail in CTFE, if it doesn't
344+
// have alignment information it should have given a `usize::MAX` for
345+
// `align_offset` earlier, sending things through the scalar path instead of
346+
// this one, so this check should pass if it's reachable.
347+
debug_assert!(word_ptr.is_aligned_to(mem::align_of::<usize>()));
325348

326349
// Read subsequent words until the last aligned word, excluding the last
327350
// aligned word by itself to be done in tail check later, to ensure that
328351
// tail is always one `usize` at most, avoiding an extra `byte_pos == len` branch.
329352
while byte_pos < len - USIZE_SIZE {
330-
debug_assert!(
331-
// Sanity check that the read is in bounds
332-
(word_ptr.addr() + USIZE_SIZE) <= start.addr().wrapping_add(len) &&
333-
// And that our assumptions about `byte_pos` hold.
334-
(word_ptr.addr() - start.addr()) == byte_pos
335-
);
353+
// Sanity check that the read is in bounds
354+
debug_assert!(byte_pos + USIZE_SIZE <= len);
355+
// And that our assumptions about `byte_pos` hold.
356+
debug_assert!(matches!(
357+
word_ptr.cast::<u8>().guaranteed_eq(start.wrapping_add(byte_pos)),
358+
// These are from the same allocation, so will hopefully always be
359+
// known to match even in CTFE, but if it refuses to compare them
360+
// that's ok since it's just a debug check anyway.
361+
None | Some(true),
362+
));
336363

337364
// SAFETY: We know `word_ptr` is properly aligned (because of
338365
// `align_offset`), and we know that we have enough bytes between `word_ptr` and the end

‎library/core/src/slice/mod.rs

+4
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,10 @@ mod raw;
4444
mod rotate;
4545
mod specialize;
4646

47+
#[unstable(feature = "str_internals", issue = "none")]
48+
#[doc(hidden)]
49+
pub use ascii::is_ascii_simple;
50+
4751
#[stable(feature = "rust1", since = "1.0.0")]
4852
pub use iter::{Chunks, ChunksMut, Windows};
4953
#[stable(feature = "rust1", since = "1.0.0")]

‎library/core/src/str/mod.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -2358,9 +2358,10 @@ impl str {
23582358
/// assert!(!non_ascii.is_ascii());
23592359
/// ```
23602360
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
2361+
#[rustc_const_unstable(feature = "const_slice_is_ascii", issue = "111090")]
23612362
#[must_use]
23622363
#[inline]
2363-
pub fn is_ascii(&self) -> bool {
2364+
pub const fn is_ascii(&self) -> bool {
23642365
// We can treat each byte as character here: all multibyte characters
23652366
// start with a byte that is not in the ASCII range, so we will stop
23662367
// there already.
@@ -2372,7 +2373,7 @@ impl str {
23722373
#[unstable(feature = "ascii_char", issue = "110998")]
23732374
#[must_use]
23742375
#[inline]
2375-
pub fn as_ascii(&self) -> Option<&[ascii::Char]> {
2376+
pub const fn as_ascii(&self) -> Option<&[ascii::Char]> {
23762377
// Like in `is_ascii`, we can work on the bytes directly.
23772378
self.as_bytes().as_ascii()
23782379
}

‎tests/assembly/slice-is_ascii.rs

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// revisions: WIN LIN
2+
// [WIN] only-windows
3+
// [LIN] only-linux
4+
// assembly-output: emit-asm
5+
// compile-flags: --crate-type=lib -O -C llvm-args=-x86-asm-syntax=intel
6+
// min-llvm-version: 14
7+
// only-x86_64
8+
// ignore-sgx
9+
// ignore-debug
10+
11+
#![feature(str_internals)]
12+
13+
// CHECK-LABEL: is_ascii_simple_demo:
14+
#[no_mangle]
15+
pub fn is_ascii_simple_demo(bytes: &[u8]) -> bool {
16+
// Linux (System V): pointer is rdi; length is rsi
17+
// Windows: pointer is rcx; length is rdx.
18+
19+
// CHECK-NOT: mov
20+
// CHECK-NOT: test
21+
// CHECK-NOT: cmp
22+
23+
// CHECK: .[[LOOPHEAD:.+]]:
24+
// CHECK-NEXT: mov [[TEMP:.+]], [[LEN:rsi|rdx]]
25+
// CHECK-NEXT: sub [[LEN]], 1
26+
// CHECK-NEXT: jb .[[LOOPEXIT:.+]]
27+
// CHECK-NEXT: cmp byte ptr [{{rdi|rcx}} + [[TEMP]] - 1], 0
28+
// CHECK-NEXT: jns .[[LOOPHEAD]]
29+
30+
// CHECK-NEXT: .[[LOOPEXIT]]:
31+
// CHECK-NEXT: test [[TEMP]], [[TEMP]]
32+
// CHECK-NEXT: sete al
33+
// CHECK-NEXT: ret
34+
core::slice::is_ascii_simple(bytes)
35+
}

0 commit comments

Comments
 (0)
Please sign in to comment.