Skip to content

Commit 26ef195

Browse files
committed
rust-lang#66219 documented unsafe in core::str
1 parent 4679271 commit 26ef195

File tree

3 files changed

+55
-13
lines changed

3 files changed

+55
-13
lines changed

src/libcore/str/lossy.rs

+4-2
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@ use crate::str as core_str;
33
use crate::fmt::{self, Write};
44
use crate::mem;
55

6-
// ignore-tidy-undocumented-unsafe
7-
86
/// Lossy UTF-8 string.
97
#[unstable(feature = "str_internals", issue = "0")]
108
pub struct Utf8Lossy {
@@ -17,6 +15,7 @@ impl Utf8Lossy {
1715
}
1816

1917
pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
18+
// SAFETY: both use the same memory layout, and utf8 correctness isn't required
2019
unsafe { mem::transmute(bytes) }
2120
}
2221

@@ -61,6 +60,7 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
6160
while i < self.source.len() {
6261
let i_ = i;
6362

63+
// SAFETY: 0 <= i < self.source.len()
6464
let byte = unsafe { *self.source.get_unchecked(i) };
6565
i += 1;
6666

@@ -70,6 +70,7 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
7070
let w = core_str::utf8_char_width(byte);
7171

7272
macro_rules! error { () => ({
73+
// SAFETY: we have checked up to i that source is valid utf8
7374
unsafe {
7475
let r = Utf8LossyChunk {
7576
valid: core_str::from_utf8_unchecked(&self.source[0..i_]),
@@ -130,6 +131,7 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
130131
}
131132

132133
let r = Utf8LossyChunk {
134+
// SAFETY: we have checked that the entire source is valid utf8
133135
valid: unsafe { core_str::from_utf8_unchecked(self.source) },
134136
broken: &[],
135137
};

src/libcore/str/mod.rs

+35-9
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
// ignore-tidy-filelength
2-
// ignore-tidy-undocumented-unsafe
32

43
//! String manipulation.
54
//!
@@ -337,6 +336,7 @@ impl Utf8Error {
337336
#[stable(feature = "rust1", since = "1.0.0")]
338337
pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
339338
run_utf8_validation(v)?;
339+
// SAFETY: just ran validation
340340
Ok(unsafe { from_utf8_unchecked(v) })
341341
}
342342

@@ -375,6 +375,7 @@ pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
375375
#[stable(feature = "str_mut_extras", since = "1.20.0")]
376376
pub fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> {
377377
run_utf8_validation(v)?;
378+
// SAFETY: just ran validation
378379
Ok(unsafe { from_utf8_unchecked_mut(v) })
379380
}
380381

@@ -567,7 +568,7 @@ impl<'a> Iterator for Chars<'a> {
567568
#[inline]
568569
fn next(&mut self) -> Option<char> {
569570
next_code_point(&mut self.iter).map(|ch| {
570-
// str invariant says `ch` is a valid Unicode Scalar Value
571+
// SAFETY: str invariant says `ch` is a valid Unicode Scalar Value
571572
unsafe {
572573
char::from_u32_unchecked(ch)
573574
}
@@ -616,7 +617,7 @@ impl<'a> DoubleEndedIterator for Chars<'a> {
616617
#[inline]
617618
fn next_back(&mut self) -> Option<char> {
618619
next_code_point_reverse(&mut self.iter).map(|ch| {
619-
// str invariant says `ch` is a valid Unicode Scalar Value
620+
// SAFETY: str invariant says `ch` is a valid Unicode Scalar Value
620621
unsafe {
621622
char::from_u32_unchecked(ch)
622623
}
@@ -648,6 +649,7 @@ impl<'a> Chars<'a> {
648649
#[stable(feature = "iter_to_slice", since = "1.4.0")]
649650
#[inline]
650651
pub fn as_str(&self) -> &'a str {
652+
// SAFETY: Chars is only made from a str, which guarantees the iter is valid utf8
651653
unsafe { from_utf8_unchecked(self.iter.as_slice()) }
652654
}
653655
}
@@ -1080,6 +1082,7 @@ impl<'a, P: Pattern<'a>> SplitInternal<'a, P> {
10801082
fn get_end(&mut self) -> Option<&'a str> {
10811083
if !self.finished && (self.allow_trailing_empty || self.end - self.start > 0) {
10821084
self.finished = true;
1085+
// SAFETY: self.start and self.end always lie on unicode boundaries
10831086
unsafe {
10841087
let string = self.matcher.haystack().get_unchecked(self.start..self.end);
10851088
Some(string)
@@ -1095,6 +1098,7 @@ impl<'a, P: Pattern<'a>> SplitInternal<'a, P> {
10951098

10961099
let haystack = self.matcher.haystack();
10971100
match self.matcher.next_match() {
1101+
// SAFETY: Searcher guarantees that a and b lie on unicode boundaries
10981102
Some((a, b)) => unsafe {
10991103
let elt = haystack.get_unchecked(self.start..a);
11001104
self.start = b;
@@ -1120,11 +1124,13 @@ impl<'a, P: Pattern<'a>> SplitInternal<'a, P> {
11201124

11211125
let haystack = self.matcher.haystack();
11221126
match self.matcher.next_match_back() {
1127+
// SAFETY: Searcher guarantees that a and b lie on unicode boundaries
11231128
Some((a, b)) => unsafe {
11241129
let elt = haystack.get_unchecked(b..self.end);
11251130
self.end = a;
11261131
Some(elt)
11271132
},
1133+
// SAFETY: self.start and self.end always lie on unicode boundaries
11281134
None => unsafe {
11291135
self.finished = true;
11301136
Some(haystack.get_unchecked(self.start..self.end))
@@ -1253,6 +1259,7 @@ where
12531259
impl<'a, P: Pattern<'a>> MatchIndicesInternal<'a, P> {
12541260
#[inline]
12551261
fn next(&mut self) -> Option<(usize, &'a str)> {
1262+
// SAFETY: Searcher guarantees that start and end lie on unicode boundaries
12561263
self.0.next_match().map(|(start, end)| unsafe {
12571264
(start, self.0.haystack().get_unchecked(start..end))
12581265
})
@@ -1262,6 +1269,7 @@ impl<'a, P: Pattern<'a>> MatchIndicesInternal<'a, P> {
12621269
fn next_back(&mut self) -> Option<(usize, &'a str)>
12631270
where P::Searcher: ReverseSearcher<'a>
12641271
{
1272+
// SAFETY: Searcher guarantees that start and end lie on unicode boundaries
12651273
self.0.next_match_back().map(|(start, end)| unsafe {
12661274
(start, self.0.haystack().get_unchecked(start..end))
12671275
})
@@ -1307,6 +1315,7 @@ where
13071315
impl<'a, P: Pattern<'a>> MatchesInternal<'a, P> {
13081316
#[inline]
13091317
fn next(&mut self) -> Option<&'a str> {
1318+
// SAFETY: Searcher guarantees that a and b lie on unicode boundaries
13101319
self.0.next_match().map(|(a, b)| unsafe {
13111320
// Indices are known to be on utf8 boundaries
13121321
self.0.haystack().get_unchecked(a..b)
@@ -1317,6 +1326,7 @@ impl<'a, P: Pattern<'a>> MatchesInternal<'a, P> {
13171326
fn next_back(&mut self) -> Option<&'a str>
13181327
where P::Searcher: ReverseSearcher<'a>
13191328
{
1329+
// SAFETY: Searcher guarantees that a and b lie on unicode boundaries
13201330
self.0.next_match_back().map(|(a, b)| unsafe {
13211331
// Indices are known to be on utf8 boundaries
13221332
self.0.haystack().get_unchecked(a..b)
@@ -1538,6 +1548,9 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
15381548
if align != usize::max_value() && align.wrapping_sub(index) % usize_bytes == 0 {
15391549
let ptr = v.as_ptr();
15401550
while index < blocks_end {
1551+
// SAFETY: since align - index and ascii_block_size are multiples of
1552+
// usize_bytes, ptr.add(index) is always aligned with a usize so we may cast
1553+
// directly to a const pointer.
15411554
unsafe {
15421555
let block = ptr.add(index) as *const usize;
15431556
// break if there is a nonascii byte
@@ -1760,6 +1773,7 @@ mod traits {
17601773
if self.start <= self.end &&
17611774
slice.is_char_boundary(self.start) &&
17621775
slice.is_char_boundary(self.end) {
1776+
// SAFETY: just checked that start and end are on a char boundary
17631777
Some(unsafe { self.get_unchecked(slice) })
17641778
} else {
17651779
None
@@ -1770,6 +1784,7 @@ mod traits {
17701784
if self.start <= self.end &&
17711785
slice.is_char_boundary(self.start) &&
17721786
slice.is_char_boundary(self.end) {
1787+
// SAFETY: just checked that start and end are on a char boundary
17731788
Some(unsafe { self.get_unchecked_mut(slice) })
17741789
} else {
17751790
None
@@ -1799,6 +1814,7 @@ mod traits {
17991814
if self.start <= self.end &&
18001815
slice.is_char_boundary(self.start) &&
18011816
slice.is_char_boundary(self.end) {
1817+
// SAFETY: just checked that start and end are on a char boundary
18021818
unsafe { self.get_unchecked_mut(slice) }
18031819
} else {
18041820
super::slice_error_fail(slice, self.start, self.end)
@@ -1827,6 +1843,7 @@ mod traits {
18271843
#[inline]
18281844
fn get(self, slice: &str) -> Option<&Self::Output> {
18291845
if slice.is_char_boundary(self.end) {
1846+
// SAFETY: just checked that end is on a char boundary
18301847
Some(unsafe { self.get_unchecked(slice) })
18311848
} else {
18321849
None
@@ -1835,6 +1852,7 @@ mod traits {
18351852
#[inline]
18361853
fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> {
18371854
if slice.is_char_boundary(self.end) {
1855+
// SAFETY: just checked that end is on a char boundary
18381856
Some(unsafe { self.get_unchecked_mut(slice) })
18391857
} else {
18401858
None
@@ -1857,8 +1875,8 @@ mod traits {
18571875
}
18581876
#[inline]
18591877
fn index_mut(self, slice: &mut str) -> &mut Self::Output {
1860-
// is_char_boundary checks that the index is in [0, .len()]
18611878
if slice.is_char_boundary(self.end) {
1879+
// SAFETY: just checked that end is on a char boundary
18621880
unsafe { self.get_unchecked_mut(slice) }
18631881
} else {
18641882
super::slice_error_fail(slice, 0, self.end)
@@ -1888,6 +1906,7 @@ mod traits {
18881906
#[inline]
18891907
fn get(self, slice: &str) -> Option<&Self::Output> {
18901908
if slice.is_char_boundary(self.start) {
1909+
// SAFETY: just checked that start is on a char boundary
18911910
Some(unsafe { self.get_unchecked(slice) })
18921911
} else {
18931912
None
@@ -1896,6 +1915,7 @@ mod traits {
18961915
#[inline]
18971916
fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> {
18981917
if slice.is_char_boundary(self.start) {
1918+
// SAFETY: just checked that start is on a char boundary
18991919
Some(unsafe { self.get_unchecked_mut(slice) })
19001920
} else {
19011921
None
@@ -1920,8 +1940,8 @@ mod traits {
19201940
}
19211941
#[inline]
19221942
fn index_mut(self, slice: &mut str) -> &mut Self::Output {
1923-
// is_char_boundary checks that the index is in [0, .len()]
19241943
if slice.is_char_boundary(self.start) {
1944+
// SAFETY: just checked that start is on a char boundary
19251945
unsafe { self.get_unchecked_mut(slice) }
19261946
} else {
19271947
super::slice_error_fail(slice, self.start, slice.len())
@@ -2167,7 +2187,6 @@ impl str {
21672187
/// ```
21682188
#[stable(feature = "rust1", since = "1.0.0")]
21692189
#[inline(always)]
2170-
// SAFETY: const sound because we transmute two types with the same layout
21712190
#[allow(unused_attributes)]
21722191
#[allow_internal_unstable(const_fn_union)]
21732192
pub const fn as_bytes(&self) -> &[u8] {
@@ -2176,6 +2195,7 @@ impl str {
21762195
str: &'a str,
21772196
slice: &'a [u8],
21782197
}
2198+
// SAFETY: const sound because we transmute two types with the same layout
21792199
unsafe { Slices { str: self }.slice }
21802200
}
21812201

@@ -2501,6 +2521,7 @@ impl str {
25012521
pub fn split_at(&self, mid: usize) -> (&str, &str) {
25022522
// is_char_boundary checks that the index is in [0, .len()]
25032523
if self.is_char_boundary(mid) {
2524+
// SAFETY: just checked that mid is on a char boundary
25042525
unsafe {
25052526
(self.get_unchecked(0..mid),
25062527
self.get_unchecked(mid..self.len()))
@@ -2548,6 +2569,7 @@ impl str {
25482569
if self.is_char_boundary(mid) {
25492570
let len = self.len();
25502571
let ptr = self.as_mut_ptr();
2572+
// SAFETY: just checked that mid is on a char boundary
25512573
unsafe {
25522574
(from_utf8_unchecked_mut(slice::from_raw_parts_mut(ptr, mid)),
25532575
from_utf8_unchecked_mut(slice::from_raw_parts_mut(
@@ -3746,8 +3768,8 @@ impl str {
37463768
if let Some((_, b)) = matcher.next_reject_back() {
37473769
j = b;
37483770
}
3771+
// SAFETY: Searcher is known to return valid indices
37493772
unsafe {
3750-
// Searcher is known to return valid indices
37513773
self.get_unchecked(i..j)
37523774
}
37533775
}
@@ -3785,8 +3807,8 @@ impl str {
37853807
if let Some((a, _)) = matcher.next_reject() {
37863808
i = a;
37873809
}
3810+
// SAFETY: Searcher is known to return valid indices
37883811
unsafe {
3789-
// Searcher is known to return valid indices
37903812
self.get_unchecked(i..self.len())
37913813
}
37923814
}
@@ -3833,8 +3855,8 @@ impl str {
38333855
if let Some((_, b)) = matcher.next_reject_back() {
38343856
j = b;
38353857
}
3858+
// SAFETY: Searcher is known to return valid indices
38363859
unsafe {
3837-
// Searcher is known to return valid indices
38383860
self.get_unchecked(0..j)
38393861
}
38403862
}
@@ -4029,6 +4051,7 @@ impl str {
40294051
/// ```
40304052
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
40314053
pub fn make_ascii_uppercase(&mut self) {
4054+
// SAFETY: str and [u8] have the same layout, and changing only ASCII letters keeps the bytes valid utf8
40324055
let me = unsafe { self.as_bytes_mut() };
40334056
me.make_ascii_uppercase()
40344057
}
@@ -4054,6 +4077,7 @@ impl str {
40544077
/// ```
40554078
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
40564079
pub fn make_ascii_lowercase(&mut self) {
4080+
// SAFETY: str and [u8] have the same layout, and changing only ASCII letters keeps the bytes valid utf8
40574081
let me = unsafe { self.as_bytes_mut() };
40584082
me.make_ascii_lowercase()
40594083
}
@@ -4216,6 +4240,7 @@ impl Default for &str {
42164240
#[stable(feature = "default_mut_str", since = "1.28.0")]
42174241
impl Default for &mut str {
42184242
/// Creates an empty mutable str
4243+
// SAFETY: the empty byte slice is trivially valid utf8
42194244
fn default() -> Self { unsafe { from_utf8_unchecked_mut(&mut []) } }
42204245
}
42214246

@@ -4270,6 +4295,7 @@ impl_fn_for_zst! {
42704295

42714296
#[derive(Clone)]
42724297
struct UnsafeBytesToStr impl<'a> Fn = |bytes: &'a [u8]| -> &'a str {
4298+
// SAFETY: not safe in general; the caller must guarantee that `bytes` is valid utf8
42734299
unsafe { from_utf8_unchecked(bytes) }
42744300
};
42754301
}

src/libcore/str/pattern.rs

+16-2
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
//! For more details, see the traits [`Pattern`], [`Searcher`],
44
//! [`ReverseSearcher`], and [`DoubleEndedSearcher`].
55
6-
// ignore-tidy-undocumented-unsafe
7-
86
#![unstable(feature = "pattern",
97
reason = "API not fully fleshed out and ready to be stabilized",
108
issue = "27721")]
@@ -276,6 +274,13 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
276274
#[inline]
277275
fn next(&mut self) -> SearchStep {
278276
let old_finger = self.finger;
277+
// 1. self.finger and self.finger_back are kept on unicode boundaries (this is invariant)
278+
// 2. self.finger >= 0 since it starts at 0 and only increases
279+
// 3. self.finger < self.finger_back because otherwise the char iter would return
280+
// SearchStep::Done
281+
// 4. self.finger comes before the end of the haystack because self.finger_back starts at
282+
// the end and only decreases
283+
// SAFETY: 1-4 guarantee safety of get_unchecked
279284
let slice = unsafe { self.haystack.get_unchecked(old_finger..self.finger_back) };
280285
let mut iter = slice.chars();
281286
let old_len = iter.iter.len();
@@ -303,6 +308,7 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
303308
return None;
304309
};
305310
// the last byte of the utf8 encoded needle
311+
// SAFETY: we have an invariant that utf8_size < 5
306312
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
307313
if let Some(index) = memchr::memchr(last_byte, bytes) {
308314
// The new finger is the index of the byte we found,
@@ -346,6 +352,13 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
346352
#[inline]
347353
fn next_back(&mut self) -> SearchStep {
348354
let old_finger = self.finger_back;
355+
// 1. self.finger and self.finger_back are kept on unicode boundaries (this is invariant)
356+
// 2. self.finger >= 0 since it starts at 0 and only increases
357+
// 3. self.finger < self.finger_back because otherwise the char iter would return
358+
// SearchStep::Done
359+
// 4. self.finger comes before the end of the haystack because self.finger_back starts at
360+
// the end and only decreases
361+
// SAFETY: 1-4 guarantee safety of get_unchecked
349362
let slice = unsafe { self.haystack.get_unchecked(self.finger..old_finger) };
350363
let mut iter = slice.chars();
351364
let old_len = iter.iter.len();
@@ -373,6 +386,7 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
373386
return None;
374387
};
375388
// the last byte of the utf8 encoded needle
389+
// SAFETY: we have an invariant that utf8_size < 5
376390
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
377391
if let Some(index) = memchr::memrchr(last_byte, bytes) {
378392
// we searched a slice that was offset by self.finger,

0 commit comments

Comments
 (0)