core/slice/ascii.rs

//! Operations on ASCII `[u8]`.

use core::ascii::EscapeDefault;

use crate::fmt::{self, Write};
#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
use crate::intrinsics::const_eval_select;
use crate::{ascii, iter, ops};

impl [u8] {
    /// Checks if all bytes in this slice are within the ASCII range.
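    ///
    /// # Examples
    ///
    /// A minimal illustration of the documented behavior:
    ///
    /// ```
    /// assert!(b"hello!\n".is_ascii());
    /// assert!(!b"caf\xc3\xa9".is_ascii());
    /// ```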
    #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
    #[rustc_const_stable(feature = "const_slice_is_ascii", since = "1.74.0")]
    #[must_use]
    #[inline]
    pub const fn is_ascii(&self) -> bool {
        is_ascii(self)
    }

    /// If this slice [`is_ascii`](Self::is_ascii), returns it as a slice of
    /// [ASCII characters](`ascii::Char`), otherwise returns `None`.
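    ///
    /// # Examples
    ///
    /// A small sketch of the intended usage (this method is unstable, so the
    /// example assumes the `ascii_char` feature is enabled):
    ///
    /// ```
    /// #![feature(ascii_char)]
    ///
    /// assert!(b"hello".as_ascii().is_some());
    /// assert!(b"caf\xc3\xa9".as_ascii().is_none());
    /// ```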
    #[unstable(feature = "ascii_char", issue = "110998")]
    #[must_use]
    #[inline]
    pub const fn as_ascii(&self) -> Option<&[ascii::Char]> {
        if self.is_ascii() {
            // SAFETY: Just checked that it's ASCII
            Some(unsafe { self.as_ascii_unchecked() })
        } else {
            None
        }
    }

    /// Converts this slice of bytes into a slice of ASCII characters,
    /// without checking whether they're valid.
    ///
    /// # Safety
    ///
    /// Every byte in the slice must be in `0..=127`, or else this is UB.
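    ///
    /// # Examples
    ///
    /// A sketch of how a caller might uphold the safety contract (unstable, so
    /// the example assumes the `ascii_char` feature is enabled):
    ///
    /// ```
    /// #![feature(ascii_char)]
    ///
    /// let bytes = b"hi";
    /// // SAFETY: every byte of `bytes` is in `0..=127`.
    /// let chars = unsafe { bytes.as_ascii_unchecked() };
    /// assert_eq!(chars.len(), 2);
    /// ```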
    #[unstable(feature = "ascii_char", issue = "110998")]
    #[must_use]
    #[inline]
    pub const unsafe fn as_ascii_unchecked(&self) -> &[ascii::Char] {
        let byte_ptr: *const [u8] = self;
        let ascii_ptr = byte_ptr as *const [ascii::Char];
        // SAFETY: The caller promised all the bytes are ASCII
        unsafe { &*ascii_ptr }
    }

    /// Checks that two slices are an ASCII case-insensitive match.
    ///
    /// Same as `to_ascii_lowercase(a) == to_ascii_lowercase(b)`,
    /// but without allocating and copying temporaries.
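    ///
    /// # Examples
    ///
    /// A minimal illustration of the documented behavior:
    ///
    /// ```
    /// assert!(b"Ferris".eq_ignore_ascii_case(b"FERRIS"));
    /// assert!(!b"Ferris".eq_ignore_ascii_case(b"Ferrous"));
    /// ```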
    #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
    #[rustc_const_unstable(feature = "const_eq_ignore_ascii_case", issue = "131719")]
    #[must_use]
    #[inline]
    pub const fn eq_ignore_ascii_case(&self, other: &[u8]) -> bool {
        if self.len() != other.len() {
            return false;
        }

        // FIXME(const-hack): This implementation can be reverted when
        // `core::iter::zip` is allowed in const. The original implementation:
        //  self.len() == other.len() && iter::zip(self, other).all(|(a, b)| a.eq_ignore_ascii_case(b))
        let mut a = self;
        let mut b = other;

        while let ([first_a, rest_a @ ..], [first_b, rest_b @ ..]) = (a, b) {
            if first_a.eq_ignore_ascii_case(&first_b) {
                a = rest_a;
                b = rest_b;
            } else {
                return false;
            }
        }

        true
    }

    /// Converts this slice to its ASCII upper case equivalent in-place.
    ///
    /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
    /// but non-ASCII letters are unchanged.
    ///
    /// To return a new uppercased value without modifying the existing one, use
    /// [`to_ascii_uppercase`].
    ///
    /// [`to_ascii_uppercase`]: #method.to_ascii_uppercase
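    ///
    /// # Examples
    ///
    /// A minimal illustration of the in-place conversion:
    ///
    /// ```
    /// let mut bytes = *b"hello, world";
    /// bytes.make_ascii_uppercase();
    /// assert_eq!(&bytes, b"HELLO, WORLD");
    /// ```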
    #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
    #[rustc_const_stable(feature = "const_make_ascii", since = "1.84.0")]
    #[inline]
    pub const fn make_ascii_uppercase(&mut self) {
        // FIXME(const-hack): We would like to simply iterate using `for` loops but this isn't currently allowed in constant expressions.
        let mut i = 0;
        while i < self.len() {
            let byte = &mut self[i];
            byte.make_ascii_uppercase();
            i += 1;
        }
    }

    /// Converts this slice to its ASCII lower case equivalent in-place.
    ///
    /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
    /// but non-ASCII letters are unchanged.
    ///
    /// To return a new lowercased value without modifying the existing one, use
    /// [`to_ascii_lowercase`].
    ///
    /// [`to_ascii_lowercase`]: #method.to_ascii_lowercase
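    ///
    /// # Examples
    ///
    /// A minimal illustration of the in-place conversion:
    ///
    /// ```
    /// let mut bytes = *b"HELLO, WORLD";
    /// bytes.make_ascii_lowercase();
    /// assert_eq!(&bytes, b"hello, world");
    /// ```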
    #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
    #[rustc_const_stable(feature = "const_make_ascii", since = "1.84.0")]
    #[inline]
    pub const fn make_ascii_lowercase(&mut self) {
        // FIXME(const-hack): We would like to simply iterate using `for` loops but this isn't currently allowed in constant expressions.
        let mut i = 0;
        while i < self.len() {
            let byte = &mut self[i];
            byte.make_ascii_lowercase();
            i += 1;
        }
    }

    /// Returns an iterator that produces an escaped version of this slice,
    /// treating it as an ASCII string.
    ///
    /// # Examples
    ///
    /// ```
    ///
    /// let s = b"0\t\r\n'\"\\\x9d";
    /// let escaped = s.escape_ascii().to_string();
    /// assert_eq!(escaped, "0\\t\\r\\n\\'\\\"\\\\\\x9d");
    /// ```
    #[must_use = "this returns the escaped bytes as an iterator, \
                  without modifying the original"]
    #[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
    pub fn escape_ascii(&self) -> EscapeAscii<'_> {
        EscapeAscii { inner: self.iter().flat_map(EscapeByte) }
    }

    /// Returns a byte slice with leading ASCII whitespace bytes removed.
    ///
    /// 'Whitespace' refers to the definition used by
    /// [`u8::is_ascii_whitespace`].
    ///
    /// # Examples
    ///
    /// ```
    /// assert_eq!(b" \t hello world\n".trim_ascii_start(), b"hello world\n");
    /// assert_eq!(b"  ".trim_ascii_start(), b"");
    /// assert_eq!(b"".trim_ascii_start(), b"");
    /// ```
    #[stable(feature = "byte_slice_trim_ascii", since = "1.80.0")]
    #[rustc_const_stable(feature = "byte_slice_trim_ascii", since = "1.80.0")]
    #[inline]
    pub const fn trim_ascii_start(&self) -> &[u8] {
        let mut bytes = self;
        // Note: A pattern matching based approach (instead of indexing) allows
        // making the function const.
        while let [first, rest @ ..] = bytes {
            if first.is_ascii_whitespace() {
                bytes = rest;
            } else {
                break;
            }
        }
        bytes
    }

    /// Returns a byte slice with trailing ASCII whitespace bytes removed.
    ///
    /// 'Whitespace' refers to the definition used by
    /// [`u8::is_ascii_whitespace`].
    ///
    /// # Examples
    ///
    /// ```
    /// assert_eq!(b"\r hello world\n ".trim_ascii_end(), b"\r hello world");
    /// assert_eq!(b"  ".trim_ascii_end(), b"");
    /// assert_eq!(b"".trim_ascii_end(), b"");
    /// ```
    #[stable(feature = "byte_slice_trim_ascii", since = "1.80.0")]
    #[rustc_const_stable(feature = "byte_slice_trim_ascii", since = "1.80.0")]
    #[inline]
    pub const fn trim_ascii_end(&self) -> &[u8] {
        let mut bytes = self;
        // Note: A pattern matching based approach (instead of indexing) allows
        // making the function const.
        while let [rest @ .., last] = bytes {
            if last.is_ascii_whitespace() {
                bytes = rest;
            } else {
                break;
            }
        }
        bytes
    }

    /// Returns a byte slice with leading and trailing ASCII whitespace bytes
    /// removed.
    ///
    /// 'Whitespace' refers to the definition used by
    /// [`u8::is_ascii_whitespace`].
    ///
    /// # Examples
    ///
    /// ```
    /// assert_eq!(b"\r hello world\n ".trim_ascii(), b"hello world");
    /// assert_eq!(b"  ".trim_ascii(), b"");
    /// assert_eq!(b"".trim_ascii(), b"");
    /// ```
    #[stable(feature = "byte_slice_trim_ascii", since = "1.80.0")]
    #[rustc_const_stable(feature = "byte_slice_trim_ascii", since = "1.80.0")]
    #[inline]
    pub const fn trim_ascii(&self) -> &[u8] {
        self.trim_ascii_start().trim_ascii_end()
    }
}

impl_fn_for_zst! {
    #[derive(Clone)]
    struct EscapeByte impl Fn = |byte: &u8| -> ascii::EscapeDefault {
        ascii::escape_default(*byte)
    };
}

/// An iterator over the escaped version of a byte slice.
///
/// This `struct` is created by the [`slice::escape_ascii`] method. See its
/// documentation for more information.
#[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
#[derive(Clone)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct EscapeAscii<'a> {
    inner: iter::FlatMap<super::Iter<'a, u8>, ascii::EscapeDefault, EscapeByte>,
}

#[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
impl<'a> iter::Iterator for EscapeAscii<'a> {
    type Item = u8;
    #[inline]
    fn next(&mut self) -> Option<u8> {
        self.inner.next()
    }
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.inner.size_hint()
    }
    #[inline]
    fn try_fold<Acc, Fold, R>(&mut self, init: Acc, fold: Fold) -> R
    where
        Fold: FnMut(Acc, Self::Item) -> R,
        R: ops::Try<Output = Acc>,
    {
        self.inner.try_fold(init, fold)
    }
    #[inline]
    fn fold<Acc, Fold>(self, init: Acc, fold: Fold) -> Acc
    where
        Fold: FnMut(Acc, Self::Item) -> Acc,
    {
        self.inner.fold(init, fold)
    }
    #[inline]
    fn last(mut self) -> Option<u8> {
        self.next_back()
    }
}

#[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
impl<'a> iter::DoubleEndedIterator for EscapeAscii<'a> {
    fn next_back(&mut self) -> Option<u8> {
        self.inner.next_back()
    }
}
#[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
impl<'a> iter::FusedIterator for EscapeAscii<'a> {}
#[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
impl<'a> fmt::Display for EscapeAscii<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // disassemble iterator, including front/back parts of flatmap in case it has been partially consumed
        let (front, slice, back) = self.clone().inner.into_parts();
        let front = front.unwrap_or(EscapeDefault::empty());
        let mut bytes = slice.unwrap_or_default().as_slice();
        let back = back.unwrap_or(EscapeDefault::empty());

        // usually empty, so the formatter won't have to do any work
        for byte in front {
            f.write_char(byte as char)?;
        }

        fn needs_escape(b: u8) -> bool {
            b > 0x7E || b < 0x20 || b == b'\\' || b == b'\'' || b == b'"'
        }

        while bytes.len() > 0 {
            // fast path for the printable, non-escaped subset of ascii
            let prefix = bytes.iter().take_while(|&&b| !needs_escape(b)).count();
            // SAFETY: prefix length was derived by counting bytes in the same slice, so it's in-bounds
            let (prefix, remainder) = unsafe { bytes.split_at_unchecked(prefix) };
            // SAFETY: prefix is a valid utf8 sequence, as it's a subset of ASCII
            let prefix = unsafe { crate::str::from_utf8_unchecked(prefix) };

            f.write_str(prefix)?; // the fast part

            bytes = remainder;

            if let Some(&b) = bytes.first() {
                // guaranteed to be non-empty, better to write it as a str
                f.write_str(ascii::escape_default(b).as_str())?;
                bytes = &bytes[1..];
            }
        }

        // also usually empty
        for byte in back {
            f.write_char(byte as char)?;
        }
        Ok(())
    }
}
#[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
impl<'a> fmt::Debug for EscapeAscii<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("EscapeAscii").finish_non_exhaustive()
    }
}

/// ASCII test *without* the chunk-at-a-time optimizations.
///
/// This is carefully structured to produce nice small code -- it's smaller in
/// `-O` than what the "obvious" way produces under `-C opt-level=s`.  If you
/// touch it, be sure to run (and update if needed) the assembly test.
#[unstable(feature = "str_internals", issue = "none")]
#[doc(hidden)]
#[inline]
pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool {
    while let [rest @ .., last] = bytes {
        if !last.is_ascii() {
            break;
        }
        bytes = rest;
    }
    bytes.is_empty()
}

/// Optimized ASCII test that will use usize-at-a-time operations instead of
/// byte-at-a-time operations (when possible).
///
/// The algorithm we use here is pretty simple. If `s` is too short, we just
/// check each byte and be done with it. Otherwise:
///
/// - Read the first word with an unaligned load.
/// - Align the pointer, read subsequent words until end with aligned loads.
/// - Read the last `usize` from `s` with an unaligned load.
///
/// If any of these loads produces something for which `contains_nonascii`
/// (defined below) returns true, then we know the answer is false.
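///
/// As an illustration (assuming a 64-bit target, so `USIZE_SIZE == 8`): for a
/// 13-byte slice whose first aligned word starts 5 bytes in, we read bytes
/// `0..8` unaligned, skip the aligned middle loop (there is no full word
/// strictly before the last word position), and finish with an unaligned read
/// of bytes `5..13`. The overlap is harmless because we only inspect bytes.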
#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
#[inline]
#[rustc_allow_const_fn_unstable(const_eval_select)] // fallback impl has same behavior
const fn is_ascii(s: &[u8]) -> bool {
    // The runtime version behaves the same as the compiletime version; it's
    // just more optimized.
    const_eval_select!(
        @capture { s: &[u8] } -> bool:
        if const {
            is_ascii_simple(s)
        } else {
            /// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
            /// from `../str/mod.rs`, which does something similar for utf8 validation.
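            ///
            /// As an illustration: on a 64-bit target the mask is
            /// `0x8080_8080_8080_8080`, so a word holding only ASCII bytes
            /// ANDs with it to zero, while any byte `>= 0x80` leaves its high
            /// bit set in the result.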
            const fn contains_nonascii(v: usize) -> bool {
                const NONASCII_MASK: usize = usize::repeat_u8(0x80);
                (NONASCII_MASK & v) != 0
            }

            const USIZE_SIZE: usize = size_of::<usize>();

            let len = s.len();
            let align_offset = s.as_ptr().align_offset(USIZE_SIZE);

            // If we wouldn't gain anything from the word-at-a-time implementation, fall
            // back to a scalar loop.
            //
            // We also do this for architectures where `size_of::<usize>()` isn't
            // sufficient alignment for `usize`, because it's a weird edge case.
            if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < align_of::<usize>() {
                return is_ascii_simple(s);
            }

            // We always read the first word unaligned, which means if `align_offset` is
            // 0, we'd read the same value again for the aligned read.
            let offset_to_aligned = if align_offset == 0 { USIZE_SIZE } else { align_offset };

            let start = s.as_ptr();
            // SAFETY: We verified `len >= USIZE_SIZE` above, so this read is in-bounds.
            let first_word = unsafe { (start as *const usize).read_unaligned() };

            if contains_nonascii(first_word) {
                return false;
            }
            // We checked this above, somewhat implicitly. Note that `offset_to_aligned`
            // is either `align_offset` or `USIZE_SIZE`, both of which are explicitly checked
            // above.
            debug_assert!(offset_to_aligned <= len);

            // SAFETY: word_ptr is the (properly aligned) usize ptr we use to read the
            // middle chunk of the slice.
            let mut word_ptr = unsafe { start.add(offset_to_aligned) as *const usize };

            // `byte_pos` is the byte index of `word_ptr`, used for loop end checks.
            let mut byte_pos = offset_to_aligned;

            // Paranoia check about alignment, since we're about to do a bunch of
            // word-sized aligned loads. In practice this should be impossible barring
            // a bug in `align_offset` though.
            // While this method is allowed to spuriously fail in CTFE, if it doesn't
            // have alignment information it should have given a `usize::MAX` for
            // `align_offset` earlier, sending things through the scalar path instead of
            // this one, so this check should pass if it's reachable.
            debug_assert!(word_ptr.is_aligned_to(align_of::<usize>()));

            // Read subsequent words until the last aligned word. The last aligned word
            // is handled separately in the tail check below, so the tail is always at
            // most one `usize` and we avoid an extra `byte_pos == len` branch.
            while byte_pos < len - USIZE_SIZE {
                // Sanity check that the read is in bounds
                debug_assert!(byte_pos + USIZE_SIZE <= len);
                // And that our assumptions about `byte_pos` hold.
                debug_assert!(word_ptr.cast::<u8>() == start.wrapping_add(byte_pos));

                // SAFETY: We know `word_ptr` is properly aligned (because of
                // `align_offset`), and we know that we have enough bytes between `word_ptr` and the end
                let word = unsafe { word_ptr.read() };
                if contains_nonascii(word) {
                    return false;
                }

                byte_pos += USIZE_SIZE;
                // SAFETY: We know that `byte_pos <= len - USIZE_SIZE`, which means that
                // after this `add`, `word_ptr` will be at most one-past-the-end.
                word_ptr = unsafe { word_ptr.add(1) };
            }

            // Sanity check to ensure there really is only one `usize` left. This should
            // be guaranteed by our loop condition.
            debug_assert!(byte_pos <= len && len - byte_pos <= USIZE_SIZE);

            // SAFETY: This relies on `len >= USIZE_SIZE`, which we check at the start.
            let last_word = unsafe { (start.add(len - USIZE_SIZE) as *const usize).read_unaligned() };

            !contains_nonascii(last_word)
        }
    )
}

/// ASCII test optimized to use the `pmovmskb` instruction available on `x86-64`
/// platforms.
///
/// Other platforms are not likely to benefit from this code structure, so they
/// use SWAR techniques to test for ASCII in `usize`-sized chunks.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
#[inline]
const fn is_ascii(bytes: &[u8]) -> bool {
    // Process chunks of 32 bytes at a time in the fast path to enable
    // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
    // can be OR'd together and then the resulting vector can be tested for
    // non-ASCII bytes.
    const CHUNK_SIZE: usize = 32;

    let mut i = 0;

    while i + CHUNK_SIZE <= bytes.len() {
        let chunk_end = i + CHUNK_SIZE;

        // Get LLVM to produce a `pmovmskb` instruction on x86-64 which
        // creates a mask from the most significant bit of each byte.
        // ASCII bytes are less than 128 (0x80), so their most significant
        // bit is unset.
        let mut count = 0;
        while i < chunk_end {
            count += bytes[i].is_ascii() as u8;
            i += 1;
        }

        // All bytes should be <= 127 so count is equal to chunk size.
        if count != CHUNK_SIZE as u8 {
            return false;
        }
    }

    // Process the remaining `bytes.len() % CHUNK_SIZE` bytes.
    let mut is_ascii = true;
    while i < bytes.len() {
        is_ascii &= bytes[i].is_ascii();
        i += 1;
    }

    is_ascii
}