//! Operations on ASCII `[u8]`.

use core::ascii::EscapeDefault;

use crate::fmt::{self, Write};
#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
use crate::intrinsics::const_eval_select;
use crate::{ascii, iter, ops};

impl [u8] {
    /// Checks if all bytes in this slice are within the ASCII range.
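    ///
    /// # Examples
    ///
    /// A minimal sketch of the expected behavior (the byte strings are
    /// illustrative):
    ///
    /// ```
    /// assert!(b"hello!\n".is_ascii());
    /// assert!(!"Grüße".as_bytes().is_ascii());
    /// ```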
    #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
    #[rustc_const_stable(feature = "const_slice_is_ascii", since = "1.74.0")]
    #[must_use]
    #[inline]
    pub const fn is_ascii(&self) -> bool {
        is_ascii(self)
    }

    /// If this slice [`is_ascii`](Self::is_ascii), returns it as a slice of
    /// [ASCII characters](`ascii::Char`), otherwise returns `None`.
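    ///
    /// # Examples
    ///
    /// A brief sketch, assuming the unstable `ascii_char` feature:
    ///
    /// ```
    /// #![feature(ascii_char)]
    ///
    /// assert!(b"plain text".as_ascii().is_some());
    /// assert!(b"\x80 not ascii".as_ascii().is_none());
    /// ```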
    #[unstable(feature = "ascii_char", issue = "110998")]
    #[must_use]
    #[inline]
    pub const fn as_ascii(&self) -> Option<&[ascii::Char]> {
        if self.is_ascii() {
            // SAFETY: Just checked that it's ASCII
            Some(unsafe { self.as_ascii_unchecked() })
        } else {
            None
        }
    }

    /// Converts this slice of bytes into a slice of ASCII characters,
    /// without checking whether they're valid.
    ///
    /// # Safety
    ///
    /// Every byte in the slice must be in `0..=127`, or else this is UB.
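    ///
    /// # Examples
    ///
    /// A hedged usage sketch (the byte string is illustrative; the unstable
    /// `ascii_char` feature is assumed):
    ///
    /// ```
    /// #![feature(ascii_char)]
    ///
    /// let bytes = b"ascii";
    /// // SAFETY: every byte above is in `0..=127`.
    /// let chars = unsafe { bytes.as_ascii_unchecked() };
    /// assert_eq!(chars.len(), 5);
    /// ```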
    #[unstable(feature = "ascii_char", issue = "110998")]
    #[must_use]
    #[inline]
    pub const unsafe fn as_ascii_unchecked(&self) -> &[ascii::Char] {
        let byte_ptr: *const [u8] = self;
        let ascii_ptr = byte_ptr as *const [ascii::Char];
        // SAFETY: The caller promised all the bytes are ASCII
        unsafe { &*ascii_ptr }
    }

    /// Checks that two slices are an ASCII case-insensitive match.
    ///
    /// Same as `to_ascii_lowercase(a) == to_ascii_lowercase(b)`,
    /// but without allocating and copying temporaries.
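    ///
    /// # Examples
    ///
    /// A minimal sketch of the expected behavior:
    ///
    /// ```
    /// assert!(b"Ferris".eq_ignore_ascii_case(b"FERRIS"));
    /// assert!(!b"Ferris".eq_ignore_ascii_case(b"FERRI"));
    /// ```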
    #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
    #[rustc_const_stable(feature = "const_eq_ignore_ascii_case", since = "1.89.0")]
    #[must_use]
    #[inline]
    pub const fn eq_ignore_ascii_case(&self, other: &[u8]) -> bool {
        if self.len() != other.len() {
            return false;
        }

        // FIXME(const-hack): This implementation can be reverted when
        // `core::iter::zip` is allowed in const. The original implementation:
        //  self.len() == other.len() && iter::zip(self, other).all(|(a, b)| a.eq_ignore_ascii_case(b))
        let mut a = self;
        let mut b = other;

        while let ([first_a, rest_a @ ..], [first_b, rest_b @ ..]) = (a, b) {
            if first_a.eq_ignore_ascii_case(&first_b) {
                a = rest_a;
                b = rest_b;
            } else {
                return false;
            }
        }

        true
    }

    /// Converts this slice to its ASCII upper case equivalent in-place.
    ///
    /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
    /// but non-ASCII letters are unchanged.
    ///
    /// To return a new uppercased value without modifying the existing one, use
    /// [`to_ascii_uppercase`].
    ///
    /// [`to_ascii_uppercase`]: #method.to_ascii_uppercase
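    ///
    /// # Examples
    ///
    /// A minimal sketch of the in-place conversion (the input bytes are
    /// illustrative):
    ///
    /// ```
    /// let mut bytes = *b"hello, World!";
    /// bytes.make_ascii_uppercase();
    /// assert_eq!(bytes, *b"HELLO, WORLD!");
    /// ```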
    #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
    #[rustc_const_stable(feature = "const_make_ascii", since = "1.84.0")]
    #[inline]
    pub const fn make_ascii_uppercase(&mut self) {
        // FIXME(const-hack): We would like to simply iterate using `for` loops but this isn't currently allowed in constant expressions.
        let mut i = 0;
        while i < self.len() {
            let byte = &mut self[i];
            byte.make_ascii_uppercase();
            i += 1;
        }
    }

    /// Converts this slice to its ASCII lower case equivalent in-place.
    ///
    /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
    /// but non-ASCII letters are unchanged.
    ///
    /// To return a new lowercased value without modifying the existing one, use
    /// [`to_ascii_lowercase`].
    ///
    /// [`to_ascii_lowercase`]: #method.to_ascii_lowercase
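    ///
    /// # Examples
    ///
    /// A minimal sketch of the in-place conversion (the input bytes are
    /// illustrative):
    ///
    /// ```
    /// let mut bytes = *b"Hello, WORLD!";
    /// bytes.make_ascii_lowercase();
    /// assert_eq!(bytes, *b"hello, world!");
    /// ```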
    #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
    #[rustc_const_stable(feature = "const_make_ascii", since = "1.84.0")]
    #[inline]
    pub const fn make_ascii_lowercase(&mut self) {
        // FIXME(const-hack): We would like to simply iterate using `for` loops but this isn't currently allowed in constant expressions.
        let mut i = 0;
        while i < self.len() {
            let byte = &mut self[i];
            byte.make_ascii_lowercase();
            i += 1;
        }
    }

    /// Returns an iterator that produces an escaped version of this slice,
    /// treating it as an ASCII string.
    ///
    /// # Examples
    ///
    /// ```
    /// let s = b"0\t\r\n'\"\\\x9d";
    /// let escaped = s.escape_ascii().to_string();
    /// assert_eq!(escaped, "0\\t\\r\\n\\'\\\"\\\\\\x9d");
    /// ```
    #[must_use = "this returns the escaped bytes as an iterator, \
                  without modifying the original"]
    #[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
    pub fn escape_ascii(&self) -> EscapeAscii<'_> {
        EscapeAscii { inner: self.iter().flat_map(EscapeByte) }
    }

    /// Returns a byte slice with leading ASCII whitespace bytes removed.
    ///
    /// 'Whitespace' refers to the definition used by
    /// [`u8::is_ascii_whitespace`].
    ///
    /// # Examples
    ///
    /// ```
    /// assert_eq!(b" \t hello world\n".trim_ascii_start(), b"hello world\n");
    /// assert_eq!(b"  ".trim_ascii_start(), b"");
    /// assert_eq!(b"".trim_ascii_start(), b"");
    /// ```
    #[stable(feature = "byte_slice_trim_ascii", since = "1.80.0")]
    #[rustc_const_stable(feature = "byte_slice_trim_ascii", since = "1.80.0")]
    #[inline]
    pub const fn trim_ascii_start(&self) -> &[u8] {
        let mut bytes = self;
        // Note: A pattern matching based approach (instead of indexing) allows
        // making the function const.
        while let [first, rest @ ..] = bytes {
            if first.is_ascii_whitespace() {
                bytes = rest;
            } else {
                break;
            }
        }
        bytes
    }

    /// Returns a byte slice with trailing ASCII whitespace bytes removed.
    ///
    /// 'Whitespace' refers to the definition used by
    /// [`u8::is_ascii_whitespace`].
    ///
    /// # Examples
    ///
    /// ```
    /// assert_eq!(b"\r hello world\n ".trim_ascii_end(), b"\r hello world");
    /// assert_eq!(b"  ".trim_ascii_end(), b"");
    /// assert_eq!(b"".trim_ascii_end(), b"");
    /// ```
    #[stable(feature = "byte_slice_trim_ascii", since = "1.80.0")]
    #[rustc_const_stable(feature = "byte_slice_trim_ascii", since = "1.80.0")]
    #[inline]
    pub const fn trim_ascii_end(&self) -> &[u8] {
        let mut bytes = self;
        // Note: A pattern matching based approach (instead of indexing) allows
        // making the function const.
        while let [rest @ .., last] = bytes {
            if last.is_ascii_whitespace() {
                bytes = rest;
            } else {
                break;
            }
        }
        bytes
    }

    /// Returns a byte slice with leading and trailing ASCII whitespace bytes
    /// removed.
    ///
    /// 'Whitespace' refers to the definition used by
    /// [`u8::is_ascii_whitespace`].
    ///
    /// # Examples
    ///
    /// ```
    /// assert_eq!(b"\r hello world\n ".trim_ascii(), b"hello world");
    /// assert_eq!(b"  ".trim_ascii(), b"");
    /// assert_eq!(b"".trim_ascii(), b"");
    /// ```
    #[stable(feature = "byte_slice_trim_ascii", since = "1.80.0")]
    #[rustc_const_stable(feature = "byte_slice_trim_ascii", since = "1.80.0")]
    #[inline]
    pub const fn trim_ascii(&self) -> &[u8] {
        self.trim_ascii_start().trim_ascii_end()
    }
}

impl_fn_for_zst! {
    #[derive(Clone)]
    struct EscapeByte impl Fn = |byte: &u8| -> ascii::EscapeDefault {
        ascii::escape_default(*byte)
    };
}

/// An iterator over the escaped version of a byte slice.
///
/// This `struct` is created by the [`slice::escape_ascii`] method. See its
/// documentation for more information.
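///
/// A brief usage sketch (the input bytes are illustrative):
///
/// ```
/// let escaped = b"\0ok".escape_ascii().to_string();
/// assert_eq!(escaped, "\\x00ok");
/// ```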
#[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
#[derive(Clone)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct EscapeAscii<'a> {
    inner: iter::FlatMap<super::Iter<'a, u8>, ascii::EscapeDefault, EscapeByte>,
}

#[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
impl<'a> iter::Iterator for EscapeAscii<'a> {
    type Item = u8;
    #[inline]
    fn next(&mut self) -> Option<u8> {
        self.inner.next()
    }
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.inner.size_hint()
    }
    #[inline]
    fn try_fold<Acc, Fold, R>(&mut self, init: Acc, fold: Fold) -> R
    where
        Fold: FnMut(Acc, Self::Item) -> R,
        R: ops::Try<Output = Acc>,
    {
        self.inner.try_fold(init, fold)
    }
    #[inline]
    fn fold<Acc, Fold>(self, init: Acc, fold: Fold) -> Acc
    where
        Fold: FnMut(Acc, Self::Item) -> Acc,
    {
        self.inner.fold(init, fold)
    }
    #[inline]
    fn last(mut self) -> Option<u8> {
        self.next_back()
    }
}

#[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
impl<'a> iter::DoubleEndedIterator for EscapeAscii<'a> {
    fn next_back(&mut self) -> Option<u8> {
        self.inner.next_back()
    }
}
#[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
impl<'a> iter::FusedIterator for EscapeAscii<'a> {}
#[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
impl<'a> fmt::Display for EscapeAscii<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // disassemble iterator, including front/back parts of flatmap in case it has been partially consumed
        let (front, slice, back) = self.clone().inner.into_parts();
        let front = front.unwrap_or(EscapeDefault::empty());
        let mut bytes = slice.unwrap_or_default().as_slice();
        let back = back.unwrap_or(EscapeDefault::empty());

        // usually empty, so the formatter won't have to do any work
        for byte in front {
            f.write_char(byte as char)?;
        }

        fn needs_escape(b: u8) -> bool {
            b > 0x7E || b < 0x20 || b == b'\\' || b == b'\'' || b == b'"'
        }

        while !bytes.is_empty() {
            // fast path for the printable, non-escaped subset of ascii
            let prefix = bytes.iter().take_while(|&&b| !needs_escape(b)).count();
            // SAFETY: prefix length was derived by counting bytes in the same slice, so it's in-bounds
            let (prefix, remainder) = unsafe { bytes.split_at_unchecked(prefix) };
            // SAFETY: prefix is a valid utf8 sequence, as it's a subset of ASCII
            let prefix = unsafe { crate::str::from_utf8_unchecked(prefix) };

            f.write_str(prefix)?; // the fast part

            bytes = remainder;

            if let Some(&b) = bytes.first() {
                // guaranteed to be non-empty, better to write it as a str
                fmt::Display::fmt(&ascii::escape_default(b), f)?;
                bytes = &bytes[1..];
            }
        }

        // also usually empty
        for byte in back {
            f.write_char(byte as char)?;
        }
        Ok(())
    }
}
#[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
impl<'a> fmt::Debug for EscapeAscii<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("EscapeAscii").finish_non_exhaustive()
    }
}

/// ASCII test *without* the chunk-at-a-time optimizations.
///
/// This is carefully structured to produce nice small code -- it's smaller in
/// `-O` than what the "obvious" way produces under `-C opt-level=s`.  If you
/// touch it, be sure to run (and update if needed) the assembly test.
#[unstable(feature = "str_internals", issue = "none")]
#[doc(hidden)]
#[inline]
pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool {
    while let [rest @ .., last] = bytes {
        if !last.is_ascii() {
            break;
        }
        bytes = rest;
    }
    bytes.is_empty()
}

/// Optimized ASCII test that will use usize-at-a-time operations instead of
/// byte-at-a-time operations (when possible).
///
/// The algorithm we use here is pretty simple. If `s` is too short, we just
/// check each byte and be done with it. Otherwise:
///
/// - Read the first word with an unaligned load.
/// - Align the pointer, read subsequent words until end with aligned loads.
/// - Read the last `usize` from `s` with an unaligned load.
///
/// If any of these loads produces something for which `contains_nonascii`
/// (below) returns true, then we know the answer is false.
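///
/// As an illustrative sketch of the word-at-a-time mask check (shown here with a
/// fixed-width `u64` standing in for `usize`; the byte strings are made up):
///
/// ```
/// // The high bit of each byte flags a non-ASCII byte.
/// const NONASCII_MASK: u64 = 0x8080_8080_8080_8080;
/// let ascii_word = u64::from_ne_bytes(*b"asciiok!");
/// let mixed_word = u64::from_ne_bytes(*b"ascii\xFFok");
/// assert_eq!(ascii_word & NONASCII_MASK, 0); // every byte is below 0x80
/// assert_ne!(mixed_word & NONASCII_MASK, 0); // the 0xFF byte is detected
/// ```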
#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
#[inline]
#[rustc_allow_const_fn_unstable(const_eval_select)] // fallback impl has same behavior
const fn is_ascii(s: &[u8]) -> bool {
    // The runtime version behaves the same as the compile-time version; it's
    // just more optimized.
    const_eval_select!(
        @capture { s: &[u8] } -> bool:
        if const {
            is_ascii_simple(s)
        } else {
            /// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
            /// from `../str/mod.rs`, which does something similar for utf8 validation.
            const fn contains_nonascii(v: usize) -> bool {
                const NONASCII_MASK: usize = usize::repeat_u8(0x80);
                (NONASCII_MASK & v) != 0
            }

            const USIZE_SIZE: usize = size_of::<usize>();

            let len = s.len();
            let align_offset = s.as_ptr().align_offset(USIZE_SIZE);

            // If we wouldn't gain anything from the word-at-a-time implementation, fall
            // back to a scalar loop.
            //
            // We also do this for architectures where `size_of::<usize>()` isn't
            // sufficient alignment for `usize`, because it's a weird edge case.
            if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < align_of::<usize>() {
                return is_ascii_simple(s);
            }

            // We always read the first word unaligned, so if `align_offset` is 0 the
            // aligned loop below would just read the same word again; skip ahead a full
            // word in that case.
            let offset_to_aligned = if align_offset == 0 { USIZE_SIZE } else { align_offset };

            let start = s.as_ptr();
            // SAFETY: We verified above that `len >= USIZE_SIZE`, so reading one `usize`
            // from the start of the slice is in-bounds.
            let first_word = unsafe { (start as *const usize).read_unaligned() };

            if contains_nonascii(first_word) {
                return false;
            }
            // We checked this above, somewhat implicitly. Note that `offset_to_aligned`
            // is either `align_offset` or `USIZE_SIZE`, both of which are explicitly
            // checked above.
            debug_assert!(offset_to_aligned <= len);

            // SAFETY: `offset_to_aligned <= len` (checked above), so this `add` stays in
            // bounds. `word_ptr` is the (properly aligned) usize ptr we use to read the
            // middle chunk of the slice.
            let mut word_ptr = unsafe { start.add(offset_to_aligned) as *const usize };

            // `byte_pos` is the byte index of `word_ptr`, used for loop end checks.
            let mut byte_pos = offset_to_aligned;

            // Paranoia check about alignment, since we're about to do a bunch of aligned
            // loads. In practice this should be impossible barring a bug in
            // `align_offset` though.
            // While this method is allowed to spuriously fail in CTFE, if it doesn't
            // have alignment information it should have given a `usize::MAX` for
            // `align_offset` earlier, sending things through the scalar path instead of
            // this one, so this check should pass if it's reachable.
            debug_assert!(word_ptr.is_aligned_to(align_of::<usize>()));

            // Read subsequent words until the last aligned word. The last aligned word is
            // excluded here and handled by the tail check below, so the tail is always at
            // most one `usize` and we avoid an extra `byte_pos == len` branch.
            while byte_pos < len - USIZE_SIZE {
                // Sanity check that the read is in bounds
                debug_assert!(byte_pos + USIZE_SIZE <= len);
                // And that our assumptions about `byte_pos` hold.
                debug_assert!(word_ptr.cast::<u8>() == start.wrapping_add(byte_pos));

                // SAFETY: We know `word_ptr` is properly aligned (because of
                // `align_offset`), and we know that we have enough bytes between
                // `word_ptr` and the end of the slice for an aligned `usize` read
                // (see the `debug_assert!`s above).
                let word = unsafe { word_ptr.read() };
                if contains_nonascii(word) {
                    return false;
                }

                byte_pos += USIZE_SIZE;
                // SAFETY: We know that `byte_pos <= len - USIZE_SIZE`, which means that
                // after this `add`, `word_ptr` will be at most one-past-the-end.
                word_ptr = unsafe { word_ptr.add(1) };
            }

            // Sanity check to ensure there really is only one `usize` left. This should
            // be guaranteed by our loop condition.
            debug_assert!(byte_pos <= len && len - byte_pos <= USIZE_SIZE);

            // SAFETY: This relies on `len >= USIZE_SIZE`, which we check at the start.
            let last_word = unsafe { (start.add(len - USIZE_SIZE) as *const usize).read_unaligned() };

            !contains_nonascii(last_word)
        }
    )
}

/// ASCII test optimized to use the `pmovmskb` instruction available on `x86-64`
/// platforms.
///
/// Other platforms are not likely to benefit from this code structure, so they
/// use SWAR techniques to test for ASCII in `usize`-sized chunks.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
#[inline]
const fn is_ascii(bytes: &[u8]) -> bool {
    // Process chunks of 32 bytes at a time in the fast path to enable
    // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
    // can be OR'd together and then the resulting vector can be tested for
    // non-ASCII bytes.
    const CHUNK_SIZE: usize = 32;

    let mut i = 0;

    while i + CHUNK_SIZE <= bytes.len() {
        let chunk_end = i + CHUNK_SIZE;

        // Get LLVM to produce a `pmovmskb` instruction on x86-64 which
        // creates a mask from the most significant bit of each byte.
        // ASCII bytes are less than 128 (0x80), so their most significant
        // bit is unset.
        let mut count = 0;
        while i < chunk_end {
            count += bytes[i].is_ascii() as u8;
            i += 1;
        }

        // All bytes should be <= 127, so `count` should equal the chunk size.
        if count != CHUNK_SIZE as u8 {
            return false;
        }
    }

    // Process the remaining `bytes.len() % CHUNK_SIZE` bytes.
    let mut is_ascii = true;
    while i < bytes.len() {
        is_ascii &= bytes[i].is_ascii();
        i += 1;
    }

    is_ascii
}