alloc/
str.rs

1//! Utilities for the `str` primitive type.
2//!
3//! *[See also the `str` primitive type](str).*
4
5#![stable(feature = "rust1", since = "1.0.0")]
6// Many of the usings in this module are only used in the test configuration.
7// It's cleaner to just turn off the unused_imports warning than to fix them.
8#![allow(unused_imports)]
9
10use core::borrow::{Borrow, BorrowMut};
11use core::iter::FusedIterator;
12use core::mem::MaybeUninit;
13#[stable(feature = "encode_utf16", since = "1.8.0")]
14pub use core::str::EncodeUtf16;
15#[stable(feature = "split_ascii_whitespace", since = "1.34.0")]
16pub use core::str::SplitAsciiWhitespace;
17#[stable(feature = "split_inclusive", since = "1.51.0")]
18pub use core::str::SplitInclusive;
19#[stable(feature = "rust1", since = "1.0.0")]
20pub use core::str::SplitWhitespace;
21#[stable(feature = "rust1", since = "1.0.0")]
22pub use core::str::pattern;
23use core::str::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher, Utf8Pattern};
24#[stable(feature = "rust1", since = "1.0.0")]
25pub use core::str::{Bytes, CharIndices, Chars, from_utf8, from_utf8_mut};
26#[stable(feature = "str_escape", since = "1.34.0")]
27pub use core::str::{EscapeDebug, EscapeDefault, EscapeUnicode};
28#[stable(feature = "rust1", since = "1.0.0")]
29pub use core::str::{FromStr, Utf8Error};
30#[allow(deprecated)]
31#[stable(feature = "rust1", since = "1.0.0")]
32pub use core::str::{Lines, LinesAny};
33#[stable(feature = "rust1", since = "1.0.0")]
34pub use core::str::{MatchIndices, RMatchIndices};
35#[stable(feature = "rust1", since = "1.0.0")]
36pub use core::str::{Matches, RMatches};
37#[stable(feature = "rust1", since = "1.0.0")]
38pub use core::str::{ParseBoolError, from_utf8_unchecked, from_utf8_unchecked_mut};
39#[stable(feature = "rust1", since = "1.0.0")]
40pub use core::str::{RSplit, Split};
41#[stable(feature = "rust1", since = "1.0.0")]
42pub use core::str::{RSplitN, SplitN};
43#[stable(feature = "rust1", since = "1.0.0")]
44pub use core::str::{RSplitTerminator, SplitTerminator};
45#[stable(feature = "utf8_chunks", since = "1.79.0")]
46pub use core::str::{Utf8Chunk, Utf8Chunks};
47#[unstable(feature = "str_from_raw_parts", issue = "119206")]
48pub use core::str::{from_raw_parts, from_raw_parts_mut};
49use core::unicode::conversions;
50use core::{mem, ptr};
51
52use crate::borrow::ToOwned;
53use crate::boxed::Box;
54use crate::slice::{Concat, Join, SliceIndex};
55use crate::string::String;
56use crate::vec::Vec;
57
58/// Note: `str` in `Concat<str>` is not meaningful here.
59/// This type parameter of the trait only exists to enable another impl.
60#[cfg(not(no_global_oom_handling))]
61#[unstable(feature = "slice_concat_ext", issue = "27747")]
62impl<S: Borrow<str>> Concat<str> for [S] {
63    type Output = String;
64
65    fn concat(slice: &Self) -> String {
66        Join::join(slice, "")
67    }
68}
69
70#[cfg(not(no_global_oom_handling))]
71#[unstable(feature = "slice_concat_ext", issue = "27747")]
72impl<S: Borrow<str>> Join<&str> for [S] {
73    type Output = String;
74
75    fn join(slice: &Self, sep: &str) -> String {
76        unsafe { String::from_utf8_unchecked(join_generic_copy(slice, sep.as_bytes())) }
77    }
78}
79
// Copies `separator` followed by one item into `$target`, for every item of `$iter`,
// and evaluates to the part of `$target` that was not written to.
//
// `$target` is a mutable slice used as an output cursor, `$separator` is the slice
// written before each item, and `$iter` yields items on which `.borrow().as_ref()`
// produces the slice to copy. The trailing `$num` list names the separator lengths
// that get their own copy of the loop.
#[cfg(not(no_global_oom_handling))]
macro_rules! specialize_for_lengths {
    ($separator:expr, $target:expr, $iter:expr; $($num:expr),*) => {{
        let mut target = $target;
        let iter = $iter;
        // Bind the separator once and use only the binding from here on, so that the
        // `$separator` expression is not evaluated a second time for its length.
        let sep_bytes = $separator;
        match sep_bytes.len() {
            $(
                // loops with hardcoded sizes run much faster
                // specialize the cases with small separator lengths
                $num => {
                    for s in iter {
                        copy_slice_and_advance!(target, sep_bytes);
                        let content_bytes = s.borrow().as_ref();
                        copy_slice_and_advance!(target, content_bytes);
                    }
                },
            )*
            _ => {
                // arbitrary non-zero size fallback
                for s in iter {
                    copy_slice_and_advance!(target, sep_bytes);
                    let content_bytes = s.borrow().as_ref();
                    copy_slice_and_advance!(target, content_bytes);
                }
            }
        }
        target
    }}
}
110
// Copies `$bytes` into the front of the mutable slice `$target`, then re-points
// `$target` at the remainder that follows the copied region.
//
// Panics (via `split_at_mut`) if `$bytes` is longer than `$target`.
#[cfg(not(no_global_oom_handling))]
macro_rules! copy_slice_and_advance {
    ($target:expr, $bytes:expr) => {
        // Bind the source once: expanding `$bytes` twice would evaluate an argument
        // expression with side effects twice, and could even observe two different slices.
        let src = $bytes;
        // `{ $target }` moves the mutable reference out so both halves keep the
        // original lifetime; the tail is then stored back as the new cursor.
        let (head, tail) = { $target }.split_at_mut(src.len());
        head.copy_from_slice(src);
        $target = tail;
    };
}
120
121// Optimized join implementation that works for both Vec<T> (T: Copy) and String's inner vec
122// Currently (2018-05-13) there is a bug with type inference and specialization (see issue #36262)
123// For this reason SliceConcat<T> is not specialized for T: Copy and SliceConcat<str> is the
124// only user of this function. It is left in place for the time when that is fixed.
125//
126// the bounds for String-join are S: Borrow<str> and for Vec-join Borrow<[T]>
127// [T] and str both impl AsRef<[T]> for some T
128// => s.borrow().as_ref() and we always have slices
129#[cfg(not(no_global_oom_handling))]
130fn join_generic_copy<B, T, S>(slice: &[S], sep: &[T]) -> Vec<T>
131where
132    T: Copy,
133    B: AsRef<[T]> + ?Sized,
134    S: Borrow<B>,
135{
136    let sep_len = sep.len();
137    let mut iter = slice.iter();
138
139    // the first slice is the only one without a separator preceding it
140    let first = match iter.next() {
141        Some(first) => first,
142        None => return vec![],
143    };
144
145    // compute the exact total length of the joined Vec
146    // if the `len` calculation overflows, we'll panic
147    // we would have run out of memory anyway and the rest of the function requires
148    // the entire Vec pre-allocated for safety
149    let reserved_len = sep_len
150        .checked_mul(iter.len())
151        .and_then(|n| {
152            slice.iter().map(|s| s.borrow().as_ref().len()).try_fold(n, usize::checked_add)
153        })
154        .expect("attempt to join into collection with len > usize::MAX");
155
156    // prepare an uninitialized buffer
157    let mut result = Vec::with_capacity(reserved_len);
158    debug_assert!(result.capacity() >= reserved_len);
159
160    result.extend_from_slice(first.borrow().as_ref());
161
162    unsafe {
163        let pos = result.len();
164        let target = result.spare_capacity_mut().get_unchecked_mut(..reserved_len - pos);
165
166        // Convert the separator and slices to slices of MaybeUninit
167        // to simplify implementation in specialize_for_lengths
168        let sep_uninit = core::slice::from_raw_parts(sep.as_ptr().cast(), sep.len());
169        let iter_uninit = iter.map(|it| {
170            let it = it.borrow().as_ref();
171            core::slice::from_raw_parts(it.as_ptr().cast(), it.len())
172        });
173
174        // copy separator and slices over without bounds checks
175        // generate loops with hardcoded offsets for small separators
176        // massive improvements possible (~ x2)
177        let remain = specialize_for_lengths!(sep_uninit, target, iter_uninit; 0, 1, 2, 3, 4);
178
179        // A weird borrow implementation may return different
180        // slices for the length calculation and the actual copy.
181        // Make sure we don't expose uninitialized bytes to the caller.
182        let result_len = reserved_len - remain.len();
183        result.set_len(result_len);
184    }
185    result
186}
187
188#[stable(feature = "rust1", since = "1.0.0")]
189impl Borrow<str> for String {
190    #[inline]
191    fn borrow(&self) -> &str {
192        &self[..]
193    }
194}
195
196#[stable(feature = "string_borrow_mut", since = "1.36.0")]
197impl BorrowMut<str> for String {
198    #[inline]
199    fn borrow_mut(&mut self) -> &mut str {
200        &mut self[..]
201    }
202}
203
204#[cfg(not(no_global_oom_handling))]
205#[stable(feature = "rust1", since = "1.0.0")]
206impl ToOwned for str {
207    type Owned = String;
208
209    #[inline]
210    fn to_owned(&self) -> String {
211        unsafe { String::from_utf8_unchecked(self.as_bytes().to_owned()) }
212    }
213
214    #[inline]
215    fn clone_into(&self, target: &mut String) {
216        target.clear();
217        target.push_str(self);
218    }
219}
220
221/// Methods for string slices.
222impl str {
223    /// Converts a `Box<str>` into a `Box<[u8]>` without copying or allocating.
224    ///
225    /// # Examples
226    ///
227    /// ```
228    /// let s = "this is a string";
229    /// let boxed_str = s.to_owned().into_boxed_str();
230    /// let boxed_bytes = boxed_str.into_boxed_bytes();
231    /// assert_eq!(*boxed_bytes, *s.as_bytes());
232    /// ```
233    #[rustc_allow_incoherent_impl]
234    #[stable(feature = "str_box_extras", since = "1.20.0")]
235    #[must_use = "`self` will be dropped if the result is not used"]
236    #[inline]
237    pub fn into_boxed_bytes(self: Box<str>) -> Box<[u8]> {
238        self.into()
239    }
240
241    /// Replaces all matches of a pattern with another string.
242    ///
243    /// `replace` creates a new [`String`], and copies the data from this string slice into it.
244    /// While doing so, it attempts to find matches of a pattern. If it finds any, it
245    /// replaces them with the replacement string slice.
246    ///
247    /// # Examples
248    ///
249    /// Basic usage:
250    ///
251    /// ```
252    /// let s = "this is old";
253    ///
254    /// assert_eq!("this is new", s.replace("old", "new"));
255    /// assert_eq!("than an old", s.replace("is", "an"));
256    /// ```
257    ///
258    /// When the pattern doesn't match, it returns this string slice as [`String`]:
259    ///
260    /// ```
261    /// let s = "this is old";
262    /// assert_eq!(s, s.replace("cookie monster", "little lamb"));
263    /// ```
264    #[cfg(not(no_global_oom_handling))]
265    #[rustc_allow_incoherent_impl]
266    #[must_use = "this returns the replaced string as a new allocation, \
267                  without modifying the original"]
268    #[stable(feature = "rust1", since = "1.0.0")]
269    #[inline]
270    pub fn replace<P: Pattern>(&self, from: P, to: &str) -> String {
271        // Fast path for replacing a single ASCII character with another.
272        if let Some(from_byte) = match from.as_utf8_pattern() {
273            Some(Utf8Pattern::StringPattern([from_byte])) => Some(*from_byte),
274            Some(Utf8Pattern::CharPattern(c)) => c.as_ascii().map(|ascii_char| ascii_char.to_u8()),
275            _ => None,
276        } {
277            if let [to_byte] = to.as_bytes() {
278                return unsafe { replace_ascii(self.as_bytes(), from_byte, *to_byte) };
279            }
280        }
281        // Set result capacity to self.len() when from.len() <= to.len()
282        let default_capacity = match from.as_utf8_pattern() {
283            Some(Utf8Pattern::StringPattern(s)) if s.len() <= to.len() => self.len(),
284            Some(Utf8Pattern::CharPattern(c)) if c.len_utf8() <= to.len() => self.len(),
285            _ => 0,
286        };
287        let mut result = String::with_capacity(default_capacity);
288        let mut last_end = 0;
289        for (start, part) in self.match_indices(from) {
290            result.push_str(unsafe { self.get_unchecked(last_end..start) });
291            result.push_str(to);
292            last_end = start + part.len();
293        }
294        result.push_str(unsafe { self.get_unchecked(last_end..self.len()) });
295        result
296    }
297
298    /// Replaces first N matches of a pattern with another string.
299    ///
300    /// `replacen` creates a new [`String`], and copies the data from this string slice into it.
301    /// While doing so, it attempts to find matches of a pattern. If it finds any, it
302    /// replaces them with the replacement string slice at most `count` times.
303    ///
304    /// # Examples
305    ///
306    /// Basic usage:
307    ///
308    /// ```
309    /// let s = "foo foo 123 foo";
310    /// assert_eq!("new new 123 foo", s.replacen("foo", "new", 2));
311    /// assert_eq!("faa fao 123 foo", s.replacen('o', "a", 3));
312    /// assert_eq!("foo foo new23 foo", s.replacen(char::is_numeric, "new", 1));
313    /// ```
314    ///
315    /// When the pattern doesn't match, it returns this string slice as [`String`]:
316    ///
317    /// ```
318    /// let s = "this is old";
319    /// assert_eq!(s, s.replacen("cookie monster", "little lamb", 10));
320    /// ```
321    #[cfg(not(no_global_oom_handling))]
322    #[rustc_allow_incoherent_impl]
323    #[must_use = "this returns the replaced string as a new allocation, \
324                  without modifying the original"]
325    #[stable(feature = "str_replacen", since = "1.16.0")]
326    pub fn replacen<P: Pattern>(&self, pat: P, to: &str, count: usize) -> String {
327        // Hope to reduce the times of re-allocation
328        let mut result = String::with_capacity(32);
329        let mut last_end = 0;
330        for (start, part) in self.match_indices(pat).take(count) {
331            result.push_str(unsafe { self.get_unchecked(last_end..start) });
332            result.push_str(to);
333            last_end = start + part.len();
334        }
335        result.push_str(unsafe { self.get_unchecked(last_end..self.len()) });
336        result
337    }
338
339    /// Returns the lowercase equivalent of this string slice, as a new [`String`].
340    ///
341    /// 'Lowercase' is defined according to the terms of the Unicode Derived Core Property
342    /// `Lowercase`.
343    ///
344    /// Since some characters can expand into multiple characters when changing
345    /// the case, this function returns a [`String`] instead of modifying the
346    /// parameter in-place.
347    ///
348    /// # Examples
349    ///
350    /// Basic usage:
351    ///
352    /// ```
353    /// let s = "HELLO";
354    ///
355    /// assert_eq!("hello", s.to_lowercase());
356    /// ```
357    ///
358    /// A tricky example, with sigma:
359    ///
360    /// ```
361    /// let sigma = "Σ";
362    ///
363    /// assert_eq!("σ", sigma.to_lowercase());
364    ///
365    /// // but at the end of a word, it's ς, not σ:
366    /// let odysseus = "ὈΔΥΣΣΕΎΣ";
367    ///
368    /// assert_eq!("ὀδυσσεύς", odysseus.to_lowercase());
369    /// ```
370    ///
371    /// Languages without case are not changed:
372    ///
373    /// ```
374    /// let new_year = "农历新年";
375    ///
376    /// assert_eq!(new_year, new_year.to_lowercase());
377    /// ```
378    #[cfg(not(no_global_oom_handling))]
379    #[rustc_allow_incoherent_impl]
380    #[must_use = "this returns the lowercase string as a new String, \
381                  without modifying the original"]
382    #[stable(feature = "unicode_case_mapping", since = "1.2.0")]
383    pub fn to_lowercase(&self) -> String {
384        let (mut s, rest) = convert_while_ascii(self, u8::to_ascii_lowercase);
385
386        let prefix_len = s.len();
387
388        for (i, c) in rest.char_indices() {
389            if c == 'Σ' {
390                // Σ maps to σ, except at the end of a word where it maps to ς.
391                // This is the only conditional (contextual) but language-independent mapping
392                // in `SpecialCasing.txt`,
393                // so hard-code it rather than have a generic "condition" mechanism.
394                // See https://github.com/rust-lang/rust/issues/26035
395                let sigma_lowercase = map_uppercase_sigma(self, prefix_len + i);
396                s.push(sigma_lowercase);
397            } else {
398                match conversions::to_lower(c) {
399                    [a, '\0', _] => s.push(a),
400                    [a, b, '\0'] => {
401                        s.push(a);
402                        s.push(b);
403                    }
404                    [a, b, c] => {
405                        s.push(a);
406                        s.push(b);
407                        s.push(c);
408                    }
409                }
410            }
411        }
412        return s;
413
414        fn map_uppercase_sigma(from: &str, i: usize) -> char {
415            // See https://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
416            // for the definition of `Final_Sigma`.
417            debug_assert!('Σ'.len_utf8() == 2);
418            let is_word_final = case_ignorable_then_cased(from[..i].chars().rev())
419                && !case_ignorable_then_cased(from[i + 2..].chars());
420            if is_word_final { 'ς' } else { 'σ' }
421        }
422
423        fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
424            use core::unicode::{Case_Ignorable, Cased};
425            match iter.skip_while(|&c| Case_Ignorable(c)).next() {
426                Some(c) => Cased(c),
427                None => false,
428            }
429        }
430    }
431
432    /// Returns the uppercase equivalent of this string slice, as a new [`String`].
433    ///
434    /// 'Uppercase' is defined according to the terms of the Unicode Derived Core Property
435    /// `Uppercase`.
436    ///
437    /// Since some characters can expand into multiple characters when changing
438    /// the case, this function returns a [`String`] instead of modifying the
439    /// parameter in-place.
440    ///
441    /// # Examples
442    ///
443    /// Basic usage:
444    ///
445    /// ```
446    /// let s = "hello";
447    ///
448    /// assert_eq!("HELLO", s.to_uppercase());
449    /// ```
450    ///
451    /// Scripts without case are not changed:
452    ///
453    /// ```
454    /// let new_year = "农历新年";
455    ///
456    /// assert_eq!(new_year, new_year.to_uppercase());
457    /// ```
458    ///
459    /// One character can become multiple:
460    /// ```
461    /// let s = "tschüß";
462    ///
463    /// assert_eq!("TSCHÜSS", s.to_uppercase());
464    /// ```
465    #[cfg(not(no_global_oom_handling))]
466    #[rustc_allow_incoherent_impl]
467    #[must_use = "this returns the uppercase string as a new String, \
468                  without modifying the original"]
469    #[stable(feature = "unicode_case_mapping", since = "1.2.0")]
470    pub fn to_uppercase(&self) -> String {
471        let (mut s, rest) = convert_while_ascii(self, u8::to_ascii_uppercase);
472
473        for c in rest.chars() {
474            match conversions::to_upper(c) {
475                [a, '\0', _] => s.push(a),
476                [a, b, '\0'] => {
477                    s.push(a);
478                    s.push(b);
479                }
480                [a, b, c] => {
481                    s.push(a);
482                    s.push(b);
483                    s.push(c);
484                }
485            }
486        }
487        s
488    }
489
490    /// Converts a [`Box<str>`] into a [`String`] without copying or allocating.
491    ///
492    /// # Examples
493    ///
494    /// ```
495    /// let string = String::from("birthday gift");
496    /// let boxed_str = string.clone().into_boxed_str();
497    ///
498    /// assert_eq!(boxed_str.into_string(), string);
499    /// ```
500    #[stable(feature = "box_str", since = "1.4.0")]
501    #[rustc_allow_incoherent_impl]
502    #[must_use = "`self` will be dropped if the result is not used"]
503    #[inline]
504    pub fn into_string(self: Box<str>) -> String {
505        let slice = Box::<[u8]>::from(self);
506        unsafe { String::from_utf8_unchecked(slice.into_vec()) }
507    }
508
509    /// Creates a new [`String`] by repeating a string `n` times.
510    ///
511    /// # Panics
512    ///
513    /// This function will panic if the capacity would overflow.
514    ///
515    /// # Examples
516    ///
517    /// Basic usage:
518    ///
519    /// ```
520    /// assert_eq!("abc".repeat(4), String::from("abcabcabcabc"));
521    /// ```
522    ///
523    /// A panic upon overflow:
524    ///
525    /// ```should_panic
526    /// // this will panic at runtime
527    /// let huge = "0123456789abcdef".repeat(usize::MAX);
528    /// ```
529    #[cfg(not(no_global_oom_handling))]
530    #[rustc_allow_incoherent_impl]
531    #[must_use]
532    #[stable(feature = "repeat_str", since = "1.16.0")]
533    #[inline]
534    pub fn repeat(&self, n: usize) -> String {
535        unsafe { String::from_utf8_unchecked(self.as_bytes().repeat(n)) }
536    }
537
538    /// Returns a copy of this string where each character is mapped to its
539    /// ASCII upper case equivalent.
540    ///
541    /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
542    /// but non-ASCII letters are unchanged.
543    ///
544    /// To uppercase the value in-place, use [`make_ascii_uppercase`].
545    ///
546    /// To uppercase ASCII characters in addition to non-ASCII characters, use
547    /// [`to_uppercase`].
548    ///
549    /// # Examples
550    ///
551    /// ```
552    /// let s = "Grüße, Jürgen ❤";
553    ///
554    /// assert_eq!("GRüßE, JüRGEN ❤", s.to_ascii_uppercase());
555    /// ```
556    ///
557    /// [`make_ascii_uppercase`]: str::make_ascii_uppercase
558    /// [`to_uppercase`]: #method.to_uppercase
559    #[cfg(not(no_global_oom_handling))]
560    #[rustc_allow_incoherent_impl]
561    #[must_use = "to uppercase the value in-place, use `make_ascii_uppercase()`"]
562    #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
563    #[inline]
564    pub fn to_ascii_uppercase(&self) -> String {
565        let mut s = self.to_owned();
566        s.make_ascii_uppercase();
567        s
568    }
569
570    /// Returns a copy of this string where each character is mapped to its
571    /// ASCII lower case equivalent.
572    ///
573    /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
574    /// but non-ASCII letters are unchanged.
575    ///
576    /// To lowercase the value in-place, use [`make_ascii_lowercase`].
577    ///
578    /// To lowercase ASCII characters in addition to non-ASCII characters, use
579    /// [`to_lowercase`].
580    ///
581    /// # Examples
582    ///
583    /// ```
584    /// let s = "Grüße, Jürgen ❤";
585    ///
586    /// assert_eq!("grüße, jürgen ❤", s.to_ascii_lowercase());
587    /// ```
588    ///
589    /// [`make_ascii_lowercase`]: str::make_ascii_lowercase
590    /// [`to_lowercase`]: #method.to_lowercase
591    #[cfg(not(no_global_oom_handling))]
592    #[rustc_allow_incoherent_impl]
593    #[must_use = "to lowercase the value in-place, use `make_ascii_lowercase()`"]
594    #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
595    #[inline]
596    pub fn to_ascii_lowercase(&self) -> String {
597        let mut s = self.to_owned();
598        s.make_ascii_lowercase();
599        s
600    }
601}
602
603/// Converts a boxed slice of bytes to a boxed string slice without checking
604/// that the string contains valid UTF-8.
605///
606/// # Safety
607///
608/// * The provided bytes must contain a valid UTF-8 sequence.
609///
610/// # Examples
611///
612/// ```
613/// let smile_utf8 = Box::new([226, 152, 186]);
614/// let smile = unsafe { std::str::from_boxed_utf8_unchecked(smile_utf8) };
615///
616/// assert_eq!("☺", &*smile);
617/// ```
618#[stable(feature = "str_box_extras", since = "1.20.0")]
619#[must_use]
620#[inline]
621pub unsafe fn from_boxed_utf8_unchecked(v: Box<[u8]>) -> Box<str> {
622    unsafe { Box::from_raw(Box::into_raw(v) as *mut str) }
623}
624
625/// Converts leading ascii bytes in `s` by calling the `convert` function.
626///
627/// For better average performance, this happens in chunks of `2*size_of::<usize>()`.
628///
629/// Returns a tuple of the converted prefix and the remainder starting from
630/// the first non-ascii character.
631///
632/// This function is only public so that it can be verified in a codegen test,
633/// see `issue-123712-str-to-lower-autovectorization.rs`.
634#[unstable(feature = "str_internals", issue = "none")]
635#[doc(hidden)]
636#[inline]
637#[cfg(not(no_global_oom_handling))]
638pub fn convert_while_ascii(s: &str, convert: fn(&u8) -> u8) -> (String, &str) {
639    // Process the input in chunks of 16 bytes to enable auto-vectorization.
640    // Previously the chunk size depended on the size of `usize`,
641    // but on 32-bit platforms with sse or neon is also the better choice.
642    // The only downside on other platforms would be a bit more loop-unrolling.
643    const N: usize = 16;
644
645    let mut slice = s.as_bytes();
646    let mut out = Vec::with_capacity(slice.len());
647    let mut out_slice = out.spare_capacity_mut();
648
649    let mut ascii_prefix_len = 0_usize;
650    let mut is_ascii = [false; N];
651
652    while slice.len() >= N {
653        // SAFETY: checked in loop condition
654        let chunk = unsafe { slice.get_unchecked(..N) };
655        // SAFETY: out_slice has at least same length as input slice and gets sliced with the same offsets
656        let out_chunk = unsafe { out_slice.get_unchecked_mut(..N) };
657
658        for j in 0..N {
659            is_ascii[j] = chunk[j] <= 127;
660        }
661
662        // Auto-vectorization for this check is a bit fragile, sum and comparing against the chunk
663        // size gives the best result, specifically a pmovmsk instruction on x86.
664        // See https://github.com/llvm/llvm-project/issues/96395 for why llvm currently does not
665        // currently recognize other similar idioms.
666        if is_ascii.iter().map(|x| *x as u8).sum::<u8>() as usize != N {
667            break;
668        }
669
670        for j in 0..N {
671            out_chunk[j] = MaybeUninit::new(convert(&chunk[j]));
672        }
673
674        ascii_prefix_len += N;
675        slice = unsafe { slice.get_unchecked(N..) };
676        out_slice = unsafe { out_slice.get_unchecked_mut(N..) };
677    }
678
679    // handle the remainder as individual bytes
680    while slice.len() > 0 {
681        let byte = slice[0];
682        if byte > 127 {
683            break;
684        }
685        // SAFETY: out_slice has at least same length as input slice
686        unsafe {
687            *out_slice.get_unchecked_mut(0) = MaybeUninit::new(convert(&byte));
688        }
689        ascii_prefix_len += 1;
690        slice = unsafe { slice.get_unchecked(1..) };
691        out_slice = unsafe { out_slice.get_unchecked_mut(1..) };
692    }
693
694    unsafe {
695        // SAFETY: ascii_prefix_len bytes have been initialized above
696        out.set_len(ascii_prefix_len);
697
698        // SAFETY: We have written only valid ascii to the output vec
699        let ascii_string = String::from_utf8_unchecked(out);
700
701        // SAFETY: we know this is a valid char boundary
702        // since we only skipped over leading ascii bytes
703        let rest = core::str::from_utf8_unchecked(slice);
704
705        (ascii_string, rest)
706    }
707}
/// Faster implementation of string replacement for ASCII to ASCII cases.
/// Should produce fast vectorized code.
///
/// Returns a `String` built from `utf8_bytes` with every byte equal to `from`
/// replaced by `to`; all other bytes are copied unchanged.
///
/// # Safety
///
/// The result is not validated: `utf8_bytes` must be valid UTF-8 and both
/// `from` and `to` must be ASCII bytes.
#[inline]
#[cfg(not(no_global_oom_handling))]
#[allow(dead_code)]
unsafe fn replace_ascii(utf8_bytes: &[u8], from: u8, to: u8) -> String {
    let swapped = utf8_bytes
        .iter()
        .map(|&byte| if byte == from { to } else { byte })
        .collect::<Vec<u8>>();
    // SAFETY: We replaced ascii with ascii on valid utf8 strings.
    unsafe { String::from_utf8_unchecked(swapped) }
}
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy