core/str/
lossy.rs

1use super::from_utf8_unchecked;
2use super::validations::utf8_char_width;
3use crate::fmt;
4use crate::fmt::{Formatter, Write};
5use crate::iter::FusedIterator;
6
7impl [u8] {
8    /// Creates an iterator over the contiguous valid UTF-8 ranges of this
9    /// slice, and the non-UTF-8 fragments in between.
10    ///
11    /// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
12    ///
13    /// # Examples
14    ///
15    /// This function formats arbitrary but mostly-UTF-8 bytes into Rust source
16    /// code in the form of a C-string literal (`c"..."`).
17    ///
18    /// ```
19    /// use std::fmt::Write as _;
20    ///
21    /// pub fn cstr_literal(bytes: &[u8]) -> String {
22    ///     let mut repr = String::new();
23    ///     repr.push_str("c\"");
24    ///     for chunk in bytes.utf8_chunks() {
25    ///         for ch in chunk.valid().chars() {
26    ///             // Escapes \0, \t, \r, \n, \\, \', \", and uses \u{...} for non-printable characters.
27    ///             write!(repr, "{}", ch.escape_debug()).unwrap();
28    ///         }
29    ///         for byte in chunk.invalid() {
30    ///             write!(repr, "\\x{:02X}", byte).unwrap();
31    ///         }
32    ///     }
33    ///     repr.push('"');
34    ///     repr
35    /// }
36    ///
37    /// fn main() {
38    ///     let lit = cstr_literal(b"\xferris the \xf0\x9f\xa6\x80\x07");
39    ///     let expected = stringify!(c"\xFErris the 🦀\u{7}");
40    ///     assert_eq!(lit, expected);
41    /// }
42    /// ```
43    #[stable(feature = "utf8_chunks", since = "1.79.0")]
44    pub fn utf8_chunks(&self) -> Utf8Chunks<'_> {
45        Utf8Chunks { source: self }
46    }
47}
48
49/// An item returned by the [`Utf8Chunks`] iterator.
50///
51/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
52/// when decoding a UTF-8 string.
53///
54/// # Examples
55///
56/// ```
57/// // An invalid UTF-8 string
58/// let bytes = b"foo\xF1\x80bar";
59///
60/// // Decode the first `Utf8Chunk`
61/// let chunk = bytes.utf8_chunks().next().unwrap();
62///
63/// // The first three characters are valid UTF-8
64/// assert_eq!("foo", chunk.valid());
65///
66/// // The fourth character is broken
67/// assert_eq!(b"\xF1\x80", chunk.invalid());
68/// ```
69#[stable(feature = "utf8_chunks", since = "1.79.0")]
70#[derive(Clone, Debug, PartialEq, Eq)]
71pub struct Utf8Chunk<'a> {
72    valid: &'a str,
73    invalid: &'a [u8],
74}
75
76impl<'a> Utf8Chunk<'a> {
77    /// Returns the next validated UTF-8 substring.
78    ///
79    /// This substring can be empty at the start of the string or between
80    /// broken UTF-8 characters.
81    #[must_use]
82    #[stable(feature = "utf8_chunks", since = "1.79.0")]
83    pub fn valid(&self) -> &'a str {
84        self.valid
85    }
86
87    /// Returns the invalid sequence that caused a failure.
88    ///
89    /// The returned slice will have a maximum length of 3 and starts after the
90    /// substring given by [`valid`]. Decoding will resume after this sequence.
91    ///
92    /// If empty, this is the last chunk in the string. If non-empty, an
93    /// unexpected byte was encountered or the end of the input was reached
94    /// unexpectedly.
95    ///
96    /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT
97    /// CHARACTER`].
98    ///
99    /// [`valid`]: Self::valid
100    /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
101    #[must_use]
102    #[stable(feature = "utf8_chunks", since = "1.79.0")]
103    pub fn invalid(&self) -> &'a [u8] {
104        self.invalid
105    }
106}
107
108#[must_use]
109#[unstable(feature = "str_internals", issue = "none")]
110pub struct Debug<'a>(&'a [u8]);
111
112#[unstable(feature = "str_internals", issue = "none")]
113impl fmt::Debug for Debug<'_> {
114    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
115        f.write_char('"')?;
116
117        for chunk in self.0.utf8_chunks() {
118            // Valid part.
119            // Here we partially parse UTF-8 again which is suboptimal.
120            {
121                let valid = chunk.valid();
122                let mut from = 0;
123                for (i, c) in valid.char_indices() {
124                    let esc = c.escape_debug();
125                    // If char needs escaping, flush backlog so far and write, else skip
126                    if esc.len() != 1 {
127                        f.write_str(&valid[from..i])?;
128                        for c in esc {
129                            f.write_char(c)?;
130                        }
131                        from = i + c.len_utf8();
132                    }
133                }
134                f.write_str(&valid[from..])?;
135            }
136
137            // Broken parts of string as hex escape.
138            for &b in chunk.invalid() {
139                write!(f, "\\x{:02X}", b)?;
140            }
141        }
142
143        f.write_char('"')
144    }
145}
146
147/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
148/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
149///
150/// This struct is created by the [`utf8_chunks`] method on bytes slices.
151/// If you want a simple conversion from UTF-8 byte slices to string slices,
152/// [`from_utf8`] is easier to use.
153///
154/// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
155///
156/// [byteslice]: slice
157/// [`utf8_chunks`]: slice::utf8_chunks
158/// [`from_utf8`]: super::from_utf8
159///
160/// # Examples
161///
162/// This can be used to create functionality similar to
163/// [`String::from_utf8_lossy`] without allocating heap memory:
164///
165/// ```
166/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
167///     for chunk in input.utf8_chunks() {
168///         push(chunk.valid());
169///
170///         if !chunk.invalid().is_empty() {
171///             push("\u{FFFD}");
172///         }
173///     }
174/// }
175/// ```
176///
177/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
178#[must_use = "iterators are lazy and do nothing unless consumed"]
179#[stable(feature = "utf8_chunks", since = "1.79.0")]
180#[derive(Clone)]
181pub struct Utf8Chunks<'a> {
182    source: &'a [u8],
183}
184
185impl<'a> Utf8Chunks<'a> {
186    #[doc(hidden)]
187    #[unstable(feature = "str_internals", issue = "none")]
188    pub fn debug(&self) -> Debug<'_> {
189        Debug(self.source)
190    }
191}
192
193#[stable(feature = "utf8_chunks", since = "1.79.0")]
194impl<'a> Iterator for Utf8Chunks<'a> {
195    type Item = Utf8Chunk<'a>;
196
197    fn next(&mut self) -> Option<Utf8Chunk<'a>> {
198        if self.source.is_empty() {
199            return None;
200        }
201
202        const TAG_CONT_U8: u8 = 128;
203        fn safe_get(xs: &[u8], i: usize) -> u8 {
204            *xs.get(i).unwrap_or(&0)
205        }
206
207        let mut i = 0;
208        let mut valid_up_to = 0;
209        while i < self.source.len() {
210            // SAFETY: `i < self.source.len()` per previous line.
211            // For some reason the following are both significantly slower:
212            // while let Some(&byte) = self.source.get(i) {
213            // while let Some(byte) = self.source.get(i).copied() {
214            let byte = unsafe { *self.source.get_unchecked(i) };
215            i += 1;
216
217            if byte < 128 {
218                // This could be a `1 => ...` case in the match below, but for
219                // the common case of all-ASCII inputs, we bypass loading the
220                // sizeable UTF8_CHAR_WIDTH table into cache.
221            } else {
222                let w = utf8_char_width(byte);
223
224                match w {
225                    2 => {
226                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
227                            break;
228                        }
229                        i += 1;
230                    }
231                    3 => {
232                        match (byte, safe_get(self.source, i)) {
233                            (0xE0, 0xA0..=0xBF) => (),
234                            (0xE1..=0xEC, 0x80..=0xBF) => (),
235                            (0xED, 0x80..=0x9F) => (),
236                            (0xEE..=0xEF, 0x80..=0xBF) => (),
237                            _ => break,
238                        }
239                        i += 1;
240                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
241                            break;
242                        }
243                        i += 1;
244                    }
245                    4 => {
246                        match (byte, safe_get(self.source, i)) {
247                            (0xF0, 0x90..=0xBF) => (),
248                            (0xF1..=0xF3, 0x80..=0xBF) => (),
249                            (0xF4, 0x80..=0x8F) => (),
250                            _ => break,
251                        }
252                        i += 1;
253                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
254                            break;
255                        }
256                        i += 1;
257                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
258                            break;
259                        }
260                        i += 1;
261                    }
262                    _ => break,
263                }
264            }
265
266            valid_up_to = i;
267        }
268
269        // SAFETY: `i <= self.source.len()` because it is only ever incremented
270        // via `i += 1` and in between every single one of those increments, `i`
271        // is compared against `self.source.len()`. That happens either
272        // literally by `i < self.source.len()` in the while-loop's condition,
273        // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
274        // loop is terminated as soon as the latest `i += 1` has made `i` no
275        // longer less than `self.source.len()`, which means it'll be at most
276        // equal to `self.source.len()`.
277        let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
278        self.source = remaining;
279
280        // SAFETY: `valid_up_to <= i` because it is only ever assigned via
281        // `valid_up_to = i` and `i` only increases.
282        let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) };
283
284        Some(Utf8Chunk {
285            // SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
286            valid: unsafe { from_utf8_unchecked(valid) },
287            invalid,
288        })
289    }
290}
291
292#[stable(feature = "utf8_chunks", since = "1.79.0")]
293impl FusedIterator for Utf8Chunks<'_> {}
294
295#[stable(feature = "utf8_chunks", since = "1.79.0")]
296impl fmt::Debug for Utf8Chunks<'_> {
297    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
298        f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
299    }
300}
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy