std/sys_common/
wtf8.rs

1//! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
2//!
3//! This library uses Rust’s type system to maintain
4//! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed),
5//! like the `String` and `&str` types do for UTF-8.
6//!
7//! Since [WTF-8 must not be used
8//! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience),
9//! this library deliberately does not provide access to the underlying bytes
10//! of WTF-8 strings,
11//! nor can it decode WTF-8 from arbitrary bytes.
12//! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points.
13
14// this module is imported from @SimonSapin's repo and has tons of dead code on
15// unix (it's mostly used on windows), so don't worry about dead code here.
16#![allow(dead_code)]
17
18#[cfg(test)]
19mod tests;
20
21use core::char::{MAX_LEN_UTF8, MAX_LEN_UTF16, encode_utf8_raw, encode_utf16_raw};
22use core::clone::CloneToUninit;
23use core::str::next_code_point;
24
25use crate::borrow::Cow;
26use crate::collections::TryReserveError;
27use crate::hash::{Hash, Hasher};
28use crate::iter::FusedIterator;
29use crate::rc::Rc;
30use crate::sync::Arc;
31use crate::sys_common::AsInner;
32use crate::{fmt, mem, ops, slice, str};
33
/// UTF-8 text of U+FFFD, the replacement character substituted for surrogates
/// by the lossy conversions and `Display` impls in this module.
const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
35
/// A Unicode code point: any value from U+0000 to U+10FFFF.
///
/// This differs from the `char` type, which only holds Unicode scalar
/// values: a `CodePoint` may additionally be a surrogate (U+D800 to U+DFFF).
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct CodePoint {
    value: u32,
}

/// Formats the code point as `U+` followed by its value in uppercase
/// hexadecimal, zero-padded to at least four digits.
/// Example: `U+1F4A9`
impl fmt::Debug for CodePoint {
    #[inline]
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        formatter.write_fmt(format_args!("U+{:04X}", self.value))
    }
}

impl CodePoint {
    /// Wraps `value` in a `CodePoint` without any range check.
    ///
    /// # Safety
    ///
    /// The caller must ensure that `value` is less than or equal to 0x10FFFF.
    #[inline]
    pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
        Self { value }
    }

    /// Wraps `value` in a `CodePoint` when it lies in the code point range.
    ///
    /// Returns `None` if `value` is above 0x10FFFF.
    #[inline]
    pub fn from_u32(value: u32) -> Option<CodePoint> {
        if value <= 0x10FFFF { Some(Self { value }) } else { None }
    }

    /// Converts a `char` into a `CodePoint`.
    ///
    /// Every Unicode scalar value is a code point, so this cannot fail.
    #[inline]
    pub fn from_char(value: char) -> CodePoint {
        Self { value: u32::from(value) }
    }

    /// Returns the code point as a plain `u32`.
    #[inline]
    pub fn to_u32(&self) -> u32 {
        self.value
    }

    /// Returns the 16-bit value of the code point if it is a leading
    /// surrogate (U+D800 to U+DBFF), and `None` otherwise.
    #[inline]
    pub fn to_lead_surrogate(&self) -> Option<u16> {
        if (0xD800..=0xDBFF).contains(&self.value) {
            // The range check above guarantees the value fits in 16 bits.
            Some(self.value as u16)
        } else {
            None
        }
    }

    /// Returns the 16-bit value of the code point if it is a trailing
    /// surrogate (U+DC00 to U+DFFF), and `None` otherwise.
    #[inline]
    pub fn to_trail_surrogate(&self) -> Option<u16> {
        if (0xDC00..=0xDFFF).contains(&self.value) {
            // The range check above guarantees the value fits in 16 bits.
            Some(self.value as u16)
        } else {
            None
        }
    }

    /// Converts the code point to a `char` when it is a Unicode scalar value.
    ///
    /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
    #[inline]
    pub fn to_char(&self) -> Option<char> {
        if (0xD800..=0xDFFF).contains(&self.value) {
            return None;
        }
        // SAFETY: surrogates were excluded above, and the constructors only
        // admit values up to 0x10FFFF (the unchecked one by caller contract).
        Some(unsafe { char::from_u32_unchecked(self.value) })
    }

    /// Converts the code point to a `char`, substituting surrogates.
    ///
    /// Returns `'\u{FFFD}'` (the replacement character “�”)
    /// if the code point is a surrogate (from U+D800 to U+DFFF).
    #[inline]
    pub fn to_char_lossy(&self) -> char {
        match self.to_char() {
            Some(scalar) => scalar,
            None => '\u{FFFD}',
        }
    }
}
127
/// An owned, growable string of well-formed WTF-8 data.
///
/// Similar to `String`, but can additionally contain surrogate code points
/// if they’re not in a surrogate pair.
///
/// Equality and ordering are defined by the WTF-8 bytes alone. The
/// `is_known_utf8` field is only a cached hint and deliberately does not take
/// part in comparisons: two buffers holding the same bytes are the same
/// string no matter how each of them was constructed.
#[derive(Clone)]
pub struct Wtf8Buf {
    bytes: Vec<u8>,

    /// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily
    /// know this if we're constructed from a `String` or `&str`.
    ///
    /// It is possible for `bytes` to have valid UTF-8 without this being
    /// set, such as when we're concatenating `&Wtf8`'s and surrogates become
    /// paired, as we don't bother to rescan the entire string.
    is_known_utf8: bool,
}

/// Compares only the underlying bytes; the UTF-8 hint is ignored.
impl PartialEq for Wtf8Buf {
    #[inline]
    fn eq(&self, other: &Wtf8Buf) -> bool {
        self.bytes == other.bytes
    }
}

impl Eq for Wtf8Buf {}

/// Orders by the underlying bytes; the UTF-8 hint is ignored.
impl PartialOrd for Wtf8Buf {
    #[inline]
    fn partial_cmp(&self, other: &Wtf8Buf) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

/// Orders by the underlying bytes; the UTF-8 hint is ignored.
impl Ord for Wtf8Buf {
    #[inline]
    fn cmp(&self, other: &Wtf8Buf) -> core::cmp::Ordering {
        self.bytes.cmp(&other.bytes)
    }
}
144
145impl ops::Deref for Wtf8Buf {
146    type Target = Wtf8;
147
148    fn deref(&self) -> &Wtf8 {
149        self.as_slice()
150    }
151}
152
153impl ops::DerefMut for Wtf8Buf {
154    fn deref_mut(&mut self) -> &mut Wtf8 {
155        self.as_mut_slice()
156    }
157}
158
159/// Formats the string in double quotes, with characters escaped according to
160/// [`char::escape_debug`] and unpaired surrogates represented as `\u{xxxx}`,
161/// where each `x` is a hexadecimal digit.
162///
163/// For example, the code units [U+0061, U+D800, U+000A] are formatted as
164/// `"a\u{D800}\n"`.
165impl fmt::Debug for Wtf8Buf {
166    #[inline]
167    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
168        fmt::Debug::fmt(&**self, formatter)
169    }
170}
171
172/// Formats the string with unpaired surrogates substituted with the replacement
173/// character, U+FFFD.
174impl fmt::Display for Wtf8Buf {
175    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
176        if let Some(s) = self.as_known_utf8() {
177            fmt::Display::fmt(s, formatter)
178        } else {
179            fmt::Display::fmt(&**self, formatter)
180        }
181    }
182}
183
184impl Wtf8Buf {
185    /// Creates a new, empty WTF-8 string.
186    #[inline]
187    pub fn new() -> Wtf8Buf {
188        Wtf8Buf { bytes: Vec::new(), is_known_utf8: true }
189    }
190
191    /// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
192    #[inline]
193    pub fn with_capacity(capacity: usize) -> Wtf8Buf {
194        Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: true }
195    }
196
197    /// Creates a WTF-8 string from a WTF-8 byte vec.
198    ///
199    /// Since the byte vec is not checked for valid WTF-8, this function is
200    /// marked unsafe.
201    #[inline]
202    pub unsafe fn from_bytes_unchecked(value: Vec<u8>) -> Wtf8Buf {
203        Wtf8Buf { bytes: value, is_known_utf8: false }
204    }
205
206    /// Creates a WTF-8 string from a UTF-8 `String`.
207    ///
208    /// This takes ownership of the `String` and does not copy.
209    ///
210    /// Since WTF-8 is a superset of UTF-8, this always succeeds.
211    #[inline]
212    pub const fn from_string(string: String) -> Wtf8Buf {
213        Wtf8Buf { bytes: string.into_bytes(), is_known_utf8: true }
214    }
215
216    /// Creates a WTF-8 string from a UTF-8 `&str` slice.
217    ///
218    /// This copies the content of the slice.
219    ///
220    /// Since WTF-8 is a superset of UTF-8, this always succeeds.
221    #[inline]
222    pub fn from_str(s: &str) -> Wtf8Buf {
223        Wtf8Buf { bytes: s.as_bytes().to_vec(), is_known_utf8: true }
224    }
225
226    pub fn clear(&mut self) {
227        self.bytes.clear();
228        self.is_known_utf8 = true;
229    }
230
231    /// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
232    ///
233    /// This is lossless: calling `.encode_wide()` on the resulting string
234    /// will always return the original code units.
235    pub fn from_wide(v: &[u16]) -> Wtf8Buf {
236        let mut string = Wtf8Buf::with_capacity(v.len());
237        for item in char::decode_utf16(v.iter().cloned()) {
238            match item {
239                Ok(ch) => string.push_char(ch),
240                Err(surrogate) => {
241                    let surrogate = surrogate.unpaired_surrogate();
242                    // Surrogates are known to be in the code point range.
243                    let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
244                    // The string will now contain an unpaired surrogate.
245                    string.is_known_utf8 = false;
246                    // Skip the WTF-8 concatenation check,
247                    // surrogate pairs are already decoded by decode_utf16
248                    string.push_code_point_unchecked(code_point);
249                }
250            }
251        }
252        string
253    }
254
255    /// Appends the given `char` to the end of this string.
256    /// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check.
257    /// Copied from String::push.
258    fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
259        let mut bytes = [0; MAX_LEN_UTF8];
260        let bytes = encode_utf8_raw(code_point.value, &mut bytes);
261        self.bytes.extend_from_slice(bytes)
262    }
263
264    #[inline]
265    pub fn as_slice(&self) -> &Wtf8 {
266        unsafe { Wtf8::from_bytes_unchecked(&self.bytes) }
267    }
268
269    #[inline]
270    pub fn as_mut_slice(&mut self) -> &mut Wtf8 {
271        // Safety: `Wtf8` doesn't expose any way to mutate the bytes that would
272        // cause them to change from well-formed UTF-8 to ill-formed UTF-8,
273        // which would break the assumptions of the `is_known_utf8` field.
274        unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) }
275    }
276
277    /// Converts the string to UTF-8 without validation, if it was created from
278    /// valid UTF-8.
279    #[inline]
280    fn as_known_utf8(&self) -> Option<&str> {
281        if self.is_known_utf8 {
282            // SAFETY: The buffer is known to be valid UTF-8.
283            Some(unsafe { str::from_utf8_unchecked(self.as_bytes()) })
284        } else {
285            None
286        }
287    }
288
289    /// Reserves capacity for at least `additional` more bytes to be inserted
290    /// in the given `Wtf8Buf`.
291    /// The collection may reserve more space to avoid frequent reallocations.
292    ///
293    /// # Panics
294    ///
295    /// Panics if the new capacity exceeds `isize::MAX` bytes.
296    #[inline]
297    pub fn reserve(&mut self, additional: usize) {
298        self.bytes.reserve(additional)
299    }
300
301    /// Tries to reserve capacity for at least `additional` more bytes to be
302    /// inserted in the given `Wtf8Buf`. The `Wtf8Buf` may reserve more space to
303    /// avoid frequent reallocations. After calling `try_reserve`, capacity will
304    /// be greater than or equal to `self.len() + additional`. Does nothing if
305    /// capacity is already sufficient. This method preserves the contents even
306    /// if an error occurs.
307    ///
308    /// # Errors
309    ///
310    /// If the capacity overflows, or the allocator reports a failure, then an error
311    /// is returned.
312    #[inline]
313    pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> {
314        self.bytes.try_reserve(additional)
315    }
316
317    #[inline]
318    pub fn reserve_exact(&mut self, additional: usize) {
319        self.bytes.reserve_exact(additional)
320    }
321
322    /// Tries to reserve the minimum capacity for exactly `additional` more
323    /// bytes to be inserted in the given `Wtf8Buf`. After calling
324    /// `try_reserve_exact`, capacity will be greater than or equal to
325    /// `self.len() + additional` if it returns `Ok(())`.
326    /// Does nothing if the capacity is already sufficient.
327    ///
328    /// Note that the allocator may give the `Wtf8Buf` more space than it
329    /// requests. Therefore, capacity can not be relied upon to be precisely
330    /// minimal. Prefer [`try_reserve`] if future insertions are expected.
331    ///
332    /// [`try_reserve`]: Wtf8Buf::try_reserve
333    ///
334    /// # Errors
335    ///
336    /// If the capacity overflows, or the allocator reports a failure, then an error
337    /// is returned.
338    #[inline]
339    pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> {
340        self.bytes.try_reserve_exact(additional)
341    }
342
343    #[inline]
344    pub fn shrink_to_fit(&mut self) {
345        self.bytes.shrink_to_fit()
346    }
347
348    #[inline]
349    pub fn shrink_to(&mut self, min_capacity: usize) {
350        self.bytes.shrink_to(min_capacity)
351    }
352
353    #[inline]
354    pub fn leak<'a>(self) -> &'a mut Wtf8 {
355        unsafe { Wtf8::from_mut_bytes_unchecked(self.bytes.leak()) }
356    }
357
358    /// Returns the number of bytes that this string buffer can hold without reallocating.
359    #[inline]
360    pub fn capacity(&self) -> usize {
361        self.bytes.capacity()
362    }
363
364    /// Append a UTF-8 slice at the end of the string.
365    #[inline]
366    pub fn push_str(&mut self, other: &str) {
367        self.bytes.extend_from_slice(other.as_bytes())
368    }
369
370    /// Append a WTF-8 slice at the end of the string.
371    ///
372    /// This replaces newly paired surrogates at the boundary
373    /// with a supplementary code point,
374    /// like concatenating ill-formed UTF-16 strings effectively would.
375    #[inline]
376    pub fn push_wtf8(&mut self, other: &Wtf8) {
377        match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) {
378            // Replace newly paired surrogates by a supplementary code point.
379            (Some(lead), Some(trail)) => {
380                let len_without_lead_surrogate = self.len() - 3;
381                self.bytes.truncate(len_without_lead_surrogate);
382                let other_without_trail_surrogate = &other.bytes[3..];
383                // 4 bytes for the supplementary code point
384                self.bytes.reserve(4 + other_without_trail_surrogate.len());
385                self.push_char(decode_surrogate_pair(lead, trail));
386                self.bytes.extend_from_slice(other_without_trail_surrogate);
387            }
388            _ => {
389                // If we'll be pushing a string containing a surrogate, we may
390                // no longer have UTF-8.
391                if self.is_known_utf8 && other.next_surrogate(0).is_some() {
392                    self.is_known_utf8 = false;
393                }
394
395                self.bytes.extend_from_slice(&other.bytes);
396            }
397        }
398    }
399
400    /// Append a Unicode scalar value at the end of the string.
401    #[inline]
402    pub fn push_char(&mut self, c: char) {
403        self.push_code_point_unchecked(CodePoint::from_char(c))
404    }
405
406    /// Append a code point at the end of the string.
407    ///
408    /// This replaces newly paired surrogates at the boundary
409    /// with a supplementary code point,
410    /// like concatenating ill-formed UTF-16 strings effectively would.
411    #[inline]
412    pub fn push(&mut self, code_point: CodePoint) {
413        if let Some(trail) = code_point.to_trail_surrogate() {
414            if let Some(lead) = (&*self).final_lead_surrogate() {
415                let len_without_lead_surrogate = self.len() - 3;
416                self.bytes.truncate(len_without_lead_surrogate);
417                self.push_char(decode_surrogate_pair(lead, trail));
418                return;
419            }
420
421            // We're pushing a trailing surrogate.
422            self.is_known_utf8 = false;
423        } else if code_point.to_lead_surrogate().is_some() {
424            // We're pushing a leading surrogate.
425            self.is_known_utf8 = false;
426        }
427
428        // No newly paired surrogates at the boundary.
429        self.push_code_point_unchecked(code_point)
430    }
431
432    /// Shortens a string to the specified length.
433    ///
434    /// # Panics
435    ///
436    /// Panics if `new_len` > current length,
437    /// or if `new_len` is not a code point boundary.
438    #[inline]
439    pub fn truncate(&mut self, new_len: usize) {
440        assert!(is_code_point_boundary(self, new_len));
441        self.bytes.truncate(new_len)
442    }
443
444    /// Consumes the WTF-8 string and tries to convert it to a vec of bytes.
445    #[inline]
446    pub fn into_bytes(self) -> Vec<u8> {
447        self.bytes
448    }
449
450    /// Consumes the WTF-8 string and tries to convert it to UTF-8.
451    ///
452    /// This does not copy the data.
453    ///
454    /// If the contents are not well-formed UTF-8
455    /// (that is, if the string contains surrogates),
456    /// the original WTF-8 string is returned instead.
457    pub fn into_string(self) -> Result<String, Wtf8Buf> {
458        if self.is_known_utf8 || self.next_surrogate(0).is_none() {
459            Ok(unsafe { String::from_utf8_unchecked(self.bytes) })
460        } else {
461            Err(self)
462        }
463    }
464
465    /// Consumes the WTF-8 string and converts it lossily to UTF-8.
466    ///
467    /// This does not copy the data (but may overwrite parts of it in place).
468    ///
469    /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
470    pub fn into_string_lossy(mut self) -> String {
471        if !self.is_known_utf8 {
472            let mut pos = 0;
473            while let Some((surrogate_pos, _)) = self.next_surrogate(pos) {
474                pos = surrogate_pos + 3;
475                // Surrogates and the replacement character are all 3 bytes, so
476                // they can substituted in-place.
477                self.bytes[surrogate_pos..pos]
478                    .copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
479            }
480        }
481        unsafe { String::from_utf8_unchecked(self.bytes) }
482    }
483
484    /// Converts this `Wtf8Buf` into a boxed `Wtf8`.
485    #[inline]
486    pub fn into_box(self) -> Box<Wtf8> {
487        // SAFETY: relies on `Wtf8` being `repr(transparent)`.
488        unsafe { mem::transmute(self.bytes.into_boxed_slice()) }
489    }
490
491    /// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
492    pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf {
493        let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) };
494        Wtf8Buf { bytes: bytes.into_vec(), is_known_utf8: false }
495    }
496
497    /// Provides plumbing to core `Vec::extend_from_slice`.
498    /// More well behaving alternative to allowing outer types
499    /// full mutable access to the core `Vec`.
500    #[inline]
501    pub(crate) fn extend_from_slice(&mut self, other: &[u8]) {
502        self.bytes.extend_from_slice(other);
503        self.is_known_utf8 = false;
504    }
505}
506
507/// Creates a new WTF-8 string from an iterator of code points.
508///
509/// This replaces surrogate code point pairs with supplementary code points,
510/// like concatenating ill-formed UTF-16 strings effectively would.
511impl FromIterator<CodePoint> for Wtf8Buf {
512    fn from_iter<T: IntoIterator<Item = CodePoint>>(iter: T) -> Wtf8Buf {
513        let mut string = Wtf8Buf::new();
514        string.extend(iter);
515        string
516    }
517}
518
519/// Append code points from an iterator to the string.
520///
521/// This replaces surrogate code point pairs with supplementary code points,
522/// like concatenating ill-formed UTF-16 strings effectively would.
523impl Extend<CodePoint> for Wtf8Buf {
524    fn extend<T: IntoIterator<Item = CodePoint>>(&mut self, iter: T) {
525        let iterator = iter.into_iter();
526        let (low, _high) = iterator.size_hint();
527        // Lower bound of one byte per code point (ASCII only)
528        self.bytes.reserve(low);
529        iterator.for_each(move |code_point| self.push(code_point));
530    }
531
532    #[inline]
533    fn extend_one(&mut self, code_point: CodePoint) {
534        self.push(code_point);
535    }
536
537    #[inline]
538    fn extend_reserve(&mut self, additional: usize) {
539        // Lower bound of one byte per code point (ASCII only)
540        self.bytes.reserve(additional);
541    }
542}
543
/// A borrowed slice of well-formed WTF-8 data.
///
/// This plays the role `&str` plays for UTF-8, except that the data may also
/// contain surrogate code points as long as they do not form a surrogate pair.
///
/// The type is a transparent wrapper around `[u8]`; comparisons are the
/// derived ones and therefore compare the bytes.
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct Wtf8 {
    bytes: [u8],
}
553
554impl AsInner<[u8]> for Wtf8 {
555    #[inline]
556    fn as_inner(&self) -> &[u8] {
557        &self.bytes
558    }
559}
560
561/// Formats the string in double quotes, with characters escaped according to
562/// [`char::escape_debug`] and unpaired surrogates represented as `\u{xxxx}`,
563/// where each `x` is a hexadecimal digit.
564impl fmt::Debug for Wtf8 {
565    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
566        fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result {
567            use crate::fmt::Write;
568            for c in s.chars().flat_map(|c| c.escape_debug()) {
569                f.write_char(c)?
570            }
571            Ok(())
572        }
573
574        formatter.write_str("\"")?;
575        let mut pos = 0;
576        while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) {
577            write_str_escaped(formatter, unsafe {
578                str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos])
579            })?;
580            write!(formatter, "\\u{{{:x}}}", surrogate)?;
581            pos = surrogate_pos + 3;
582        }
583        write_str_escaped(formatter, unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?;
584        formatter.write_str("\"")
585    }
586}
587
588/// Formats the string with unpaired surrogates substituted with the replacement
589/// character, U+FFFD.
590impl fmt::Display for Wtf8 {
591    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
592        let wtf8_bytes = &self.bytes;
593        let mut pos = 0;
594        loop {
595            match self.next_surrogate(pos) {
596                Some((surrogate_pos, _)) => {
597                    formatter.write_str(unsafe {
598                        str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos])
599                    })?;
600                    formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?;
601                    pos = surrogate_pos + 3;
602                }
603                None => {
604                    let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) };
605                    if pos == 0 { return s.fmt(formatter) } else { return formatter.write_str(s) }
606                }
607            }
608        }
609    }
610}
611
612impl Wtf8 {
613    /// Creates a WTF-8 slice from a UTF-8 `&str` slice.
614    ///
615    /// Since WTF-8 is a superset of UTF-8, this always succeeds.
616    #[inline]
617    pub fn from_str(value: &str) -> &Wtf8 {
618        unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) }
619    }
620
621    /// Creates a WTF-8 slice from a WTF-8 byte slice.
622    ///
623    /// Since the byte slice is not checked for valid WTF-8, this functions is
624    /// marked unsafe.
625    #[inline]
626    pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
627        // SAFETY: start with &[u8], end with fancy &[u8]
628        unsafe { &*(value as *const [u8] as *const Wtf8) }
629    }
630
631    /// Creates a mutable WTF-8 slice from a mutable WTF-8 byte slice.
632    ///
633    /// Since the byte slice is not checked for valid WTF-8, this functions is
634    /// marked unsafe.
635    #[inline]
636    unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 {
637        // SAFETY: start with &mut [u8], end with fancy &mut [u8]
638        unsafe { &mut *(value as *mut [u8] as *mut Wtf8) }
639    }
640
641    /// Returns the length, in WTF-8 bytes.
642    #[inline]
643    pub fn len(&self) -> usize {
644        self.bytes.len()
645    }
646
647    #[inline]
648    pub fn is_empty(&self) -> bool {
649        self.bytes.is_empty()
650    }
651
652    /// Returns the code point at `position` if it is in the ASCII range,
653    /// or `b'\xFF'` otherwise.
654    ///
655    /// # Panics
656    ///
657    /// Panics if `position` is beyond the end of the string.
658    #[inline]
659    pub fn ascii_byte_at(&self, position: usize) -> u8 {
660        match self.bytes[position] {
661            ascii_byte @ 0x00..=0x7F => ascii_byte,
662            _ => 0xFF,
663        }
664    }
665
666    /// Returns an iterator for the string’s code points.
667    #[inline]
668    pub fn code_points(&self) -> Wtf8CodePoints<'_> {
669        Wtf8CodePoints { bytes: self.bytes.iter() }
670    }
671
672    /// Access raw bytes of WTF-8 data
673    #[inline]
674    pub fn as_bytes(&self) -> &[u8] {
675        &self.bytes
676    }
677
678    /// Tries to convert the string to UTF-8 and return a `&str` slice.
679    ///
680    /// Returns `None` if the string contains surrogates.
681    ///
682    /// This does not copy the data.
683    #[inline]
684    pub fn as_str(&self) -> Result<&str, str::Utf8Error> {
685        str::from_utf8(&self.bytes)
686    }
687
688    /// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
689    pub fn to_owned(&self) -> Wtf8Buf {
690        Wtf8Buf { bytes: self.bytes.to_vec(), is_known_utf8: false }
691    }
692
693    /// Lossily converts the string to UTF-8.
694    /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
695    ///
696    /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
697    ///
698    /// This only copies the data if necessary (if it contains any surrogate).
699    pub fn to_string_lossy(&self) -> Cow<'_, str> {
700        let Some((surrogate_pos, _)) = self.next_surrogate(0) else {
701            return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) });
702        };
703        let wtf8_bytes = &self.bytes;
704        let mut utf8_bytes = Vec::with_capacity(self.len());
705        utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
706        utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
707        let mut pos = surrogate_pos + 3;
708        loop {
709            match self.next_surrogate(pos) {
710                Some((surrogate_pos, _)) => {
711                    utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]);
712                    utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
713                    pos = surrogate_pos + 3;
714                }
715                None => {
716                    utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
717                    return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) });
718                }
719            }
720        }
721    }
722
723    /// Converts the WTF-8 string to potentially ill-formed UTF-16
724    /// and return an iterator of 16-bit code units.
725    ///
726    /// This is lossless:
727    /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units
728    /// would always return the original WTF-8 string.
729    #[inline]
730    pub fn encode_wide(&self) -> EncodeWide<'_> {
731        EncodeWide { code_points: self.code_points(), extra: 0 }
732    }
733
734    #[inline]
735    fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
736        let mut iter = self.bytes[pos..].iter();
737        loop {
738            let b = *iter.next()?;
739            if b < 0x80 {
740                pos += 1;
741            } else if b < 0xE0 {
742                iter.next();
743                pos += 2;
744            } else if b == 0xED {
745                match (iter.next(), iter.next()) {
746                    (Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
747                        return Some((pos, decode_surrogate(b2, b3)));
748                    }
749                    _ => pos += 3,
750                }
751            } else if b < 0xF0 {
752                iter.next();
753                iter.next();
754                pos += 3;
755            } else {
756                iter.next();
757                iter.next();
758                iter.next();
759                pos += 4;
760            }
761        }
762    }
763
764    #[inline]
765    fn final_lead_surrogate(&self) -> Option<u16> {
766        match self.bytes {
767            [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)),
768            _ => None,
769        }
770    }
771
772    #[inline]
773    fn initial_trail_surrogate(&self) -> Option<u16> {
774        match self.bytes {
775            [0xED, b2 @ 0xB0..=0xBF, b3, ..] => Some(decode_surrogate(b2, b3)),
776            _ => None,
777        }
778    }
779
780    pub fn clone_into(&self, buf: &mut Wtf8Buf) {
781        buf.is_known_utf8 = false;
782        self.bytes.clone_into(&mut buf.bytes);
783    }
784
785    /// Boxes this `Wtf8`.
786    #[inline]
787    pub fn into_box(&self) -> Box<Wtf8> {
788        let boxed: Box<[u8]> = self.bytes.into();
789        unsafe { mem::transmute(boxed) }
790    }
791
792    /// Creates a boxed, empty `Wtf8`.
793    pub fn empty_box() -> Box<Wtf8> {
794        let boxed: Box<[u8]> = Default::default();
795        unsafe { mem::transmute(boxed) }
796    }
797
798    #[inline]
799    pub fn into_arc(&self) -> Arc<Wtf8> {
800        let arc: Arc<[u8]> = Arc::from(&self.bytes);
801        unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Wtf8) }
802    }
803
804    #[inline]
805    pub fn into_rc(&self) -> Rc<Wtf8> {
806        let rc: Rc<[u8]> = Rc::from(&self.bytes);
807        unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Wtf8) }
808    }
809
810    #[inline]
811    pub fn make_ascii_lowercase(&mut self) {
812        self.bytes.make_ascii_lowercase()
813    }
814
815    #[inline]
816    pub fn make_ascii_uppercase(&mut self) {
817        self.bytes.make_ascii_uppercase()
818    }
819
820    #[inline]
821    pub fn to_ascii_lowercase(&self) -> Wtf8Buf {
822        Wtf8Buf { bytes: self.bytes.to_ascii_lowercase(), is_known_utf8: false }
823    }
824
825    #[inline]
826    pub fn to_ascii_uppercase(&self) -> Wtf8Buf {
827        Wtf8Buf { bytes: self.bytes.to_ascii_uppercase(), is_known_utf8: false }
828    }
829
830    #[inline]
831    pub fn is_ascii(&self) -> bool {
832        self.bytes.is_ascii()
833    }
834
835    #[inline]
836    pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
837        self.bytes.eq_ignore_ascii_case(&other.bytes)
838    }
839}
840
841/// Returns a slice of the given string for the byte range \[`begin`..`end`).
842///
843/// # Panics
844///
845/// Panics when `begin` and `end` do not point to code point boundaries,
846/// or point beyond the end of the string.
847impl ops::Index<ops::Range<usize>> for Wtf8 {
848    type Output = Wtf8;
849
850    #[inline]
851    fn index(&self, range: ops::Range<usize>) -> &Wtf8 {
852        // is_code_point_boundary checks that the index is in [0, .len()]
853        if range.start <= range.end
854            && is_code_point_boundary(self, range.start)
855            && is_code_point_boundary(self, range.end)
856        {
857            unsafe { slice_unchecked(self, range.start, range.end) }
858        } else {
859            slice_error_fail(self, range.start, range.end)
860        }
861    }
862}
863
864/// Returns a slice of the given string from byte `begin` to its end.
865///
866/// # Panics
867///
868/// Panics when `begin` is not at a code point boundary,
869/// or is beyond the end of the string.
870impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
871    type Output = Wtf8;
872
873    #[inline]
874    fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 {
875        // is_code_point_boundary checks that the index is in [0, .len()]
876        if is_code_point_boundary(self, range.start) {
877            unsafe { slice_unchecked(self, range.start, self.len()) }
878        } else {
879            slice_error_fail(self, range.start, self.len())
880        }
881    }
882}
883
884/// Returns a slice of the given string from its beginning to byte `end`.
885///
886/// # Panics
887///
888/// Panics when `end` is not at a code point boundary,
889/// or is beyond the end of the string.
890impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
891    type Output = Wtf8;
892
893    #[inline]
894    fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 {
895        // is_code_point_boundary checks that the index is in [0, .len()]
896        if is_code_point_boundary(self, range.end) {
897            unsafe { slice_unchecked(self, 0, range.end) }
898        } else {
899            slice_error_fail(self, 0, range.end)
900        }
901    }
902}
903
904impl ops::Index<ops::RangeFull> for Wtf8 {
905    type Output = Wtf8;
906
907    #[inline]
908    fn index(&self, _range: ops::RangeFull) -> &Wtf8 {
909        self
910    }
911}
912
/// Rebuilds a 16-bit surrogate from the second and third bytes of its
/// three-byte encoding.
///
/// The first byte is assumed to be 0xED and is not passed in. Only the low
/// six bits of each argument contribute to the result.
#[inline]
fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
    let upper_six = (u16::from(second_byte) & 0x3F) << 6;
    let lower_six = u16::from(third_byte) & 0x3F;
    0xD800 | upper_six | lower_six
}
918
/// Combines a lead and a trail surrogate into the supplementary-plane
/// character they encode.
///
/// No validation is performed on either argument.
#[inline]
fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
    let high_ten = u32::from(lead - 0xD800) << 10;
    let low_ten = u32::from(trail - 0xDC00);
    let scalar = 0x10000 + (high_ten | low_ten);
    // SAFETY: assumes `lead` is in 0xD800..=0xDBFF and `trail` is in
    // 0xDC00..=0xDFFF (TODO confirm at call sites); `scalar` then falls in
    // 0x10000..=0x10FFFF, which is always a valid `char`.
    unsafe { char::from_u32_unchecked(scalar) }
}
924
925/// Copied from str::is_char_boundary
926#[inline]
927pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
928    if index == 0 {
929        return true;
930    }
931    match slice.bytes.get(index) {
932        None => index == slice.len(),
933        Some(&b) => (b as i8) >= -0x40,
934    }
935}
936
937/// Verify that `index` is at the edge of either a valid UTF-8 codepoint
938/// (i.e. a codepoint that's not a surrogate) or of the whole string.
939///
940/// These are the cases currently permitted by `OsStr::slice_encoded_bytes`.
941/// Splitting between surrogates is valid as far as WTF-8 is concerned, but
942/// we do not permit it in the public API because WTF-8 is considered an
943/// implementation detail.
944#[track_caller]
945#[inline]
946pub fn check_utf8_boundary(slice: &Wtf8, index: usize) {
947    if index == 0 {
948        return;
949    }
950    match slice.bytes.get(index) {
951        Some(0xED) => (), // Might be a surrogate
952        Some(&b) if (b as i8) >= -0x40 => return,
953        Some(_) => panic!("byte index {index} is not a codepoint boundary"),
954        None if index == slice.len() => return,
955        None => panic!("byte index {index} is out of bounds"),
956    }
957    if slice.bytes[index + 1] >= 0xA0 {
958        // There's a surrogate after index. Now check before index.
959        if index >= 3 && slice.bytes[index - 3] == 0xED && slice.bytes[index - 2] >= 0xA0 {
960            panic!("byte index {index} lies between surrogate codepoints");
961        }
962    }
963}
964
965/// Copied from core::str::raw::slice_unchecked
966#[inline]
967pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
968    // SAFETY: memory layout of a &[u8] and &Wtf8 are the same
969    unsafe {
970        let len = end - begin;
971        let start = s.as_bytes().as_ptr().add(begin);
972        Wtf8::from_bytes_unchecked(slice::from_raw_parts(start, len))
973    }
974}
975
/// Diverging failure path used by the slicing impls when a requested range
/// is rejected.
///
/// Copied from core::str::raw::slice_error_fail
///
/// # Panics
///
/// Always panics: with an assertion failure when `begin > end`, otherwise
/// with a message naming both indices and the debug form of `s`.
#[inline(never)]
pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
    // An inverted range is reported by this assertion instead of the
    // boundary message below.
    assert!(begin <= end);
    panic!("index {begin} and/or {end} in `{s:?}` do not lie on character boundary");
}
982
/// Iterator for the code points of a WTF-8 string.
///
/// Created with the method `.code_points()`.
#[derive(Clone)]
pub struct Wtf8CodePoints<'a> {
    // Byte iterator over the not-yet-decoded part of the string.
    bytes: slice::Iter<'a, u8>,
}
990
991impl Iterator for Wtf8CodePoints<'_> {
992    type Item = CodePoint;
993
994    #[inline]
995    fn next(&mut self) -> Option<CodePoint> {
996        // SAFETY: `self.bytes` has been created from a WTF-8 string
997        unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) }
998    }
999
1000    #[inline]
1001    fn size_hint(&self) -> (usize, Option<usize>) {
1002        let len = self.bytes.len();
1003        (len.saturating_add(3) / 4, Some(len))
1004    }
1005}
1006
/// Generates a wide character sequence for potentially ill-formed UTF-16.
#[stable(feature = "rust1", since = "1.0.0")]
#[derive(Clone)]
pub struct EncodeWide<'a> {
    // Source of the code points still to be encoded.
    code_points: Wtf8CodePoints<'a>,
    // Second `u16` of a two-unit encoding, held back until the next call
    // to `next`; 0 means nothing is pending.
    extra: u16,
}
1014
1015// Copied from libunicode/u_str.rs
1016#[stable(feature = "rust1", since = "1.0.0")]
1017impl Iterator for EncodeWide<'_> {
1018    type Item = u16;
1019
1020    #[inline]
1021    fn next(&mut self) -> Option<u16> {
1022        if self.extra != 0 {
1023            let tmp = self.extra;
1024            self.extra = 0;
1025            return Some(tmp);
1026        }
1027
1028        let mut buf = [0; MAX_LEN_UTF16];
1029        self.code_points.next().map(|code_point| {
1030            let n = encode_utf16_raw(code_point.value, &mut buf).len();
1031            if n == 2 {
1032                self.extra = buf[1];
1033            }
1034            buf[0]
1035        })
1036    }
1037
1038    #[inline]
1039    fn size_hint(&self) -> (usize, Option<usize>) {
1040        let (low, high) = self.code_points.size_hint();
1041        let ext = (self.extra != 0) as usize;
1042        // every code point gets either one u16 or two u16,
1043        // so this iterator is between 1 or 2 times as
1044        // long as the underlying iterator.
1045        (low + ext, high.and_then(|n| n.checked_mul(2)).and_then(|n| n.checked_add(ext)))
1046    }
1047}
1048
/// Marks `EncodeWide` as a fused iterator; the impl adds no methods.
#[stable(feature = "encode_wide_fused_iterator", since = "1.62.0")]
impl FusedIterator for EncodeWide<'_> {}
1051
1052impl Hash for CodePoint {
1053    #[inline]
1054    fn hash<H: Hasher>(&self, state: &mut H) {
1055        self.value.hash(state)
1056    }
1057}
1058
1059impl Hash for Wtf8Buf {
1060    #[inline]
1061    fn hash<H: Hasher>(&self, state: &mut H) {
1062        state.write(&self.bytes);
1063        0xfeu8.hash(state)
1064    }
1065}
1066
1067impl Hash for Wtf8 {
1068    #[inline]
1069    fn hash<H: Hasher>(&self, state: &mut H) {
1070        state.write(&self.bytes);
1071        0xfeu8.hash(state)
1072    }
1073}
1074
1075#[unstable(feature = "clone_to_uninit", issue = "126799")]
1076unsafe impl CloneToUninit for Wtf8 {
1077    #[inline]
1078    #[cfg_attr(debug_assertions, track_caller)]
1079    unsafe fn clone_to_uninit(&self, dst: *mut u8) {
1080        // SAFETY: we're just a transparent wrapper around [u8]
1081        unsafe { self.bytes.clone_to_uninit(dst) }
1082    }
1083}
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy