core/char/
decode.rs

//! UTF-16 decoding iterators
2
3use crate::error::Error;
4use crate::fmt;
5use crate::iter::FusedIterator;
6
7/// An iterator that decodes UTF-16 encoded code points from an iterator of `u16`s.
8///
9/// This `struct` is created by the [`decode_utf16`] method on [`char`]. See its
10/// documentation for more.
11///
12/// [`decode_utf16`]: char::decode_utf16
13#[stable(feature = "decode_utf16", since = "1.9.0")]
14#[derive(Clone, Debug)]
15pub struct DecodeUtf16<I>
16where
17    I: Iterator<Item = u16>,
18{
19    iter: I,
20    buf: Option<u16>,
21}
22
23/// An error that can be returned when decoding UTF-16 code points.
24///
25/// This `struct` is created when using the [`DecodeUtf16`] type.
26#[stable(feature = "decode_utf16", since = "1.9.0")]
27#[derive(Debug, Clone, Eq, PartialEq)]
28pub struct DecodeUtf16Error {
29    code: u16,
30}
31
32/// Creates an iterator over the UTF-16 encoded code points in `iter`,
33/// returning unpaired surrogates as `Err`s. See [`char::decode_utf16`].
34#[inline]
35pub(super) fn decode_utf16<I: IntoIterator<Item = u16>>(iter: I) -> DecodeUtf16<I::IntoIter> {
36    DecodeUtf16 { iter: iter.into_iter(), buf: None }
37}
38
39#[stable(feature = "decode_utf16", since = "1.9.0")]
40impl<I: Iterator<Item = u16>> Iterator for DecodeUtf16<I> {
41    type Item = Result<char, DecodeUtf16Error>;
42
43    fn next(&mut self) -> Option<Result<char, DecodeUtf16Error>> {
44        let u = match self.buf.take() {
45            Some(buf) => buf,
46            None => self.iter.next()?,
47        };
48
49        if !u.is_utf16_surrogate() {
50            // SAFETY: not a surrogate
51            Some(Ok(unsafe { char::from_u32_unchecked(u as u32) }))
52        } else if u >= 0xDC00 {
53            // a trailing surrogate
54            Some(Err(DecodeUtf16Error { code: u }))
55        } else {
56            let u2 = match self.iter.next() {
57                Some(u2) => u2,
58                // eof
59                None => return Some(Err(DecodeUtf16Error { code: u })),
60            };
61            if u2 < 0xDC00 || u2 > 0xDFFF {
62                // not a trailing surrogate so we're not a valid
63                // surrogate pair, so rewind to redecode u2 next time.
64                self.buf = Some(u2);
65                return Some(Err(DecodeUtf16Error { code: u }));
66            }
67
68            // all ok, so lets decode it.
69            let c = (((u & 0x3ff) as u32) << 10 | (u2 & 0x3ff) as u32) + 0x1_0000;
70            // SAFETY: we checked that it's a legal unicode value
71            Some(Ok(unsafe { char::from_u32_unchecked(c) }))
72        }
73    }
74
75    #[inline]
76    fn size_hint(&self) -> (usize, Option<usize>) {
77        let (low, high) = self.iter.size_hint();
78
79        let (low_buf, high_buf) = match self.buf {
80            // buf is empty, no additional elements from it.
81            None => (0, 0),
82            // `u` is a non surrogate, so it's always an additional character.
83            Some(u) if !u.is_utf16_surrogate() => (1, 1),
84            // `u` is a leading surrogate (it can never be a trailing surrogate and
85            // it's a surrogate due to the previous branch) and `self.iter` is empty.
86            //
87            // `u` can't be paired, since the `self.iter` is empty,
88            // so it will always become an additional element (error).
89            Some(_u) if high == Some(0) => (1, 1),
90            // `u` is a leading surrogate and `iter` may be non-empty.
91            //
92            // `u` can either pair with a trailing surrogate, in which case no additional elements
93            // are produced, or it can become an error, in which case it's an additional character (error).
94            Some(_u) => (0, 1),
95        };
96
97        // `self.iter` could contain entirely valid surrogates (2 elements per
98        // char), or entirely non-surrogates (1 element per char).
99        //
100        // On odd lower bound, at least one element must stay unpaired
101        // (with other elements from `self.iter`), so we round up.
102        let low = low.div_ceil(2) + low_buf;
103        let high = high.and_then(|h| h.checked_add(high_buf));
104
105        (low, high)
106    }
107}
108
109#[stable(feature = "decode_utf16_fused_iterator", since = "1.75.0")]
110impl<I: Iterator<Item = u16> + FusedIterator> FusedIterator for DecodeUtf16<I> {}
111
112impl DecodeUtf16Error {
113    /// Returns the unpaired surrogate which caused this error.
114    #[must_use]
115    #[stable(feature = "decode_utf16", since = "1.9.0")]
116    pub fn unpaired_surrogate(&self) -> u16 {
117        self.code
118    }
119}
120
121#[stable(feature = "decode_utf16", since = "1.9.0")]
122impl fmt::Display for DecodeUtf16Error {
123    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
124        write!(f, "unpaired surrogate found: {:x}", self.code)
125    }
126}
127
128#[stable(feature = "decode_utf16", since = "1.9.0")]
129impl Error for DecodeUtf16Error {
130    #[allow(deprecated)]
131    fn description(&self) -> &str {
132        "unpaired surrogate found"
133    }
134}
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy