Skip to content

Commit 0589ae0

Browse files
authored
unicode decoding (#1854)
1 parent 82f7e32 commit 0589ae0

File tree

3 files changed

+289
-6
lines changed

3 files changed

+289
-6
lines changed

detect/codec/decoder_test.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@ package codec
22

33
import (
44
"encoding/hex"
5-
"github.com/stretchr/testify/assert"
65
"net/url"
76
"testing"
7+
8+
"github.com/stretchr/testify/assert"
89
)
910

1011
func TestDecode(t *testing.T) {
@@ -90,6 +91,21 @@ func TestDecode(t *testing.T) {
9091
chunk: `secret="466973684D617048756E6B79212121363334"`,
9192
expected: `secret="FishMapHunky!!!634"`,
9293
},
94+
{
95+
name: "unicode encoded value",
96+
chunk: `secret=U+0061 U+0062 U+0063 U+0064 U+0065 U+0066`,
97+
expected: "secret=abcdef",
98+
},
99+
{
100+
name: "unicode encoded value backslashed",
101+
chunk: `secret=\\u0068\\u0065\\u006c\\u006c\\u006f\\u0020\\u0077\\u006f\\u0072\\u006c\\u0064\\u0020\\u0064\\u0075\\u0064\\u0065`,
102+
expected: "secret=hello world dude",
103+
},
104+
{
105+
name: "unicode encoded value backslashed mixed w/ hex",
106+
chunk: `secret=\u0068\u0065\u006c\u006c\u006f\u0020\u0077\u006f\u0072\u006c\u0064 6C6F76656C792070656F706C65206F66206561727468`,
107+
expected: "secret=hello world lovely people of earth",
108+
},
93109
}
94110

95111
decoder := NewDecoder()

detect/codec/encodings.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,22 @@ var (
1919
// (e.g. base64). If two encoding matches overlap the decoder will use
2020
// this order to determine which encoding should wait till the next pass.
2121
encodings = []*encoding{
22-
&encoding{
22+
{
2323
kind: percentKind,
2424
pattern: `%[0-9A-Fa-f]{2}(?:.*%[0-9A-Fa-f]{2})?`,
2525
decode: decodePercent,
2626
},
27-
&encoding{
27+
{
28+
kind: unicodeKind,
29+
pattern: `(?:(?:U\+[a-fA-F0-9]{4}(?:\s|$))+|(?i)(?:\\{1,2}u[a-fA-F0-9]{4})+)`,
30+
decode: decodeUnicode,
31+
},
32+
{
2833
kind: hexKind,
2934
pattern: `[0-9A-Fa-f]{32,}`,
3035
decode: decodeHex,
3136
},
32-
&encoding{
37+
{
3338
kind: base64Kind,
3439
pattern: `[\w\/+-]{16,}={0,2}`,
3540
decode: decodeBase64,
@@ -40,6 +45,7 @@ var (
4045
// encodingNames is used to map the encodingKinds to their name
4146
var encodingNames = []string{
4247
"percent",
48+
"unicode",
4349
"hex",
4450
"base64",
4551
}
@@ -51,8 +57,9 @@ type encodingKind int
5157
var (
5258
// make sure these go up by powers of 2
5359
percentKind = encodingKind(1)
54-
hexKind = encodingKind(2)
55-
base64Kind = encodingKind(4)
60+
unicodeKind = encodingKind(2)
61+
hexKind = encodingKind(4)
62+
base64Kind = encodingKind(8)
5663
)
5764

5865
func (e encodingKind) String() string {

detect/codec/unicode.go

Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
package codec
2+
3+
import (
4+
"bytes"
5+
"regexp"
6+
"strconv"
7+
"strings"
8+
"unicode/utf8"
9+
)
10+
11+
// Patterns recognising the two supported Unicode notations. Each notation has
// a "single" pattern that captures the four hex digits of one code unit and a
// "multi" pattern that matches a whole run of them.
var (
	// unicodeEscapePat matches one programming-language style escape such as
	// \u1234 (one or two leading backslashes, case-insensitive) and captures
	// its hex digits.
	unicodeEscapePat = regexp.MustCompile(`(?i)\\{1,2}u([a-fA-F0-9]{4})`)

	// unicodeMultiEscapePat matches an unbroken run of escapes, for example
	// "\u0074\u006F\u006B".
	unicodeMultiEscapePat = regexp.MustCompile(`(?i)(?:\\{1,2}u[a-fA-F0-9]{4})+`)

	// unicodeCodePointPat matches one code point in standard notation such as
	// U+1234, captures its hex digits, and also consumes at most one
	// following character.
	unicodeCodePointPat = regexp.MustCompile(`U\+([a-fA-F0-9]{4}).?`)

	// unicodeMultiCodePointPat matches a run of code points in which every
	// entry is followed by whitespace or the end of the text, for example
	// "U+0074 U+006F U+006B".
	unicodeMultiCodePointPat = regexp.MustCompile(`(?:U\+[a-fA-F0-9]{4}(?:\s|$))+`)
)

// maxBytesPerRune is the largest number of bytes UTF-8 needs for one rune
// (a rune is encoded as 1 to 4 bytes).
const maxBytesPerRune = 4
27+
28+
// decodeUnicode decodes Unicode escape sequences in the given string
29+
func decodeUnicode(encodedValue string) string {
30+
// First, check if we have a continuous sequence of Unicode code points
31+
if matches := unicodeMultiCodePointPat.FindAllString(encodedValue, -1); len(matches) > 0 {
32+
// For each detected sequence of code points
33+
for _, match := range matches {
34+
// Decode the entire sequence at once
35+
decodedSequence := decodeMultiCodePoint(match)
36+
37+
// If we successfully decoded something, replace it in the original string
38+
if decodedSequence != "" && decodedSequence != match {
39+
encodedValue = strings.Replace(encodedValue, match, decodedSequence, 1)
40+
}
41+
}
42+
return encodedValue
43+
}
44+
45+
// Next, check if we have a continuous sequence of escape sequences
46+
if matches := unicodeMultiEscapePat.FindAllString(encodedValue, -1); len(matches) > 0 {
47+
// For each detected sequence of escape sequences
48+
for _, match := range matches {
49+
// Decode the entire sequence at once
50+
decodedSequence := decodeMultiEscape(match)
51+
52+
// If we successfully decoded something, replace it in the original string
53+
if decodedSequence != "" && decodedSequence != match {
54+
encodedValue = strings.Replace(encodedValue, match, decodedSequence, 1)
55+
}
56+
}
57+
return encodedValue
58+
}
59+
60+
// If no multi-patterns were matched, fall back to the original implementation
61+
// for individual code points and escape sequences
62+
63+
// Create a copy of the input to work with
64+
data := []byte(encodedValue)
65+
66+
// Store the result
67+
var result []byte
68+
69+
// Check and decode Unicode code points (U+1234 format)
70+
if unicodeCodePointPat.Match(data) {
71+
result = decodeIndividualCodePoints(data)
72+
}
73+
74+
// If no code points were found or we have a mix of formats,
75+
// also check for Unicode escape sequences (\u1234 format)
76+
if len(result) == 0 || unicodeEscapePat.Match(data) {
77+
// If we already have some result from code point decoding,
78+
// continue decoding escape sequences on that result
79+
if len(result) > 0 {
80+
result = decodeIndividualEscapes(result)
81+
} else {
82+
result = decodeIndividualEscapes(data)
83+
}
84+
}
85+
86+
// If nothing was decoded, return original string
87+
if len(result) == 0 || bytes.Equal(result, data) {
88+
return encodedValue
89+
}
90+
91+
return string(result)
92+
}
93+
94+
// decodeMultiCodePoint turns a whitespace-separated run of code points in
// U+XXXX notation into the string they spell.
//
// Fields that do not start with "U+", are shorter than six bytes, or whose
// hex part cannot be parsed are skipped. If no field can be decoded at all
// (including the empty input) the sequence is returned unchanged.
func decodeMultiCodePoint(sequence string) string {
	var decoded strings.Builder

	for _, field := range strings.Fields(sequence) {
		// A usable field is at least "U+" followed by four characters.
		if len(field) < 6 || field[:2] != "U+" {
			continue
		}

		value, err := strconv.ParseInt(field[2:], 16, 32)
		if err != nil {
			continue
		}

		// WriteRune emits U+FFFD for values that are not valid runes,
		// exactly like a []rune to string conversion does.
		decoded.WriteRune(rune(value))
	}

	// Every decoded rune contributes at least one byte, so an empty builder
	// means nothing was decoded.
	if decoded.Len() == 0 {
		return sequence
	}
	return decoded.String()
}
136+
137+
// decodeMultiEscape decodes a continuous sequence of Unicode escape sequences (\uXXXX format)
138+
func decodeMultiEscape(sequence string) string {
139+
// If the sequence is empty, return empty string
140+
if sequence == "" {
141+
return ""
142+
}
143+
144+
// Find all escape sequences
145+
escapes := unicodeEscapePat.FindAllStringSubmatch(sequence, -1)
146+
if len(escapes) == 0 {
147+
return sequence
148+
}
149+
150+
// Decode each escape sequence and build the result
151+
var decodedRunes []rune
152+
for _, esc := range escapes {
153+
// Extract the hexadecimal value
154+
hexValue := esc[1]
155+
156+
// Parse the hexadecimal value to an integer
157+
unicodeInt, err := strconv.ParseInt(hexValue, 16, 32)
158+
if err != nil {
159+
continue
160+
}
161+
162+
// Convert to rune and add to result
163+
decodedRunes = append(decodedRunes, rune(unicodeInt))
164+
}
165+
166+
// If we didn't decode anything, return the original sequence
167+
if len(decodedRunes) == 0 {
168+
return sequence
169+
}
170+
171+
// Return the decoded string
172+
return string(decodedRunes)
173+
}
174+
175+
// decodeIndividualCodePoints decodes individual Unicode code points (U+1234 format)
176+
// This is a fallback for when we don't have a continuous sequence of code points
177+
func decodeIndividualCodePoints(input []byte) []byte {
178+
// Find all Unicode code point sequences in the input byte slice
179+
indices := unicodeCodePointPat.FindAllSubmatchIndex(input, -1)
180+
181+
// If none found, return original input
182+
if len(indices) == 0 {
183+
return input
184+
}
185+
186+
// Iterate over found indices in reverse order to avoid modifying the slice length
187+
utf8Bytes := make([]byte, maxBytesPerRune)
188+
for i := len(indices) - 1; i >= 0; i-- {
189+
matches := indices[i]
190+
191+
startIndex := matches[0]
192+
endIndex := matches[1]
193+
hexStartIndex := matches[2]
194+
hexEndIndex := matches[3]
195+
196+
// If the input is like `U+1234 U+5678` we should replace `U+1234 `.
197+
// Otherwise, we should only replace `U+1234`.
198+
if endIndex != hexEndIndex && endIndex < len(input) && input[endIndex-1] == ' ' {
199+
endIndex = endIndex - 1
200+
}
201+
202+
// Extract the hexadecimal value from the escape sequence
203+
hexValue := string(input[hexStartIndex:hexEndIndex])
204+
205+
// Parse the hexadecimal value to an integer
206+
unicodeInt, err := strconv.ParseInt(hexValue, 16, 32)
207+
if err != nil {
208+
// If there's an error, continue to the next escape sequence
209+
continue
210+
}
211+
212+
// Convert the Unicode code point to a UTF-8 representation
213+
utf8Len := utf8.EncodeRune(utf8Bytes, rune(unicodeInt))
214+
215+
// Replace the escape sequence with the UTF-8 representation
216+
input = append(input[:startIndex], append(utf8Bytes[:utf8Len], input[endIndex:]...)...)
217+
}
218+
219+
return input
220+
}
221+
222+
// decodeIndividualEscapes decodes individual Unicode escape sequences (\u1234 format)
223+
// This is a fallback for when we don't have a continuous sequence of escape sequences
224+
func decodeIndividualEscapes(input []byte) []byte {
225+
// Find all Unicode escape sequences in the input byte slice
226+
indices := unicodeEscapePat.FindAllSubmatchIndex(input, -1)
227+
228+
// If none found, return original input
229+
if len(indices) == 0 {
230+
return input
231+
}
232+
233+
// Iterate over found indices in reverse order to avoid modifying the slice length
234+
utf8Bytes := make([]byte, maxBytesPerRune)
235+
for i := len(indices) - 1; i >= 0; i-- {
236+
matches := indices[i]
237+
238+
startIndex := matches[0]
239+
hexStartIndex := matches[2]
240+
endIndex := matches[3]
241+
242+
// Extract the hexadecimal value from the escape sequence
243+
hexValue := string(input[hexStartIndex:endIndex])
244+
245+
// Parse the hexadecimal value to an integer
246+
unicodeInt, err := strconv.ParseInt(hexValue, 16, 32)
247+
if err != nil {
248+
// If there's an error, continue to the next escape sequence
249+
continue
250+
}
251+
252+
// Convert the Unicode code point to a UTF-8 representation
253+
utf8Len := utf8.EncodeRune(utf8Bytes, rune(unicodeInt))
254+
255+
// Replace the escape sequence with the UTF-8 representation
256+
input = append(input[:startIndex], append(utf8Bytes[:utf8Len], input[endIndex:]...)...)
257+
}
258+
259+
return input
260+
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy