unicode decoding (#1854)

zricethezav · web-flow · commit 0589ae029840 · 2025-05-14T10:14:31.000-05:00
diff --git a/detect/codec/decoder_test.go b/detect/codec/decoder_test.go
@@ -2,9 +2,10 @@ package codec
 
 import (
 	"encoding/hex"
-	"github.com/stretchr/testify/assert"
 	"net/url"
 	"testing"
+
+	"github.com/stretchr/testify/assert"
 )
 
 func TestDecode(t *testing.T) {
@@ -90,6 +91,21 @@ func TestDecode(t *testing.T) {
 			chunk:    `secret="466973684D617048756E6B79212121363334"`,
 			expected: `secret="FishMapHunky!!!634"`,
 		},
+		{
+			name:     "unicode encoded value",
+			chunk:    `secret=U+0061 U+0062 U+0063 U+0064 U+0065 U+0066`,
+			expected: "secret=abcdef",
+		},
+		{
+			name:     "unicode encoded value backslashed",
+			chunk:    `secret=\\u0068\\u0065\\u006c\\u006c\\u006f\\u0020\\u0077\\u006f\\u0072\\u006c\\u0064\\u0020\\u0064\\u0075\\u0064\\u0065`,
+			expected: "secret=hello world dude",
+		},
+		{
+			name:     "unicode encoded value backslashed mixed w/ hex",
+			chunk:    `secret=\u0068\u0065\u006c\u006c\u006f\u0020\u0077\u006f\u0072\u006c\u0064 6C6F76656C792070656F706C65206F66206561727468`,
+			expected: "secret=hello world lovely people of earth",
+		},
 	}
 
 	decoder := NewDecoder()
diff --git a/detect/codec/encodings.go b/detect/codec/encodings.go
@@ -19,17 +19,22 @@ var (
 	// (e.g. base64). If two encoding matches overlap the decoder will use
 	// this order to determine which encoding should wait till the next pass.
 	encodings = []*encoding{
-		&encoding{
+		{
 			kind:    percentKind,
 			pattern: `%[0-9A-Fa-f]{2}(?:.*%[0-9A-Fa-f]{2})?`,
 			decode:  decodePercent,
 		},
-		&encoding{
+		{
+			kind:    unicodeKind,
+			pattern: `(?:(?:U\+[a-fA-F0-9]{4}(?:\s|$))+|(?i)(?:\\{1,2}u[a-fA-F0-9]{4})+)`,
+			decode:  decodeUnicode,
+		},
+		{
 			kind:    hexKind,
 			pattern: `[0-9A-Fa-f]{32,}`,
 			decode:  decodeHex,
 		},
-		&encoding{
+		{
 			kind:    base64Kind,
 			pattern: `[\w\/+-]{16,}={0,2}`,
 			decode:  decodeBase64,
@@ -40,6 +45,7 @@ var (
 // encodingNames is used to map the encodingKinds to their name
 var encodingNames = []string{
 	"percent",
+	"unicode",
 	"hex",
 	"base64",
 }
@@ -51,8 +57,9 @@ type encodingKind int
 var (
 	// make sure these go up by powers of 2
 	percentKind = encodingKind(1)
-	hexKind     = encodingKind(2)
-	base64Kind  = encodingKind(4)
+	unicodeKind = encodingKind(2)
+	hexKind     = encodingKind(4)
+	base64Kind  = encodingKind(8)
 )
 
 func (e encodingKind) String() string {
diff --git a/detect/codec/unicode.go b/detect/codec/unicode.go
@@ -0,0 +1,260 @@
+package codec
+
+import (
+	"bytes"
+	"regexp"
+	"strconv"
+	"strings"
+	"unicode/utf8"
+)
+
+var (
+	// Single code point in standard notation (e.g., U+1234). Captures the four hex digits and also consumes one following character (other than a newline), if any.
+	unicodeCodePointPat = regexp.MustCompile(`U\+([a-fA-F0-9]{4}).?`)
+
+	// Run of code points, each followed by whitespace or the end of the text (e.g., "U+0074 U+006F U+006B").
+	unicodeMultiCodePointPat = regexp.MustCompile(`(?:U\+[a-fA-F0-9]{4}(?:\s|$))+`)
+
+	// Single escape as used in programming languages (e.g., \u1234): case-insensitive, one or two backslashes, captures the four hex digits.
+	unicodeEscapePat = regexp.MustCompile(`(?i)\\{1,2}u([a-fA-F0-9]{4})`)
+
+	// Run of one or more consecutive escapes (e.g., "\u0074\u006F\u006B"), each in the form accepted by unicodeEscapePat.
+	unicodeMultiEscapePat = regexp.MustCompile(`(?i)(?:\\{1,2}u[a-fA-F0-9]{4})+`)
+)
+
+// maxBytesPerRune is the scratch buffer size for encoding one rune: UTF-8 uses 1 to 4 bytes per rune.
+const maxBytesPerRune = 4
+
+// decodeUnicode decodes Unicode code points and escape sequences found in
+// encodedValue and returns the text with every recognised sequence replaced by
+// the characters it stands for. Input without such sequences is returned as is.
+//
+// The first strategy that applies to the input is used:
+//  1. runs of code points, each followed by whitespace or the end of the
+//     input, e.g. "U+0061 U+0062"
+//  2. runs of escape sequences with one or two backslashes, e.g. "\u0061\u0062"
+//  3. isolated code points and escapes that are not part of such a run
+func decodeUnicode(encodedValue string) string {
+	// Runs of code points. ReplaceAllStringFunc rewrites each match at the
+	// position where it was found, in a single pass over the input. Searching
+	// the partially decoded string for each match in turn could instead hit
+	// text that an earlier replacement produced.
+	if unicodeMultiCodePointPat.MatchString(encodedValue) {
+		return unicodeMultiCodePointPat.ReplaceAllStringFunc(encodedValue, decodeMultiCodePoint)
+	}
+
+	// Runs of escape sequences, replaced the same way.
+	if unicodeMultiEscapePat.MatchString(encodedValue) {
+		return unicodeMultiEscapePat.ReplaceAllStringFunc(encodedValue, decodeMultiEscape)
+	}
+
+	// No run matched: fall back to decoding individual sequences.
+	//
+	// The helpers receive a scratch copy and every check below reads the
+	// untouched original, so this function does not rely on the helpers leaving
+	// their argument unmodified.
+	original := []byte(encodedValue)
+	decoded := []byte(encodedValue)
+
+	// Unicode code points (U+1234 format).
+	if unicodeCodePointPat.Match(original) {
+		decoded = decodeIndividualCodePoints(decoded)
+	}
+
+	// Unicode escape sequences (\u1234 format), continuing on the result of the
+	// previous step when the input mixes both formats.
+	if unicodeEscapePat.Match(original) {
+		decoded = decodeIndividualEscapes(decoded)
+	}
+
+	// If nothing was decoded, return the original string.
+	if bytes.Equal(decoded, original) {
+		return encodedValue
+	}
+
+	return string(decoded)
+}
+
+// decodeMultiCodePoint turns a whitespace-separated run of Unicode code points
+// written in U+XXXX notation into the text it spells out.
+//
+// Tokens that do not start with "U+", have fewer than four characters after
+// that prefix, or fail to parse as hexadecimal are skipped. The input is
+// returned unchanged when no token could be decoded; an empty input yields an
+// empty string.
+func decodeMultiCodePoint(sequence string) string {
+	if sequence == "" {
+		return ""
+	}
+
+	const (
+		prefix    = "U+"
+		minLength = len(prefix) + 4 // prefix plus at least four hex digits
+	)
+
+	var decoded strings.Builder
+	for _, token := range strings.Fields(sequence) {
+		if len(token) < minLength || !strings.HasPrefix(token, prefix) {
+			continue
+		}
+
+		// A bit size of 32 rejects values that would not fit in a rune.
+		value, err := strconv.ParseInt(token[len(prefix):], 16, 32)
+		if err != nil {
+			continue
+		}
+
+		// WriteRune stores U+FFFD for values that are not valid code points,
+		// which matches the result of converting a []rune to a string.
+		decoded.WriteRune(rune(value))
+	}
+
+	// Every decoded rune occupies at least one byte, so an empty builder means
+	// that no token was decodable (this includes whitespace-only input).
+	if decoded.Len() == 0 {
+		return sequence
+	}
+
+	return decoded.String()
+}
+
+// decodeMultiEscape decodes a run of \uXXXX escape sequences (one or two leading
+// backslashes, matched case-insensitively) into the text it represents.
+//
+// \u escapes carry UTF-16 code units, so a high surrogate directly followed by a
+// low surrogate is merged into the single character the pair encodes, e.g.
+// \uD83D\uDE00 becomes U+1F600. Unpaired surrogates decode to U+FFFD. Input
+// without any escape is returned unchanged; an empty input yields "".
+func decodeMultiEscape(sequence string) string {
+	if sequence == "" {
+		return ""
+	}
+
+	// UTF-16 surrogate ranges and the first code point encoded by a pair.
+	const (
+		highSurrogateMin = 0xD800
+		highSurrogateMax = 0xDBFF
+		lowSurrogateMin  = 0xDC00
+		lowSurrogateMax  = 0xDFFF
+		supplementaryMin = 0x10000
+	)
+
+	escapes := unicodeEscapePat.FindAllStringSubmatchIndex(sequence, -1)
+	decodedRunes := make([]rune, 0, len(escapes))
+	for i := 0; i < len(escapes); i++ {
+		loc := escapes[i]
+
+		// loc[2]:loc[3] spans the four hex digits captured by the pattern.
+		value, err := strconv.ParseInt(sequence[loc[2]:loc[3]], 16, 32)
+		if err != nil {
+			continue
+		}
+
+		// Look ahead for the low half when this unit is a high surrogate. The
+		// next escape must begin exactly where this one ends to form a pair.
+		if value >= highSurrogateMin && value <= highSurrogateMax && i+1 < len(escapes) && escapes[i+1][0] == loc[1] {
+			next := escapes[i+1]
+			low, lowErr := strconv.ParseInt(sequence[next[2]:next[3]], 16, 32)
+			if lowErr == nil && low >= lowSurrogateMin && low <= lowSurrogateMax {
+				value = supplementaryMin + (value-highSurrogateMin)<<10 + (low - lowSurrogateMin)
+				i++ // the low half has been consumed
+			}
+		}
+
+		decodedRunes = append(decodedRunes, rune(value))
+	}
+
+	// If we didn't decode anything, return the original sequence.
+	if len(decodedRunes) == 0 {
+		return sequence
+	}
+
+	return string(decodedRunes)
+}
+
+// decodeIndividualCodePoints decodes isolated Unicode code points (U+1234
+// format) in input and returns the decoded bytes. It is the fallback for text
+// that does not contain a continuous run of code points.
+//
+// A single space directly after a code point is treated as its separator and is
+// removed together with it; any other following character is preserved. input
+// itself is never modified: when something is decoded the result is built in a
+// new slice, otherwise input is returned as is.
+func decodeIndividualCodePoints(input []byte) []byte {
+	// Find all Unicode code point sequences in the input byte slice.
+	indices := unicodeCodePointPat.FindAllSubmatchIndex(input, -1)
+	if len(indices) == 0 {
+		return input
+	}
+
+	// Assemble the result front to back in a fresh buffer. This leaves the
+	// caller's bytes intact and copies every input byte at most once.
+	output := make([]byte, 0, len(input))
+	utf8Bytes := make([]byte, maxBytesPerRune)
+	copied := 0 // input[:copied] has already been emitted or replaced
+	for _, matches := range indices {
+		startIndex, endIndex := matches[0], matches[1]
+		hexStartIndex, hexEndIndex := matches[2], matches[3]
+
+		// The pattern may have consumed the character after the hex digits.
+		// If the input is like `U+1234 U+5678` we should replace `U+1234 `.
+		// Otherwise, we should only replace `U+1234`.
+		if endIndex != hexEndIndex+1 || input[hexEndIndex] != ' ' {
+			endIndex = hexEndIndex
+		}
+
+		// Parse the hexadecimal value to an integer.
+		unicodeInt, err := strconv.ParseInt(string(input[hexStartIndex:hexEndIndex]), 16, 32)
+		if err != nil {
+			// Leave the text untouched; it is copied along with the next span.
+			continue
+		}
+
+		// Emit the text before the match, then the UTF-8 form of the code point.
+		output = append(output, input[copied:startIndex]...)
+		utf8Len := utf8.EncodeRune(utf8Bytes, rune(unicodeInt))
+		output = append(output, utf8Bytes[:utf8Len]...)
+		copied = endIndex
+	}
+
+	return append(output, input[copied:]...)
+}
+
+// decodeIndividualEscapes decodes isolated Unicode escape sequences (\u1234
+// format) in input and returns the decoded bytes. It is the fallback for text
+// that does not contain a continuous run of escape sequences.
+//
+// Each escape is decoded on its own; a value that is not a valid code point,
+// such as a UTF-16 surrogate, is encoded as U+FFFD. input itself is never
+// modified: when something is decoded the result is built in a new slice,
+// otherwise input is returned as is.
+func decodeIndividualEscapes(input []byte) []byte {
+	// Find all Unicode escape sequences in the input byte slice.
+	indices := unicodeEscapePat.FindAllSubmatchIndex(input, -1)
+	if len(indices) == 0 {
+		return input
+	}
+
+	// Assemble the result front to back in a fresh buffer. This leaves the
+	// caller's bytes intact and copies every input byte at most once.
+	output := make([]byte, 0, len(input))
+	utf8Bytes := make([]byte, maxBytesPerRune)
+	copied := 0 // input[:copied] has already been emitted or replaced
+	for _, matches := range indices {
+		startIndex := matches[0]
+		hexStartIndex := matches[2]
+		endIndex := matches[3] // the hex digits are the last part of an escape
+
+		// Parse the hexadecimal value to an integer.
+		unicodeInt, err := strconv.ParseInt(string(input[hexStartIndex:endIndex]), 16, 32)
+		if err != nil {
+			// Leave the text untouched; it is copied along with the next span.
+			continue
+		}
+
+		// Emit the text before the escape, then the UTF-8 form of the code point.
+		output = append(output, input[copied:startIndex]...)
+		utf8Len := utf8.EncodeRune(utf8Bytes, rune(unicodeInt))
+		output = append(output, utf8Bytes[:utf8Len]...)
+		copied = endIndex
+	}
+
+	return append(output, input[copied:]...)
+}