Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: sanitize messageID from \u0000 and irregular utf8 runes #4063

Merged
merged 8 commits into from
Nov 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions gateway/gateway_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1344,6 +1344,77 @@ var _ = Describe("Gateway", func() {
Expect(err).To(BeNil())
})

It("sanitizes messageID, trim space and replace with new uuid", func() {
	// The messageId consists solely of whitespace and invisible characters,
	// so nothing is left once it is sanitized and trimmed. The gateway is
	// then expected to replace it with a freshly generated (v4) UUID.
	payloadMap := map[string]interface{}{
		"batch": []interface{}{
			map[string]interface{}{
				"type":      "track",
				"userId":    map[string]interface{}{"id": 456},
				"messageId": " \u0000\u00A0\t\n\r\u034F ",
			},
		},
	}
	payload, err := json.Marshal(payloadMap)
	Expect(err).To(BeNil())
	req := &webRequestT{
		reqType:        "batch",
		authContext:    rCtxEnabled,
		done:           make(chan<- string),
		userIDHeader:   userIDHeader,
		requestPayload: payload,
	}
	jobForm, err := gateway.getJobDataFromRequest(req)
	Expect(err).To(BeNil())

	var job struct {
		Batch []struct {
			// The tag spells the key exactly as it appears in the payload
			// ("messageId") instead of relying on encoding/json's
			// case-insensitive fallback matching.
			MessageID string `json:"messageId"`
		} `json:"batch"`
	}

	err = json.Unmarshal(jobForm.jobs[0].EventPayload, &job)
	Expect(err).To(BeNil())
	// Fail with a readable assertion instead of an index-out-of-range panic.
	Expect(job.Batch).NotTo(BeEmpty())

	u, err := uuid.Parse(job.Batch[0].MessageID)
	Expect(err).To(BeNil())
	Expect(u.Version()).To(Equal(uuid.Version(4)))
})

It("sanitizes messageID, remove bad runes and trim space", func() {
	// The messageId has a meaningful core ("-a-random-string") wrapped in
	// whitespace and invisible characters; only the core must survive.
	payloadMap := map[string]interface{}{
		"batch": []interface{}{
			map[string]interface{}{
				"type":      "track",
				"userId":    map[string]interface{}{"id": 456},
				"messageId": "\u0000 -a-random-string \u00A0\t\n\r\u034F",
			},
		},
	}
	payload, err := json.Marshal(payloadMap)
	Expect(err).To(BeNil())
	req := &webRequestT{
		reqType:        "batch",
		authContext:    rCtxEnabled,
		done:           make(chan<- string),
		userIDHeader:   userIDHeader,
		requestPayload: payload,
	}
	jobForm, err := gateway.getJobDataFromRequest(req)
	Expect(err).To(BeNil())

	var job struct {
		Batch []struct {
			// The tag spells the key exactly as it appears in the payload
			// ("messageId") instead of relying on encoding/json's
			// case-insensitive fallback matching.
			MessageID string `json:"messageId"`
		} `json:"batch"`
	}

	err = json.Unmarshal(jobForm.jobs[0].EventPayload, &job)
	Expect(err).To(BeNil())
	// Fail with a readable assertion instead of an index-out-of-range panic.
	Expect(job.Batch).NotTo(BeEmpty())
	Expect(job.Batch[0].MessageID).To(Equal("-a-random-string"))
})

It("allows extract events even if userID and anonID are not present in the request payload", func() {
req := &webRequestT{
reqType: "batch",
Expand Down
5 changes: 4 additions & 1 deletion gateway/handle.go
Original file line number Diff line number Diff line change
Expand Up @@ -274,8 +274,11 @@ func (gw *Handle) getJobDataFromRequest(req *webRequestT) (jobData *jobFromReq,

fillMessageID := func(event map[string]interface{}) {
messageID, _ := event["messageId"].(string)
if strings.TrimSpace(messageID) == "" {
messageID = strings.TrimSpace(misc.SanitizeUnicode(messageID))
if messageID == "" {
event["messageId"] = uuid.New().String()
} else {
event["messageId"] = messageID
}
}

Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ require (
golang.org/x/exp v0.0.0-20230905200255-921286631fa9
golang.org/x/oauth2 v0.13.0
golang.org/x/sync v0.4.0
golang.org/x/text v0.13.0
google.golang.org/api v0.148.0
google.golang.org/genproto/googleapis/rpc v0.0.0-20231012201019-e917dd12ba7a
google.golang.org/grpc v1.58.3
Expand Down Expand Up @@ -290,7 +291,6 @@ require (
golang.org/x/net v0.17.0 // indirect
golang.org/x/sys v0.13.0 // indirect
golang.org/x/term v0.13.0 // indirect
golang.org/x/text v0.13.0 // indirect
golang.org/x/tools v0.13.0 // indirect
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect
google.golang.org/appengine v1.6.7 // indirect
Expand Down
85 changes: 85 additions & 0 deletions utils/misc/unicode.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package misc

import (
"strings"
"unicode"

"golang.org/x/text/unicode/rangetable"
)

// InvisibleRunes lists code points that render as nothing (or as blank space)
// and are stripped by SanitizeUnicode in addition to whatever unicode.IsPrint
// already rejects. unicode.IsPrint does not cover all invisible characters,
// so this list was taken from https://invisible-characters.com/.
//
// The regular ASCII space (U+0020) is deliberately left out of the list.
var InvisibleRunes = []rune{
	'\u0000', // NULL
	'\u0009', // CHARACTER TABULATION
	// '\u0020', // SPACE <- intentionally excluded, regular spaces are kept
	'\u00A0', // NO-BREAK SPACE
	'\u00AD', // SOFT HYPHEN
	'\u034F', // COMBINING GRAPHEME JOINER
	'\u061C', // ARABIC LETTER MARK
	'\u115F', // HANGUL CHOSEONG FILLER
	'\u1160', // HANGUL JUNGSEONG FILLER
	'\u17B4', // KHMER VOWEL INHERENT AQ
	'\u17B5', // KHMER VOWEL INHERENT AA
	'\u180E', // MONGOLIAN VOWEL SEPARATOR
	'\u2000', // EN QUAD
	'\u2001', // EM QUAD
	'\u2002', // EN SPACE
	'\u2003', // EM SPACE
	'\u2004', // THREE-PER-EM SPACE
	'\u2005', // FOUR-PER-EM SPACE
	'\u2006', // SIX-PER-EM SPACE
	'\u2007', // FIGURE SPACE
	'\u2008', // PUNCTUATION SPACE
	'\u2009', // THIN SPACE
	'\u200A', // HAIR SPACE
	'\u200B', // ZERO WIDTH SPACE
	'\u200C', // ZERO WIDTH NON-JOINER
	'\u200D', // ZERO WIDTH JOINER
	'\u200E', // LEFT-TO-RIGHT MARK
	'\u200F', // RIGHT-TO-LEFT MARK
	'\u202F', // NARROW NO-BREAK SPACE
	'\u205F', // MEDIUM MATHEMATICAL SPACE
	'\u2060', // WORD JOINER
	'\u2061', // FUNCTION APPLICATION
	'\u2062', // INVISIBLE TIMES
	'\u2063', // INVISIBLE SEPARATOR
	'\u2064', // INVISIBLE PLUS
	'\u206A', // INHIBIT SYMMETRIC SWAPPING
	'\u206B', // ACTIVATE SYMMETRIC SWAPPING
	'\u206C', // INHIBIT ARABIC FORM SHAPING
	'\u206D', // ACTIVATE ARABIC FORM SHAPING
	'\u206E', // NATIONAL DIGIT SHAPES
	'\u206F', // NOMINAL DIGIT SHAPES
	'\u3000', // IDEOGRAPHIC SPACE
	'\u2800', // BRAILLE PATTERN BLANK
	'\u3164', // HANGUL FILLER
	'\uFEFF', // ZERO WIDTH NO-BREAK SPACE
	'\uFFA0', // HALFWIDTH HANGUL FILLER
}

var invisibleRangeTable *unicode.RangeTable

func init() {
invisibleRangeTable = rangetable.New(InvisibleRunes...)
}

// SanitizeUnicode returns str with every unwanted rune dropped.
//
// A rune is dropped when it is either
//   - listed in InvisibleRunes (https://invisible-characters.com/), or
//   - not printable according to Go's unicode.IsPrint.
//
// All other runes are kept unchanged and in their original order.
//
// Note: the regular ASCII space (0x20) is not removed.
func SanitizeUnicode(str string) string {
	keepOrDrop := func(r rune) rune {
		switch {
		case unicode.Is(invisibleRangeTable, r), !unicode.IsPrint(r):
			// A negative result tells strings.Map to omit the rune.
			return -1
		default:
			return r
		}
	}
	return strings.Map(keepOrDrop, str)
}
89 changes: 89 additions & 0 deletions utils/misc/unicode_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package misc_test

import (
"fmt"
"testing"
"unicode"

"github.com/stretchr/testify/require"
"golang.org/x/text/unicode/rangetable"

"github.com/rudderlabs/rudder-server/utils/misc"
)

// out is a package-level sink that every benchmark iteration assigns to;
// presumably it exists so the compiler cannot discard the sanitize calls as
// dead code.
var out string

// BenchmarkMessageID compares the hand-written loop helper
// (sanitizeMessageIDForLoop) with the strings.Map based misc.SanitizeUnicode.
// Each implementation is measured on two inputs: a "dirty" ID containing
// \u0000 and \u034F, and a "proper" UUID-shaped ID.
func BenchmarkMessageID(b *testing.B) {
	dirtyMessageID := "\u0000 Test foo_bar-baz \u034F 123-222 "
	properMessageID := "123e4567-e89b-12d3-a456-426614174000"

	// Loop helper, dirty input.
	b.Run("in-place for loop - dirty", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			out = sanitizeMessageIDForLoop(dirtyMessageID)
		}
	})

	// Loop helper, proper input.
	b.Run("in-place for loop - proper", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			out = sanitizeMessageIDForLoop(properMessageID)
		}
	})

	// strings.Map implementation, dirty input.
	b.Run("strings map - dirty", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			out = misc.SanitizeUnicode(dirtyMessageID)
		}
	})

	// strings.Map implementation, proper input.
	b.Run("strings map - proper", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			out = misc.SanitizeUnicode(properMessageID)
		}
	})
}

var invisibleRangeTable *unicode.RangeTable

func init() {
invisibleRangeTable = rangetable.New(misc.InvisibleRunes...)
}

// sanitizeMessageIDForLoop is a hand-written, single-pass alternative to
// misc.SanitizeUnicode, kept only as a baseline for the benchmark.
//
// It drops every rune that is not printable (unicode.IsPrint) or that is
// listed in misc.InvisibleRunes, and keeps everything else in order. When no
// rune has to be dropped the input string is returned as is, without
// allocating.
//
// The previous version re-sliced messageID while ranging over it. Because the
// range expression is evaluated only once, the indices referred to the
// original string: the wrong bytes were removed, multi-byte runes were cut
// after their first byte, and inputs with several removable runes (for
// example "\u0000\u0000") could panic with a slice-bounds error. It also only
// removed runes that were both non-printable and in the table.
func sanitizeMessageIDForLoop(messageID string) string {
	var cleaned []byte // stays nil until the first rune has to be dropped
	runStart := -1     // byte offset where the pending run of kept runes begins; -1 if none

	for i, r := range messageID {
		if unicode.IsPrint(r) && !unicode.Is(invisibleRangeTable, r) {
			if runStart < 0 {
				runStart = i
			}
			continue
		}

		// r must be dropped: flush the kept run that precedes it.
		if cleaned == nil {
			cleaned = make([]byte, 0, len(messageID))
		}
		if runStart >= 0 {
			cleaned = append(cleaned, messageID[runStart:i]...)
			runStart = -1
		}
	}

	if cleaned == nil {
		// Nothing was dropped.
		return messageID
	}
	if runStart >= 0 {
		cleaned = append(cleaned, messageID[runStart:]...)
	}
	return string(cleaned)
}

// TestSanitizeMessageID checks misc.SanitizeUnicode in two ways: against a
// table of input/expected pairs, and by verifying that every rune listed in
// misc.InvisibleRunes is removed when sanitized on its own.
func TestSanitizeMessageID(t *testing.T) {
	type sanitizeCase struct {
		input string
		want  string
	}
	cases := []sanitizeCase{
		{input: "\u0000 Test \u0000foo_bar-baz 123-222 \u0000", want: " Test foo_bar-baz 123-222 "},
		{input: "\u0000", want: ""},
		{input: "\u0000 ", want: " "},
		{input: "\u0000 \u0000", want: " "},
		{input: "\u00A0\t\n\r\u034F", want: ""},
		{input: "τυχαίο;", want: "τυχαίο;"},
	}

	for i := range cases {
		c := cases[i]
		got := misc.SanitizeUnicode(c.input)
		require.Equal(t, c.want, got, fmt.Sprintf("%#v -> %#v", c.input, c.want))
	}

	// Each listed invisible rune, sanitized by itself, must vanish entirely.
	for _, invisible := range misc.InvisibleRunes {
		got := misc.SanitizeUnicode(string(invisible))
		require.Empty(t, got, fmt.Sprintf("%U", invisible))
	}
}
Loading