Skip to content

Commit

Permalink
feat: repl invalid utf8 in serde by option (#357)
Browse files Browse the repository at this point in the history
  • Loading branch information
liuq19 authored Feb 10, 2023
1 parent f87d87d commit 02865de
Show file tree
Hide file tree
Showing 52 changed files with 23,318 additions and 20,629 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/license-check.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: License Check

on: push
on: pull_request

jobs:
build:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/push-check-go118.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: Push Check Go1.18-Linux-X64

on: push
on: pull_request

jobs:
build:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/push-check-linux-arm64.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: Push Check Linux-ARM

on: push
on: pull_request

jobs:
build:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/push-check-linux-x64.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: Push Check Linux-X64

on: push
on: pull_request

jobs:
build:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/push-check-qemu.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: Push Check Linux-Qemu

on: push
on: pull_request

jobs:
build:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/push-check-windows.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: Push Check Windows-X64

on: push
on: pull_request

jobs:
build:
Expand Down
8 changes: 5 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ CC_amd64 := clang
ASM2ASM_amd64 := tools/asm2asm/asm2asm.py

CFLAGS := -mno-red-zone
CFLAGS += -arch x86_64
CFLAGS += -target x86_64-apple-macos11
CFLAGS += -fno-asynchronous-unwind-tables
CFLAGS += -fno-builtin
CFLAGS += -fno-exceptions
Expand Down Expand Up @@ -100,8 +100,10 @@ endef
all: ${ARCH}

clean:
rm -vfr ${TMP_DIR}/{sse,avx,avx2}
rm -vfr ${OUT_DIR}/{sse,avx,avx2}
for arch in ${ARCH}; do \
rm -vfr ${TMP_DIR}/$${arch}; \
rm -vfr ${OUT_DIR}/$${arch}; \
done

$(foreach \
arch, \
Expand Down
10 changes: 7 additions & 3 deletions api.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ type Config struct {
// CopyString indicates decoder to decode string values by copying instead of referring.
CopyString bool

// ValidateString indicates decoder to valid string values: decoder will return errors when
// invalid UTF-8 chars or unescaped control chars(\u0000-\u001f) in the string value of JSON.
// ValidateString indicates decoder and encoder to validate string values: decoder will return errors
// when there are unescaped control chars(\u0000-\u001f) in the string value of JSON.
ValidateString bool
}

Expand All @@ -81,6 +81,7 @@ var (
SortMapKeys: true,
CompactMarshaler: true,
CopyString : true,
ValidateString : true,
}.Froze()

// ConfigFastest is the fastest config of APIs, aiming at speed.
Expand Down Expand Up @@ -164,12 +165,15 @@ func UnmarshalString(buf string, val interface{}) error {
return ConfigDefault.UnmarshalFromString(buf, val)
}

// Get searches the given path json,
// Get searches the given path from json,
// and returns its representing ast.Node.
//
// Each path arg must be integer or string:
// - Integer means searching current node as array
// - String means searching current node as object
//
// Note: this API expects the JSON to be at least well-formed;
// otherwise it may return unexpected results.
func Get(src []byte, path ...interface{}) (ast.Node, error) {
	// Convert the byte slice to a string and delegate to the
	// string-based lookup.
	js := string(src)
	return GetFromString(js, path...)
}
Expand Down
5 changes: 4 additions & 1 deletion ast/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -691,7 +691,10 @@ func (self *Node) AddAny(val interface{}) error {
}

// GetByPath load given path on demands,
// which only ensure nodes before this path got parsed
// which only ensure nodes before this path got parsed.
//
// Note: this API expects the JSON to be at least well-formed;
// otherwise it may return unexpected results.
func (self *Node) GetByPath(path ...interface{}) *Node {
if !self.Valid() {
return self
Expand Down
76 changes: 69 additions & 7 deletions decode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ import (
`strings`
`testing`
`time`
`unicode/utf8`
`unsafe`

`github.com/bytedance/sonic/decoder`
Expand Down Expand Up @@ -1011,8 +1010,8 @@ var unmarshalTests = []unmarshalTest{

{in: "\"\x00\"", ptr: new(interface{}), err: fmt.Errorf("json: invald char"), validateString: true},
{in: "\"\x00\"", ptr: new(string), err: fmt.Errorf("json: invald char"), validateString: true},
{in: "\"\xff\"", ptr: new(interface{}), err: fmt.Errorf("json: invald char"), validateString: true},
{in: "\"\xff\"", ptr: new(string), err: fmt.Errorf("json: invald char"), validateString: true},
{in: "\"\xff\"", ptr: new(interface{}), out: interface{}("\ufffd"), validateString: true},
{in: "\"\xff\"", ptr: new(string), out: "\ufffd", validateString: true},
{in: "\"\x00\"", ptr: new(interface{}), out: interface{}("\x00"), validateString: false},
{in: "\"\x00\"", ptr: new(string), out: "\x00", validateString: false},
{in: "\"\xff\"", ptr: new(interface{}), out: interface{}("\xff"), validateString: false},
Expand Down Expand Up @@ -1147,7 +1146,6 @@ func TestUnmarshal(t *testing.T) {
}

dec := decoder.NewDecoder(tt.in)
validUtf8 := true
if tt.useNumber {
dec.UseNumber()
}
Expand All @@ -1156,10 +1154,9 @@ func TestUnmarshal(t *testing.T) {
}
if tt.validateString {
dec.ValidateString()
validUtf8 = utf8.Valid([]byte(tt.in))
}
if err := dec.Decode(v.Interface()); (err == nil) != (tt.err == nil && validUtf8) {
spew.Dump(tt.in)
if err := dec.Decode(v.Interface()); (err == nil) != (tt.err == nil) {
spew.Dump(tt)
t.Fatalf("#%d: %v, want %v", i, err, tt.err)
continue
} else if err != nil {
Expand Down Expand Up @@ -2524,3 +2521,68 @@ func TestChangeTool(t *testing.T) {
}

}

// TestDecoder_LongestInvalidUtf8 decodes quoted strings made only of the
// byte 0x80, repeated 4096, 4095, 4097 and 12345 times, and checks each
// one through testDecodeInvalidUtf8.
func TestDecoder_LongestInvalidUtf8(t *testing.T) {
	for _, count := range []int{4096, 4095, 4097, 12345} {
		payload := "\"" + strings.Repeat("\x80", count) + "\""
		testDecodeInvalidUtf8(t, []byte(payload))
	}
}

// testDecodeInvalidUtf8 unmarshals data into a string with both
// ConfigStd and encoding/json, then asserts that the two agree on
// whether an error occurred and, when encoding/json succeeds, on the
// decoded string.
//
// encoding/json is the reference implementation, so its results are
// passed as the "expected" argument of assert.Equal; this keeps the
// expected/actual labels in failure output pointing the right way.
func testDecodeInvalidUtf8(t *testing.T, data []byte) {
	var sgot, jgot string
	serr := ConfigStd.Unmarshal(data, &sgot)
	jerr := json.Unmarshal(data, &jgot)
	assert.Equal(t, jerr != nil, serr != nil)
	if jerr == nil {
		assert.Equal(t, jgot, sgot)
	}
}

// needEscape reports whether b is a double quote, a backslash, or a
// byte below 0x20.
func needEscape(b byte) bool {
	switch {
	case b < '\x20', b == '"', b == '\\':
		return true
	}
	return false
}

// genRandJsonBytes returns a double-quoted byte sequence holding length
// random bytes drawn from 0x00..0xff. Any byte for which needEscape
// reports true is preceded by a backslash. The bytes are not required
// to form valid UTF-8.
func genRandJsonBytes(length int) []byte {
	out := new(bytes.Buffer)
	out.WriteByte('"')
	for n := length; n > 0; n-- {
		c := byte(rand.Intn(0xff + 1))
		if needEscape(c) {
			out.WriteByte('\\')
		}
		out.WriteByte(c)
	}
	out.WriteByte('"')
	return out.Bytes()
}

// genRandJsonRune returns a double-quoted byte sequence holding length
// random code points drawn from 0..0x10FFFF. Values below 0x80 for
// which needEscape reports true are written as a backslash followed by
// the raw byte; everything else is written with WriteRune.
func genRandJsonRune(length int) []byte {
	out := new(bytes.Buffer)
	out.WriteByte('"')
	for n := 0; n < length; n++ {
		cp := rand.Intn(0x10FFFF + 1)
		if cp >= 0x80 || !needEscape(byte(cp)) {
			out.WriteRune(rune(cp))
			continue
		}
		out.WriteByte('\\')
		out.WriteByte(byte(cp))
	}
	out.WriteByte('"')
	return out.Bytes()
}

func TestDecoder_RandomInvalidUtf8(t *testing.T) {
nums := 1000
maxLen := 1000
for i := 0; i < nums; i++ {
length := rand.Intn(maxLen)
testDecodeInvalidUtf8(t, genRandJsonBytes(length))
testDecodeInvalidUtf8(t, genRandJsonRune(length))
}
}
32 changes: 31 additions & 1 deletion decoder/decoder.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
`github.com/bytedance/sonic/internal/native/types`
`github.com/bytedance/sonic/internal/rt`
`github.com/bytedance/sonic/option`
`github.com/bytedance/sonic/utf8`
)

const (
Expand Down Expand Up @@ -80,9 +81,39 @@ func (self *Decoder) Reset(s string) {
// self.f = 0
}

// CheckTrailings scans the input from the decoder's current position
// and skips every byte whose bit is set in types.SPACE_MASK. It returns
// nil when the scan stops exactly at the end of the input; otherwise it
// returns a SyntaxError with code types.ERR_INVALID_CHAR positioned at
// the byte where the scan stopped.
func (self *Decoder) CheckTrailings() error {
	src := self.s
	end := len(src)
	at := self.i

	/* advance over the trailing spaces */
	for ; at < end; at++ {
		if (types.SPACE_MASK & (1 << src[at])) == 0 {
			break
		}
	}

	/* anything left over is junk after the JSON value */
	if at != end {
		return SyntaxError{
			Src:  src,
			Pos:  at,
			Code: types.ERR_INVALID_CHAR,
		}
	}

	/* reached EOF cleanly */
	return nil
}


// Decode parses the JSON-encoded data from current position and stores the result
// in the value pointed to by val.
func (self *Decoder) Decode(val interface{}) error {
/* validate json if needed */
if (self.f & (1 << _F_validate_string)) != 0 && !utf8.ValidateString(self.s){
dbuf := utf8.CorrectWith(nil, rt.Str2Mem(self.s), "\ufffd")
self.s = rt.Mem2Str(dbuf)
}

vv := rt.UnpackEface(val)
vp := vv.Value

Expand All @@ -99,7 +130,6 @@ func (self *Decoder) Decode(val interface{}) error {
/* create a new stack, and call the decoder */
sb, etp := newStack(), rt.PtrElem(vv.Type)
nb, err := decodeTypedPointer(self.s, self.i, etp, vp, sb, self.f)

/* return the stack back */
self.i = nb
freeStack(sb)
Expand Down
1 change: 0 additions & 1 deletion decoder/decoder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,6 @@ func TestDecoder_Generic(t *testing.T) {
pos, err := decode(TwitterJson, &v, false)
assert.NoError(t, err)
assert.Equal(t, len(TwitterJson), pos)
spew.Dump(v)
}

func TestDecoder_Binding(t *testing.T) {
Expand Down
31 changes: 30 additions & 1 deletion encode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
`testing`
`time`
`unsafe`
`strings`

`github.com/bytedance/sonic/encoder`
`github.com/stretchr/testify/assert`
Expand All @@ -52,7 +53,6 @@ func TestMain(m *testing.M) {
runtime.GC()
debug.FreeOSMemory()
}
println("stop GC looping!")
}()
time.Sleep(time.Millisecond)
m.Run()
Expand Down Expand Up @@ -1168,4 +1168,33 @@ func TestMarshalNullNil(t *testing.T) {
}.Froze().Marshal(v)
assert.Nil(t, e)
assert.Equal(t, `{"A":[],"B":{}}`, string(o))
}

// TestEncoder_LongestInvalidUtf8 builds quoted inputs made only of the
// byte 0x80, repeated 4096, 4095, 4097 and 12345 times, and checks each
// one through testEncodeInvalidUtf8.
func TestEncoder_LongestInvalidUtf8(t *testing.T) {
	for _, count := range []int{4096, 4095, 4097, 12345} {
		payload := "\"" + strings.Repeat("\x80", count) + "\""
		testEncodeInvalidUtf8(t, []byte(payload))
	}
}

// testEncodeInvalidUtf8 marshals data with both encoding/json and
// ConfigStd, then asserts that the two agree on whether an error
// occurred and, when encoding/json succeeds, on the produced bytes.
//
// encoding/json is the reference implementation, so its results are
// passed as the "expected" argument of assert.Equal; this keeps the
// expected/actual labels in failure output pointing the right way.
//
// NOTE(review): data is passed as a []byte, and encoding/json documents
// that []byte values are encoded as base64 strings, so this may be
// comparing base64 output rather than exercising the invalid-UTF-8
// string path -- confirm whether string(data) was intended.
func testEncodeInvalidUtf8(t *testing.T, data []byte) {
	jgot, jerr := json.Marshal(data)
	sgot, serr := ConfigStd.Marshal(data)
	assert.Equal(t, jerr != nil, serr != nil)
	if jerr == nil {
		assert.Equal(t, jgot, sgot)
	}
}

func TestEncoder_RandomInvalidUtf8(t *testing.T) {
nums := 1000
maxLen := 1000
for i := 0; i < nums; i++ {
testEncodeInvalidUtf8(t, genRandJsonBytes(maxLen))
testEncodeInvalidUtf8(t, genRandJsonRune(maxLen))
}
}
Loading

0 comments on commit 02865de

Please sign in to comment.