fix: JSON block string encoding (#843)

Fixes #839.

Note: I considered using `gjson.AppendJSONString` for better performance, but it currently has no option to disable escaping of HTML characters, which I think would be undesirable here. See tidwall/gjson#362.

Co-authored-by: Sergiy 🇺🇦 <818351+devsergiy@users.noreply.github.com>
wundergraph · Jul 15, 2024 · 699eb81 · 699eb81
1 parent 16602c6
commit 699eb81
Show file tree

Hide file tree

Showing 8 changed files with 390 additions and 16 deletions.
diff --git a/pkg/ast/ast_val_string_value.go b/pkg/ast/ast_val_string_value.go
@@ -37,6 +37,100 @@ func (d *Document) StringValueIsBlockString(ref int) bool {
 	return d.StringValues[ref].BlockString
 }
 
+// BlockStringValueContentRawBytes returns the bytes of the block string at ref
+// exactly as they appear in the input between the surrounding quotes.
+//
+// The recorded Content range is widened in both directions up to the nearest
+// '"' byte so that whitespace which the lexer left out of Content (and which
+// is needed to compute the indentation of the block string) is included.
+// The returned slice is a sub-slice of d.Input.RawBytes, not a copy.
+func (d *Document) BlockStringValueContentRawBytes(ref int) []byte {
+	content := d.StringValues[ref].Content
+	raw := d.Input.RawBytes
+
+	// Move left until the byte just before the block is a quote; if no quote
+	// precedes the content, the block starts at the beginning of the input.
+	blockStart := int(content.Start)
+	for blockStart > 0 && raw[blockStart-1] != '"' {
+		blockStart--
+	}
+
+	// Move right to the first quote at or after the end of the content; if
+	// there is none, the block runs to the end of the input.
+	blockEnd := d.Input.Length
+	for i := int(content.End); i < d.Input.Length; i++ {
+		if raw[i] == '"' {
+			blockEnd = i
+			break
+		}
+	}
+
+	return raw[blockStart:blockEnd]
+}
+
+// BlockStringValueContentRawString is the string form of
+// BlockStringValueContentRawBytes for the block string at ref.
+// NOTE(review): the conversion goes through unsafebytes.BytesToString, so the
+// result presumably shares memory with d.Input.RawBytes - confirm before
+// retaining it past a change of the input.
+func (d *Document) BlockStringValueContentRawString(ref int) string {
+	raw := d.BlockStringValueContentRawBytes(ref)
+	return unsafebytes.BytesToString(raw)
+}
+
+func (d *Document) BlockStringValueContentBytes(ref int) []byte {
+
+	// Implements https://spec.graphql.org/October2021/#BlockStringValue()
+
+	// NOTE: This implementation exactly follows the spec.
+	// It likely could be optimized for performance.
+
+	// split the raw value into lines
+	rawValue := d.BlockStringValueContentRawBytes(ref)
+	lines := splitBytesIntoLines(rawValue)
+
+	// find the common indent size (-1 means no common indent)
+	commonIndent := -1
+	for i, line := range lines {
+		if i == 0 {
+			continue
+		}
+		indent := leadingWhitespaceCount(line)
+		if indent < len(line) {
+			if commonIndent == -1 || indent < commonIndent {
+				commonIndent = indent
+			}
+		}
+	}
+
+	// remove the common indent from each line
+	if commonIndent != -1 {
+		for i := 1; i < len(lines); i++ {
+			var indent int
+			if len(lines[i]) > commonIndent {
+				indent = commonIndent
+			} else {
+				indent = len(lines[i])
+			}
+
+			lines[i] = lines[i][indent:]
+		}
+	}
+
+	// find first non-whitespace-only line
+	firstLine := 0
+	for i, line := range lines {
+		if leadingWhitespaceCount(line) != len(line) {
+			firstLine = i
+			break
+		}
+	}
+
+	// find last non-whitespace-only line
+	lastLine := len(lines) - 1
+	for i := len(lines) - 1; i >= 0; i-- {
+		if leadingWhitespaceCount(lines[i]) != len(lines[i]) {
+			lastLine = i
+			break
+		}
+	}
+
+	// join the lines to keep and return the result
+	return bytes.Join(lines[firstLine:lastLine+1], []byte{'\n'})
+}
+
+// BlockStringValueContentString is the string form of
+// BlockStringValueContentBytes for the block string at ref, converted with
+// unsafebytes.BytesToString.
+func (d *Document) BlockStringValueContentString(ref int) string {
+	content := d.BlockStringValueContentBytes(ref)
+	return unsafebytes.BytesToString(content)
+}
+
 func (d *Document) StringValuesAreEquals(left, right int) bool {
 	return d.StringValueIsBlockString(left) == d.StringValueIsBlockString(right) &&
 		bytes.Equal(d.StringValueContentBytes(left), d.StringValueContentBytes(right))

diff --git a/pkg/ast/ast_value.go b/pkg/ast/ast_value.go
@@ -2,6 +2,7 @@ package ast
 
 import (
 	"bytes"
+	"encoding/json"
 	"fmt"
 	"io"
 
@@ -144,7 +145,20 @@ func (d *Document) writeJSONValue(buf *bytes.Buffer, value Value) error {
 			buf.Write(literal.TRUE)
 		}
 	case ValueKindString:
-		buf.Write(quotes.WrapBytes(d.StringValueContentBytes(value.Ref)))
+		if d.StringValueIsBlockString(value.Ref) {
+			content := d.BlockStringValueContentString(value.Ref)
+
+			enc := json.NewEncoder(buf)
+			enc.SetEscapeHTML(false)
+			if err := enc.Encode(content); err != nil {
+				return err
+			}
+
+			// Remove the extra newline that Encode adds
+			buf.Truncate(buf.Len() - 1)
+		} else {
+			buf.Write(quotes.WrapBytes(d.StringValueContentBytes(value.Ref)))
+		}
 	case ValueKindList:
 		buf.WriteByte(literal.LBRACK_BYTE)
 		for ii, ref := range d.ListValues[value.Ref].Refs {

diff --git a/pkg/ast/ast_value_test.go b/pkg/ast/ast_value_test.go
@@ -83,15 +83,53 @@ func TestDocument_ValueToJSON(t *testing.T) {
 			Ref:  1,
 		}
 	}, `true`))
-	t.Run("ValueKindString", run(func(doc *Document) Value {
+	t.Run("ValueKindString - non-block", run(func(doc *Document) Value {
 		doc.StringValues = append(doc.StringValues, StringValue{
-			Content: doc.Input.AppendInputString("foo"),
+			Content: doc.Input.AppendInputString(`foo\nbar\tbaz\"qux`),
 		})
 		return Value{
 			Kind: ValueKindString,
 			Ref:  0,
 		}
-	}, `"foo"`))
+	}, `"foo\nbar\tbaz\"qux"`))
+	t.Run("ValueKindString - block", run(func(doc *Document) Value {
+		doc.Input.AppendInputString(`"""`)
+		doc.StringValues = append(doc.StringValues, StringValue{
+			BlockString: true,
+			Content:     doc.Input.AppendInputString("foo\nbar\tbaz\"qux"),
+		})
+		doc.Input.AppendInputString(`"""`)
+		return Value{
+			Kind: ValueKindString,
+			Ref:  0,
+		}
+	}, `"foo\nbar\tbaz\"qux"`))
+	t.Run("ValueKindString - block with indent", run(func(doc *Document) Value {
+		doc.Input.AppendInputString(`"""`)
+		doc.Input.AppendInputString("\n")
+		doc.StringValues = append(doc.StringValues, StringValue{
+			BlockString: true,
+			Content:     doc.Input.AppendInputString("  foo\n  bar"),
+		})
+		doc.Input.AppendInputString("\n")
+		doc.Input.AppendInputString(`"""`)
+		return Value{
+			Kind: ValueKindString,
+			Ref:  0,
+		}
+	}, `"foo\nbar"`))
+	t.Run("ValueKindString - block with mixed indent", run(func(doc *Document) Value {
+		doc.Input.AppendInputString(`"""`)
+		doc.StringValues = append(doc.StringValues, StringValue{
+			BlockString: true,
+			Content:     doc.Input.AppendInputString("foo\n\t bar\n\t  baz"),
+		})
+		doc.Input.AppendInputString(`"""`)
+		return Value{
+			Kind: ValueKindString,
+			Ref:  0,
+		}
+	}, `"foo\nbar\n baz"`))
 	t.Run("ValueKindList", run(func(doc *Document) Value {
 		doc.StringValues = append(doc.StringValues, StringValue{
 			Content: doc.Input.AppendInputString("foo"),
@@ -187,21 +225,21 @@ func TestDocument_PrintValue(t *testing.T) {
 	}
 	t.Run("ValueKindString - non-block", run(func(doc *Document) Value {
 		doc.StringValues = append(doc.StringValues, StringValue{
-			Content: doc.Input.AppendInputString("foo"),
+			Content: doc.Input.AppendInputString(`foo\nbar\tbaz\"qux`),
 		})
 		return Value{
 			Kind: ValueKindString,
 			Ref:  0,
 		}
-	}, `"foo"`))
+	}, `"foo\nbar\tbaz\"qux"`))
 	t.Run("ValueKindString - block", run(func(doc *Document) Value {
 		doc.StringValues = append(doc.StringValues, StringValue{
 			BlockString: true,
-			Content:     doc.Input.AppendInputString("foo"),
+			Content:     doc.Input.AppendInputString("foo\nbar\tbaz\"qux"),
 		})
 		return Value{
 			Kind: ValueKindString,
 			Ref:  0,
 		}
-	}, `"""foo"""`))
+	}, "\"\"\"foo\nbar\tbaz\"qux\"\"\""))
 }
diff --git a/pkg/ast/helpers.go b/pkg/ast/helpers.go
@@ -15,3 +15,44 @@ func indexOf(refs []int, ref int) (int, bool) {
 func deleteRef(refs *[]int, index int) {
 	*refs = append((*refs)[:index], (*refs)[index+1:]...)
 }
+
+// splitBytesIntoLines splits data at every line terminator ("\n", "\r" or
+// "\r\n") as defined by https://spec.graphql.org/October2021/#sec-Line-Terminators
+//
+// The terminators themselves are not part of the returned lines. The result
+// always holds at least one line: input without any terminator (including
+// empty input) comes back as a single line, and input ending in a terminator
+// gets a trailing empty line. The lines are sub-slices of data, not copies.
+func splitBytesIntoLines(data []byte) [][]byte {
+	var lines [][]byte
+	lineStart := 0
+
+	for pos := 0; pos < len(data); pos++ {
+		if data[pos] != '\n' && data[pos] != '\r' {
+			continue
+		}
+		lines = append(lines, data[lineStart:pos])
+
+		// a "\r\n" pair counts as one terminator, so step over the '\n' as well
+		if data[pos] == '\r' && pos+1 < len(data) && data[pos+1] == '\n' {
+			pos++
+		}
+		lineStart = pos + 1
+	}
+
+	// whatever follows the last terminator (possibly nothing) is the final line
+	return append(lines, data[lineStart:])
+}
+
+// leadingWhitespaceCount returns the number of consecutive spaces and tabs at
+// the start of line. For a line made up only of spaces and tabs (or an empty
+// line) the result equals len(line).
+func leadingWhitespaceCount(line []byte) int {
+	for i, c := range line {
+		if c != ' ' && c != '\t' {
+			return i
+		}
+	}
+	return len(line)
+}
diff --git a/v2/pkg/ast/ast_val_string_value.go b/v2/pkg/ast/ast_val_string_value.go
@@ -37,6 +37,100 @@ func (d *Document) StringValueIsBlockString(ref int) bool {
 	return d.StringValues[ref].BlockString
 }
 
+// BlockStringValueContentRawBytes returns the bytes of the block string at ref
+// exactly as they appear in the input between the surrounding quotes.
+//
+// The recorded Content range is widened in both directions up to the nearest
+// '"' byte so that whitespace which the lexer left out of Content (and which
+// is needed to compute the indentation of the block string) is included.
+// The returned slice is a sub-slice of d.Input.RawBytes, not a copy.
+func (d *Document) BlockStringValueContentRawBytes(ref int) []byte {
+	content := d.StringValues[ref].Content
+	raw := d.Input.RawBytes
+
+	// Move left until the byte just before the block is a quote; if no quote
+	// precedes the content, the block starts at the beginning of the input.
+	blockStart := int(content.Start)
+	for blockStart > 0 && raw[blockStart-1] != '"' {
+		blockStart--
+	}
+
+	// Move right to the first quote at or after the end of the content; if
+	// there is none, the block runs to the end of the input.
+	blockEnd := d.Input.Length
+	for i := int(content.End); i < d.Input.Length; i++ {
+		if raw[i] == '"' {
+			blockEnd = i
+			break
+		}
+	}
+
+	return raw[blockStart:blockEnd]
+}
+
+// BlockStringValueContentRawString is the string form of
+// BlockStringValueContentRawBytes for the block string at ref.
+// NOTE(review): the conversion goes through unsafebytes.BytesToString, so the
+// result presumably shares memory with d.Input.RawBytes - confirm before
+// retaining it past a change of the input.
+func (d *Document) BlockStringValueContentRawString(ref int) string {
+	raw := d.BlockStringValueContentRawBytes(ref)
+	return unsafebytes.BytesToString(raw)
+}
+
+func (d *Document) BlockStringValueContentBytes(ref int) []byte {
+
+	// Implements https://spec.graphql.org/October2021/#BlockStringValue()
+
+	// NOTE: This implementation exactly follows the spec.
+	// It likely could be optimized for performance.
+
+	// split the raw value into lines
+	rawValue := d.BlockStringValueContentRawBytes(ref)
+	lines := splitBytesIntoLines(rawValue)
+
+	// find the common indent size (-1 means no common indent)
+	commonIndent := -1
+	for i, line := range lines {
+		if i == 0 {
+			continue
+		}
+		indent := leadingWhitespaceCount(line)
+		if indent < len(line) {
+			if commonIndent == -1 || indent < commonIndent {
+				commonIndent = indent
+			}
+		}
+	}
+
+	// remove the common indent from each line
+	if commonIndent != -1 {
+		for i := 1; i < len(lines); i++ {
+			var indent int
+			if len(lines[i]) > commonIndent {
+				indent = commonIndent
+			} else {
+				indent = len(lines[i])
+			}
+
+			lines[i] = lines[i][indent:]
+		}
+	}
+
+	// find first non-whitespace-only line
+	firstLine := 0
+	for i, line := range lines {
+		if leadingWhitespaceCount(line) != len(line) {
+			firstLine = i
+			break
+		}
+	}
+
+	// find last non-whitespace-only line
+	lastLine := len(lines) - 1
+	for i := len(lines) - 1; i >= 0; i-- {
+		if leadingWhitespaceCount(lines[i]) != len(lines[i]) {
+			lastLine = i
+			break
+		}
+	}
+
+	// join the lines to keep and return the result
+	return bytes.Join(lines[firstLine:lastLine+1], []byte{'\n'})
+}
+
+// BlockStringValueContentString is the string form of
+// BlockStringValueContentBytes for the block string at ref, converted with
+// unsafebytes.BytesToString.
+func (d *Document) BlockStringValueContentString(ref int) string {
+	content := d.BlockStringValueContentBytes(ref)
+	return unsafebytes.BytesToString(content)
+}
+
 func (d *Document) StringValuesAreEquals(left, right int) bool {
 	return d.StringValueIsBlockString(left) == d.StringValueIsBlockString(right) &&
 		bytes.Equal(d.StringValueContentBytes(left), d.StringValueContentBytes(right))

diff --git a/v2/pkg/ast/ast_value.go b/v2/pkg/ast/ast_value.go
@@ -2,6 +2,7 @@ package ast
 
 import (
 	"bytes"
+	"encoding/json"
 	"fmt"
 	"io"
 
@@ -144,7 +145,20 @@ func (d *Document) writeJSONValue(buf *bytes.Buffer, value Value) error {
 			buf.Write(literal.TRUE)
 		}
 	case ValueKindString:
-		buf.Write(quotes.WrapBytes(d.StringValueContentBytes(value.Ref)))
+		if d.StringValueIsBlockString(value.Ref) {
+			content := d.BlockStringValueContentString(value.Ref)
+
+			enc := json.NewEncoder(buf)
+			enc.SetEscapeHTML(false)
+			if err := enc.Encode(content); err != nil {
+				return err
+			}
+
+			// Remove the extra newline that Encode adds
+			buf.Truncate(buf.Len() - 1)
+		} else {
+			buf.Write(quotes.WrapBytes(d.StringValueContentBytes(value.Ref)))
+		}
 	case ValueKindList:
 		buf.WriteByte(literal.LBRACK_BYTE)
 		for ii, ref := range d.ListValues[value.Ref].Refs {