From 5c94d8602907f9f76fed2bf0130059f9518c4f01 Mon Sep 17 00:00:00 2001 From: Cameron Moore Date: Sat, 12 Sep 2020 11:01:32 -0500 Subject: [PATCH] Use strings.Builder in lexer (#438) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace all string building operations in the lexer with strings.Builder. Doing so shows significant performance improvements. BurntSushi still has a slight edge in CPU performance, but there's still much work to do on memory performance. name old time/op new time/op delta ParseToml-2 311µs ± 0% 273µs ± 3% -12.29% (p=0.008 n=5+5) UnmarshalToml-2 386µs ± 4% 349µs ± 3% -9.63% (p=0.008 n=5+5) UnmarshalBurntSushiToml-2 368µs ± 8% 341µs ± 2% ~ (p=0.056 n=5+5) name old alloc/op new alloc/op delta ParseToml-2 132kB ± 0% 118kB ± 0% -11.07% (p=0.008 n=5+5) UnmarshalToml-2 147kB ± 0% 133kB ± 0% -9.92% (p=0.008 n=5+5) UnmarshalBurntSushiToml-2 82.6kB ± 0% 82.6kB ± 0% ~ (p=1.000 n=5+5) name old allocs/op new allocs/op delta ParseToml-2 3.19k ± 0% 1.91k ± 0% -40.19% (p=0.008 n=5+5) UnmarshalToml-2 4.03k ± 0% 2.75k ± 0% -31.83% (p=0.008 n=5+5) UnmarshalBurntSushiToml-2 1.73k ± 0% 1.73k ± 0% ~ (all equal) Out of curiosity, I benchmarked the results of updating each function along the way to see how each change affected the overall performance: name \ time/op master lexKey lexLitStringAsString lexStringAsString ParseToml-2 311µs ± 0% 299µs ± 1% 290µs ± 3% 273µs ± 3% UnmarshalToml-2 386µs ± 4% 381µs ± 2% 364µs ± 2% 349µs ± 3% UnmarshalBurntSushiToml-2 368µs ± 8% 341µs ± 2% 345µs ± 5% 341µs ± 2% name \ alloc/op master lexKey lexLitStringAsString lexStringAsString ParseToml-2 132kB ± 0% 132kB ± 0% 125kB ± 0% 118kB ± 0% UnmarshalToml-2 147kB ± 0% 146kB ± 0% 140kB ± 0% 133kB ± 0% UnmarshalBurntSushiToml-2 82.6kB ± 0% 82.6kB ± 0% 82.6kB ± 0% 82.6kB ± 0% name \ allocs/op master lexKey lexLitStringAsString lexStringAsString ParseToml-2 3.19k ± 0% 2.86k ± 0% 2.49k ± 0% 1.91k ± 0% UnmarshalToml-2 4.03k ± 0% 3.70k ± 0% 3.33k 
± 0% 2.75k ± 0% UnmarshalBurntSushiToml-2 1.73k ± 0% 1.73k ± 0% 1.73k ± 0% 1.73k ± 0% Benchmarks were run from the benchmark/ directory using: go test -bench=.*Toml -benchmem -count=5 ./... --- lexer.go | 100 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 47 deletions(-) diff --git a/lexer.go b/lexer.go index 425e847a..b1886192 100644 --- a/lexer.go +++ b/lexer.go @@ -306,7 +306,7 @@ func (l *tomlLexer) lexComma() tomlLexStateFn { // Parse the key and emits its value without escape sequences. // bare keys, basic string keys and literal string keys are supported. func (l *tomlLexer) lexKey() tomlLexStateFn { - growingString := "" + var sb strings.Builder for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() { if r == '"' { @@ -315,7 +315,9 @@ func (l *tomlLexer) lexKey() tomlLexStateFn { if err != nil { return l.errorf(err.Error()) } - growingString += "\"" + str + "\"" + sb.WriteString("\"") + sb.WriteString(str) + sb.WriteString("\"") l.next() continue } else if r == '\'' { @@ -324,41 +326,45 @@ func (l *tomlLexer) lexKey() tomlLexStateFn { if err != nil { return l.errorf(err.Error()) } - growingString += "'" + str + "'" + sb.WriteString("'") + sb.WriteString(str) + sb.WriteString("'") l.next() continue } else if r == '\n' { return l.errorf("keys cannot contain new lines") } else if isSpace(r) { - str := " " + var str strings.Builder + str.WriteString(" ") + // skip trailing whitespace l.next() for r = l.peek(); isSpace(r); r = l.peek() { - str += string(r) + str.WriteRune(r) l.next() } // break loop if not a dot if r != '.' { break } - str += "." + str.WriteString(".") // skip trailing whitespace after dot l.next() for r = l.peek(); isSpace(r); r = l.peek() { - str += string(r) + str.WriteRune(r) l.next() } - growingString += str + sb.WriteString(str.String()) continue } else if r == '.' 
{ // skip } else if !isValidBareChar(r) { return l.errorf("keys cannot contain %c character", r) } - growingString += string(r) + sb.WriteRune(r) l.next() } - l.emitWithValue(tokenKey, growingString) + l.emitWithValue(tokenKey, sb.String()) return l.lexVoid } @@ -383,7 +389,7 @@ func (l *tomlLexer) lexLeftBracket() tomlLexStateFn { } func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) { - growingString := "" + var sb strings.Builder if discardLeadingNewLine { if l.follow("\r\n") { @@ -397,14 +403,14 @@ func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNe // find end of string for { if l.follow(terminator) { - return growingString, nil + return sb.String(), nil } next := l.peek() if next == eof { break } - growingString += string(l.next()) + sb.WriteRune(l.next()) } return "", errors.New("unclosed string") @@ -438,7 +444,7 @@ func (l *tomlLexer) lexLiteralString() tomlLexStateFn { // Terminator is the substring indicating the end of the token. // The resulting string does not include the terminator. 
func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) { - growingString := "" + var sb strings.Builder if discardLeadingNewLine { if l.follow("\r\n") { @@ -451,7 +457,7 @@ func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, for { if l.follow(terminator) { - return growingString, nil + return sb.String(), nil } if l.follow("\\") { @@ -469,61 +475,61 @@ func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, l.next() } case '"': - growingString += "\"" + sb.WriteString("\"") l.next() case 'n': - growingString += "\n" + sb.WriteString("\n") l.next() case 'b': - growingString += "\b" + sb.WriteString("\b") l.next() case 'f': - growingString += "\f" + sb.WriteString("\f") l.next() case '/': - growingString += "/" + sb.WriteString("/") l.next() case 't': - growingString += "\t" + sb.WriteString("\t") l.next() case 'r': - growingString += "\r" + sb.WriteString("\r") l.next() case '\\': - growingString += "\\" + sb.WriteString("\\") l.next() case 'u': l.next() - code := "" + var code strings.Builder for i := 0; i < 4; i++ { c := l.peek() if !isHexDigit(c) { return "", errors.New("unfinished unicode escape") } l.next() - code = code + string(c) + code.WriteRune(c) } - intcode, err := strconv.ParseInt(code, 16, 32) + intcode, err := strconv.ParseInt(code.String(), 16, 32) if err != nil { - return "", errors.New("invalid unicode escape: \\u" + code) + return "", errors.New("invalid unicode escape: \\u" + code.String()) } - growingString += string(rune(intcode)) + sb.WriteRune(rune(intcode)) case 'U': l.next() - code := "" + var code strings.Builder for i := 0; i < 8; i++ { c := l.peek() if !isHexDigit(c) { return "", errors.New("unfinished unicode escape") } l.next() - code = code + string(c) + code.WriteRune(c) } - intcode, err := strconv.ParseInt(code, 16, 64) + intcode, err := strconv.ParseInt(code.String(), 16, 64) if err != nil { - return "", 
errors.New("invalid unicode escape: \\U" + code) + return "", errors.New("invalid unicode escape: \\U" + code.String()) } - growingString += string(rune(intcode)) + sb.WriteRune(rune(intcode)) default: return "", errors.New("invalid escape sequence: \\" + string(l.peek())) } @@ -534,7 +540,7 @@ func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, return "", fmt.Errorf("unescaped control character %U", r) } l.next() - growingString += string(r) + sb.WriteRune(r) } if l.peek() == eof { @@ -769,19 +775,19 @@ func init() { // /!\ also matches the empty string // // Example matches: - //1979-05-27T07:32:00Z - //1979-05-27T00:32:00-07:00 - //1979-05-27T00:32:00.999999-07:00 - //1979-05-27 07:32:00Z - //1979-05-27 00:32:00-07:00 - //1979-05-27 00:32:00.999999-07:00 - //1979-05-27T07:32:00 - //1979-05-27T00:32:00.999999 - //1979-05-27 07:32:00 - //1979-05-27 00:32:00.999999 - //1979-05-27 - //07:32:00 - //00:32:00.999999 + // 1979-05-27T07:32:00Z + // 1979-05-27T00:32:00-07:00 + // 1979-05-27T00:32:00.999999-07:00 + // 1979-05-27 07:32:00Z + // 1979-05-27 00:32:00-07:00 + // 1979-05-27 00:32:00.999999-07:00 + // 1979-05-27T07:32:00 + // 1979-05-27T00:32:00.999999 + // 1979-05-27 07:32:00 + // 1979-05-27 00:32:00.999999 + // 1979-05-27 + // 07:32:00 + // 00:32:00.999999 dateRegexp = regexp.MustCompile(`^(?:\d{1,4}-\d{2}-\d{2})?(?:[T ]?\d{2}:\d{2}:\d{2}(\.\d{1,9})?(Z|[+-]\d{2}:\d{2})?)?`) }