Add semantic tokens for escape sequences; fix escape sequences in textmate grammar
kralicky committed Apr 2, 2024
1 parent 65b87a7 commit d4dcfff
Showing 5 changed files with 173 additions and 6 deletions.
26 changes: 24 additions & 2 deletions editors/vscode/syntaxes/protobuf.tmLanguage.json
@@ -47,12 +47,34 @@
{
"name": "string.quoted.double.protobuf",
"begin": "\"",
"end": "\""
"end": "\"",
"patterns": [
{
"include": "#string_escape_char"
}
]
},
{
"name": "string.quoted.single.protobuf",
"begin": "'",
"end": "'"
"end": "'",
"patterns": [
{
"include": "#string_escape_char"
}
]
}
]
},
"string_escape_char": {
"patterns": [
{
"match": "\\\\([0-7]{1,3}|[abfnrtv\\\\'\"]|[xX][0-9a-fA-F]{1,2}|u[0-9a-fA-F]{4}|U[0-9a-fA-F]{8})",
"name": "constant.character.escape.protobuf"
},
{
"match": "\\\\[^0-7xuUabfnrtv\\'\"]",
"name": "invalid.illegal.unknown-escape.protobuf"
}
]
},
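As a quick illustration of the two rules above, consider a hypothetical snippet (not part of this commit):

option (example.note) = "ok \n \x7f \u1234 \377 bad \q \8";

The first pattern marks \n, \x7f, \u1234, and \377 as constant.character.escape.protobuf; the second flags \q and \8 (a backslash followed by anything outside the known escape set) as invalid.illegal.unknown-escape.protobuf.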
2 changes: 1 addition & 1 deletion go.mod
@@ -11,7 +11,7 @@ require (
github.com/google/uuid v1.6.0
github.com/kralicky/codegen v0.0.0-20240307225947-51de80fcb2f3
github.com/kralicky/gpkg v0.0.0-20240119195700-64f32830b14f
github.com/kralicky/protocompile v0.0.0-20240401205051-3af207f8546c
github.com/kralicky/protocompile v0.0.0-20240402185503-7487e2c98aec
github.com/kralicky/tools-lite v0.0.0-20240313161632-60bfa88304ff
github.com/mattn/go-tty v0.0.5
github.com/spf13/cobra v1.8.0
4 changes: 2 additions & 2 deletions go.sum
@@ -42,8 +42,8 @@ github.com/kralicky/codegen v0.0.0-20240307225947-51de80fcb2f3 h1:BWm3t2mZaeu4hg
github.com/kralicky/codegen v0.0.0-20240307225947-51de80fcb2f3/go.mod h1:R99TvehuNoyLOBwWThdNuQXnJT8d4gJcRYfZtB2Mw7s=
github.com/kralicky/gpkg v0.0.0-20240119195700-64f32830b14f h1:MsNe8A51V+7Fu5OMXSl8SK02erPJ40vFs2zDHn89w1g=
github.com/kralicky/gpkg v0.0.0-20240119195700-64f32830b14f/go.mod h1:vOkwMjs49XmP/7Xfo9ZL6eg2ei51lmtD/4U/Az5GTq8=
github.com/kralicky/protocompile v0.0.0-20240401205051-3af207f8546c h1:HrrAwQQDVyDFYbGFKNhOYtUd7HTlu/Y+qVGiVWhEiwg=
github.com/kralicky/protocompile v0.0.0-20240401205051-3af207f8546c/go.mod h1:UjvEi6Zifxkfhm/hW7RXpluHcBfN+VnePXcUtgHws8U=
github.com/kralicky/protocompile v0.0.0-20240402185503-7487e2c98aec h1:flnp8h/gqns0/a2MnQohtQiPTMNnXRbBZ0vFKEU92V8=
github.com/kralicky/protocompile v0.0.0-20240402185503-7487e2c98aec/go.mod h1:UjvEi6Zifxkfhm/hW7RXpluHcBfN+VnePXcUtgHws8U=
github.com/kralicky/tools-lite v0.0.0-20240313161632-60bfa88304ff h1:akxm/czMYHdr1xIGm6wmddABmc4M9KDcqksdwpIJx8A=
github.com/kralicky/tools-lite v0.0.0-20240313161632-60bfa88304ff/go.mod h1:V8GGYRLr40bvX/W3nZFxG+6S3iDFWn6o5J3NGDClr8U=
github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE=
94 changes: 93 additions & 1 deletion pkg/lsp/semantic.go
@@ -1,9 +1,11 @@
package lsp

import (
"bytes"
"fmt"
"log/slog"
"os"
"regexp"
"slices"
"sort"
"strings"
@@ -440,6 +442,8 @@ func init() {
celEnv, _ = celEnv.Extend(cel.EnableMacroCallTracking())
}

var escapeCharRegex = regexp.MustCompile(`\\([0-7]{1,3}|[abfnrtv\\'"]|[xX][0-9a-fA-F]{1,2}|u[0-9a-fA-F]{4}|U[0-9a-fA-F]{8})`)

func (s *semanticItems) inspect(node ast.Node, walkOptions ...ast.WalkOption) {
tracker := &paths.AncestorTracker{}
// check if node is a non-nil interface to a nil pointer
@@ -459,7 +463,11 @@ func (s *semanticItems) inspect(node ast.Node, walkOptions ...ast.WalkOption) {
s.mkcomments(node)
return true
}
s.mktokens(node, tracker.Path(), semanticTypeString, 0)
if bytes.Contains(node.Raw, []byte{'\\'}) {
s.inspectStringLiteralWithEscapeSequences(node, tracker.Path())
} else {
s.mktokens(node, tracker.Path(), semanticTypeString, 0)
}
case *ast.UintLiteralNode:
s.mktokens(node, tracker.Path(), semanticTypeNumber, 0)
case *ast.FloatLiteralNode:
@@ -681,6 +689,90 @@ func (s *semanticItems) inspectCompoundIdent(compoundIdent *ast.CompoundIdentNod
}
}

func (s *semanticItems) inspectStringLiteralWithEscapeSequences(node *ast.StringLiteralNode, path protopath.Path) {
// for strings containing escape sequences, create multiple tokens
// for each part of the string
// example: "\0\001\a\b\f\n\r\t\v\\\'\"\xfe" -> ["\0", "\001", "\a", "\b", "\f", "\n", "\r", "\t", "\v", "\\", "\'", "\"", "\xfe"]
indexes := escapeCharRegex.FindAllIndex(node.Raw, -1)

info := s.AST().NodeInfo(node)
if !info.IsValid() {
return
}

line := uint32(info.Start().Line - 1)
start := uint32(info.Start().Col - 1)

// first token is the opening quote
s.items = append(s.items, semanticItem{
lang: tokenLanguageProto,
line: line,
start: start,
len: 1,
typ: semanticTypeString,
node: node,
path: path,
})
start++

i := 0
for _, match := range indexes {
if i < match[0] {
// regular string
item := semanticItem{
lang: tokenLanguageProto,
line: line,
start: start + uint32(i),
len: uint32(match[0]) - uint32(i),
typ: semanticTypeString,
node: node,
path: path,
}
s.items = append(s.items, item)
i = match[0]
}
if i == match[0] {
// escape sequence
item := semanticItem{
lang: tokenLanguageProto,
line: line,
start: start + uint32(i),
len: uint32(match[1]) - uint32(match[0]),
typ: semanticTypeRegexp,
node: node,
path: path,
}
s.items = append(s.items, item)
i = match[1]
}
}
if i < len(node.Raw) {
// after the last escape sequence but before the closing quote
item := semanticItem{
lang: tokenLanguageProto,
line: line,
start: start + uint32(i),
len: uint32(len(node.Raw) - i),
typ: semanticTypeString,
node: node,
path: path,
}
s.items = append(s.items, item)
i = len(node.Raw)
}

// last token is the closing quote
s.items = append(s.items, semanticItem{
lang: tokenLanguageProto,
line: line,
start: start + uint32(i),
len: 1,
typ: semanticTypeString,
node: node,
path: path,
})
}

func (s *semanticItems) inspectFieldLiteral(node ast.Node, val *ast.ValueNode, path protopath.Path) {
if s.maybeLinkRes == nil {
return
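To make the new splitting logic concrete, here is a standalone sketch of how escapeCharRegex partitions a raw string body into plain and escaped segments (illustrative only; the real method also converts each segment into a semanticItem with line and column offsets):

package main

import (
	"fmt"
	"regexp"
)

// Same pattern as escapeCharRegex in pkg/lsp/semantic.go.
var escapeCharRegex = regexp.MustCompile(`\\([0-7]{1,3}|[abfnrtv\\'"]|[xX][0-9a-fA-F]{1,2}|u[0-9a-fA-F]{4}|U[0-9a-fA-F]{8})`)

func main() {
	// Raw string content as the lexer stores it: backslashes are literal bytes.
	raw := []byte(`foo\xFFbar\u1234baz\t`)

	i := 0
	for _, m := range escapeCharRegex.FindAllIndex(raw, -1) {
		if i < m[0] {
			fmt.Printf("string %q\n", raw[i:m[0]]) // plain run -> semanticTypeString
		}
		fmt.Printf("escape %q\n", raw[m[0]:m[1]]) // escape -> semanticTypeRegexp
		i = m[1]
	}
	if i < len(raw) {
		fmt.Printf("string %q\n", raw[i:]) // trailing plain run
	}
	// Output:
	// string "foo"
	// escape "\\xFF"
	// string "bar"
	// escape "\\u1234"
	// string "baz"
	// escape "\\t"
}

Escape segments reuse the regexp semantic token type, presumably so that standard color themes, which already style regex escapes distinctly, give them a contrasting color inside string literals.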
53 changes: 53 additions & 0 deletions test/semantic_test.go
@@ -69,6 +69,9 @@ message Y {
x: {[bufbuild.protocompile.test2.y]: {x: {}}}
}
];
optional bytes escaped_bytes = 3 [default = "\0\001\a\b\f\n\r\t\v\\\'\"\xfe"];
optional string utf8_string = 4 [default = "\341\210\264"]; // this is utf-8 for \u1234
optional string mixed_string = 5 [default = "foo\xFFbar\u1234baz\t"];
}
`
@@ -235,6 +238,56 @@ message Y {
{Token: "bufbuild.protocompile.test2.y", TokenType: "property"},
{Token: "]", TokenType: "operator"},
{Token: "x", TokenType: "property"},
{Token: "optional", TokenType: "keyword"},
{Token: "bytes", TokenType: "type", Mod: "defaultLibrary"},
{Token: "escaped_bytes", TokenType: "variable", Mod: "definition"},
{Token: "=", TokenType: "operator"},
{Token: "3", TokenType: "number"},
{Token: "default", TokenType: "keyword"},
{Token: "=", TokenType: "operator"},
{Token: `"`, TokenType: "string"},
{Token: `\0`, TokenType: "regexp"},
{Token: `\001`, TokenType: "regexp"},
{Token: `\a`, TokenType: "regexp"},
{Token: `\b`, TokenType: "regexp"},
{Token: `\f`, TokenType: "regexp"},
{Token: `\n`, TokenType: "regexp"},
{Token: `\r`, TokenType: "regexp"},
{Token: `\t`, TokenType: "regexp"},
{Token: `\v`, TokenType: "regexp"},
{Token: `\\`, TokenType: "regexp"},
{Token: `\'`, TokenType: "regexp"},
{Token: `\"`, TokenType: "regexp"},
{Token: `\xfe`, TokenType: "regexp"},
{Token: `"`, TokenType: "string"},
{Token: "optional", TokenType: "keyword"},
{Token: "string", TokenType: "type", Mod: "defaultLibrary"},
{Token: "utf8_string", TokenType: "variable", Mod: "definition"},
{Token: "=", TokenType: "operator"},
{Token: "4", TokenType: "number"},
{Token: "default", TokenType: "keyword"},
{Token: "=", TokenType: "operator"},
{Token: `"`, TokenType: "string"},
{Token: `\341`, TokenType: "regexp"},
{Token: `\210`, TokenType: "regexp"},
{Token: `\264`, TokenType: "regexp"},
{Token: `"`, TokenType: "string"},
{Token: "// this is utf-8 for \\u1234", TokenType: "comment"},
{Token: "optional", TokenType: "keyword"},
{Token: "string", TokenType: "type", Mod: "defaultLibrary"},
{Token: "mixed_string", TokenType: "variable", Mod: "definition"},
{Token: "=", TokenType: "operator"},
{Token: "5", TokenType: "number"},
{Token: "default", TokenType: "keyword"},
{Token: "=", TokenType: "operator"},
{Token: `"`, TokenType: "string"},
{Token: "foo", TokenType: "string"},
{Token: `\xFF`, TokenType: "regexp"},
{Token: "bar", TokenType: "string"},
{Token: `\u1234`, TokenType: "regexp"},
{Token: "baz", TokenType: "string"},
{Token: `\t`, TokenType: "regexp"},
{Token: `"`, TokenType: "string"},
}
if x := cmp.Diff(want, tokens); x != "" {
t.Errorf("Semantic tokens do not match (-want +got):\n%s", x)
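The comment on utf8_string checks out: the octal escapes \341 \210 \264 are the bytes 0xE1 0x88 0xB4, which is the UTF-8 encoding of U+1234. A quick standalone verification (not part of this commit):

package main

import "fmt"

func main() {
	// \341 \210 \264 in octal are 0xE1 0x88 0xB4,
	// the UTF-8 encoding of U+1234 (ETHIOPIC SYLLABLE SEE).
	b := []byte{0341, 0210, 0264}
	fmt.Printf("%s %U\n", b, []rune(string(b))) // prints: ሴ [U+1234]
}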
