expose extra files of each language: grammar.json, node-types.json, highlights.scm, ... #59

milahu · 2024-03-05T17:53:34Z

grammar.json is needed to map from node.type (the node's name) to node.kind_id (its numeric id),
assuming that node.type is more stable than node.kind_id across different versions of a parser

the extra files should be stored in the filesystem to save memory

# TODO better? get name-id mappings from parser binary?

import json
import os

# Load grammar.json from the tree-sitter-html source tree.
# The environment variable TREE_SITTER_HTML_SRC must hold the path of that
# source tree; a missing variable raises KeyError.
# JSON files are UTF-8, so the encoding is stated explicitly instead of
# relying on the platform's default text encoding.
with open(
    os.environ["TREE_SITTER_HTML_SRC"] + "/src/grammar.json",
    "r",
    encoding="utf-8",
) as f:
    tree_sitter_html_grammar = json.load(f)

# Attribute-style access through types.SimpleNamespace(**mapping), which
# would allow node_kind.document, was rejected: names are not always valid
# identifiers, they can be ugly names like '"'. A plain dict is used instead.
#
# NOTE(review): the position of a rule in grammar.json is presumably not the
# parser's node.kind_id -- the ids are listed in ts_symbol_identifiers in
# src/parser.c. Confirm before relying on this mapping.

# Map every rule name to its position in the grammar's "rules" table.
node_kind = {
    rule_name: rule_index
    for rule_index, rule_name in enumerate(tree_sitter_html_grammar["rules"])
}

print("node_kind document", node_kind["document"])

TODO better? get name-id mappings from parser binary?

probably this should be fixed upstream in tree-sitter

edit: using tree_sitter_html_grammar["rules"] is wrong.
What I was actually looking for are ts_symbol_identifiers and ts_symbol_names in src/parser.c:

enum ts_symbol_identifiers {
  anon_sym_LT_BANG = 1,
  aux_sym_doctype_token1 = 2,
  anon_sym_GT = 3,

static const char * const ts_symbol_names[] = {
  [ts_builtin_sym_end] = "end",
  [anon_sym_LT_BANG] = "<!",
  [aux_sym_doctype_token1] = "doctype_token1",
  [anon_sym_GT] = ">",

Parsing src/parser.c takes a bit more work than json.load...

parse_parser_c.py

import ast
import collections
import json
import os
import sys

import tree_sitter_languages

# Read the generated parser source as raw bytes; the bytes are handed to the
# parser unchanged. TREE_SITTER_HTML_SRC must hold the path of the
# tree-sitter-html source tree.
parser_c_path = os.environ["TREE_SITTER_HTML_SRC"] + "/src/parser.c"
with open(parser_c_path, "rb") as parser_c_file:
    parser_c_src = parser_c_file.read()

# Parse parser.c with the C parser provided by tree_sitter_languages.
tree_sitter_c = tree_sitter_languages.get_parser("c")
parser_c_tree = tree_sitter_c.parse(parser_c_src)

def walk_tree(tree):
    """Yield every node below ``tree`` in depth-first pre-order.

    ``tree`` is any object whose ``walk()`` method returns a cursor with a
    ``node`` attribute and the methods ``goto_first_child()``,
    ``goto_next_sibling()`` and ``goto_parent()``, each returning a truthy
    value when the move succeeded. This script passes
    ``parser_c_tree.root_node``.

    The starting node is yielded first. The generator ends once the cursor
    can no longer move up to a parent.
    """
    cursor = tree.walk()
    while True:
        yield cursor.node

        # Prefer going down; otherwise step sideways.
        if cursor.goto_first_child() or cursor.goto_next_sibling():
            continue

        # Leaf without a next sibling: climb until an ancestor has one.
        while True:
            if not cursor.goto_parent():
                # Back at the starting node: the traversal is complete.
                return
            if cursor.goto_next_sibling():
                break

if False:

    # Debug aid (disabled): print one line per syntax node, then exit.

    max_len = 30

    for node_idx, node in enumerate(walk_tree(parser_c_tree.root_node)):

        # JSON-quote the node text so quotes and newlines stay on one line,
        # then cut long texts down to max_len characters plus an ellipsis.
        node_text = json.dumps(node.text.decode("utf8"))
        if len(node_text) > max_len:
            node_text = node_text[:max_len] + "..."

        print(f"node {node.kind_id:2d} = {node.type:25s} : {node_text:30s}")

        # Uncomment to limit the dump to the first nodes:
        #if node_idx >= 100: break

    sys.exit()

# State for the single pass over the parser.c syntax tree.
# True while walking the body of `enum ts_symbol_identifiers { ... }`.
in_enum_ts_symbol_identifiers = False
# True while walking the initializer of `ts_symbol_names[]`.
in_char_ts_symbol_names = False
# Not used by this script; kept so the module's names stay unchanged.
enum_name = None
# C identifier seen last, waiting for the value that belongs to it.
current_identifier = None
# C enumerator name -> numeric value, e.g. "anon_sym_GT" -> 3.
enum_ts_symbol_identifiers = {}
# C enumerator name -> display name, e.g. "anon_sym_GT" -> ">".
char_ts_symbol_names = {}


for node in walk_tree(parser_c_tree.root_node):

    # Section markers. node.text is only read when the node type matches, and
    # it is only decoded for the few node kinds whose text is needed below.
    # Decoding every node would re-decode large parts of parser.c repeatedly,
    # starting with the root node, whose text is the whole file.
    if node.type == "type_identifier" and node.text == b"ts_symbol_identifiers":
        in_enum_ts_symbol_identifiers = True
        continue

    if node.type == "pointer_declarator" and node.text == b"* const ts_symbol_names[]":
        in_char_ts_symbol_names = True
        continue

    if in_enum_ts_symbol_identifiers:

        if node.type == "identifier":
            # Enumerator name; its value follows as a number_literal.
            current_identifier = node.text.decode("utf8")

        elif node.type == "number_literal":
            enum_ts_symbol_identifiers[current_identifier] = (
                int(node.text.decode("utf8"))
            )
            current_identifier = None

        elif node.type == "}":
            # End of the enum body.
            current_identifier = None
            in_enum_ts_symbol_identifiers = False

        continue

    if in_char_ts_symbol_names:

        if node.type == "subscript_designator":
            # Text looks like "[anon_sym_GT]"; strip the brackets.
            current_identifier = node.text.decode("utf8")[1:-1]

        elif node.type == "string_literal":
            # The C string literal is evaluated as a Python literal to
            # resolve quotes and escapes -- assumes the escapes used in
            # these names are valid in both languages.
            char_ts_symbol_names[current_identifier] = (
                ast.literal_eval(node.text.decode("utf8"))
            )
            current_identifier = None

        elif node.type == "}":
            # End of the initializer: both tables are complete, stop walking.
            current_identifier = None
            in_char_ts_symbol_names = False
            break

        continue


#print("enum_ts_symbol_identifiers =", json.dumps(enum_ts_symbol_identifiers, indent=2))
#print("char_ts_symbol_names =", json.dumps(char_ts_symbol_names, indent=2))

# Display names can collide when a grammar uses the same name for different
# tokens. In that case the user is forced to use the exact name from
# full_node_kind.
# example: <!doctype html>
# both the full tag and the tag_name have the token name "doctype"
#   sym_doctype = 26, // full doctype tag
#   sym__doctype = 4, // tag_name of doctype tag

# Full C enumerator name -> id. This is the same dict object, not a copy.
full_node_kind = enum_ts_symbol_identifiers

# Count every display name once, so the duplicate test inside the loop is a
# dict lookup instead of a scan over all names for every symbol.
name_counts = collections.Counter(char_ts_symbol_names.values())

# Display name -> id. Symbols whose display name is ambiguous are stored
# under their full C enumerator name instead.
node_kind = {}
for full_name, kind_id in enum_ts_symbol_identifiers.items():
    name = char_ts_symbol_names[full_name]
    key = full_name if name_counts[name] > 1 else name
    node_kind[key] = kind_id

# Allow reverse lookup from id to name: node_name[kind_id] is the key used in
# node_kind, and None for ids without an entry (such as 0). The list is
# indexed by the actual id rather than by insertion order, so it stays
# correct even if the ids are not exactly 1..N in order.
node_name = [None] * (max(node_kind.values(), default=0) + 1)
for key, kind_id in node_kind.items():
    node_name[kind_id] = key

#print("full_node_kind =", json.dumps(full_node_kind, indent=2))
print("node_kind =", json.dumps(node_kind, indent=2))
#print("node_kind document =", node_kind["document"])

Alternative: parse a source file that contains all possible node types,
and build the mapping from the observed node.type and node.kind_id values.

keywords: tree-sitter use numeric node types in scripting languages python javascript

The text was updated successfully, but these errors were encountered:

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

expose extra files of each language: grammar.json, node-types.json, highlights.scm, ... #59

expose extra files of each language: grammar.json, node-types.json, highlights.scm, ... #59

milahu commented Mar 5, 2024 •

edited

Loading

expose extra files of each language: grammar.json, node-types.json, highlights.scm, ... #59

expose extra files of each language: grammar.json, node-types.json, highlights.scm, ... #59

Comments

milahu commented Mar 5, 2024 • edited Loading

milahu commented Mar 5, 2024 •

edited

Loading