Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

expose extra files of each language: grammar.json, node-types.json, highlights.scm, ... #59

Open
milahu opened this issue Mar 5, 2024 · 0 comments

Comments

@milahu
Copy link

milahu commented Mar 5, 2024

grammar.json is needed to map from node.type to node.kind_id
assuming that node.type is more stable across different versions of a parser

the extra files should be stored in the filesystem to save memory

# TODO better? get name-id mappings from parser binary?

import json
import os

# Load grammar.json from the tree-sitter-html sources.
# TREE_SITTER_HTML_SRC must point at the root of that source tree.
with open(
    os.path.join(os.environ["TREE_SITTER_HTML_SRC"], "src", "grammar.json"),
    "r",
    encoding="utf-8",
) as f:
    tree_sitter_html_grammar = json.load(f)

# A types.SimpleNamespace (node_kind.document) was considered and rejected:
# rule names can be ugly names like '"', which are not valid attribute names,
# so a plain dict keyed by rule name is used instead.
#
# NOTE: the position of a rule in grammar.json is not the parser's symbol id
# (those are listed in ts_symbol_identifiers in src/parser.c), so treat this
# mapping as a rough sketch only.
node_kind = {
    name: kind_id
    for kind_id, name in enumerate(tree_sitter_html_grammar["rules"])
}

print("node_kind document", node_kind["document"])

TODO better? get name-id mappings from parser binary?

probably this should be fixed upstream in tree-sitter

edit: using tree_sitter_html_grammar["rules"] is wrong: the position of a rule in grammar.json is not its symbol id.
what i was actually looking for is ts_symbol_identifiers and ts_symbol_names in src/parser.c

enum ts_symbol_identifiers {
  anon_sym_LT_BANG = 1,
  aux_sym_doctype_token1 = 2,
  anon_sym_GT = 3,
static const char * const ts_symbol_names[] = {
  [ts_builtin_sym_end] = "end",
  [anon_sym_LT_BANG] = "<!",
  [aux_sym_doctype_token1] = "doctype_token1",
  [anon_sym_GT] = ">",

parsing src/parser.c takes a bit more work than a single json.load...

parse_parser_c.py
import ast
import json
import os
import sys

import tree_sitter_languages

# Read the generated parser source as bytes; the bytes are handed to the
# C parser unchanged and node texts are compared against bytes literals later.
# TREE_SITTER_HTML_SRC must point at the root of the tree-sitter-html sources.
with open(
    os.path.join(os.environ["TREE_SITTER_HTML_SRC"], "src", "parser.c"), "rb"
) as f:
    parser_c_src = f.read()

# Parse parser.c with the C grammar to get a syntax tree we can walk.
tree_sitter_c = tree_sitter_languages.get_parser("c")
parser_c_tree = tree_sitter_c.parse(parser_c_src)

def walk_tree(tree):
    """Yield every node under *tree* in depth-first, pre-order sequence.

    *tree* is any object whose ``walk()`` returns a cursor offering
    ``node``, ``goto_first_child()``, ``goto_next_sibling()`` and
    ``goto_parent()`` (the script passes a syntax-tree root node).
    The traversal is iterative, so deep trees do not hit the recursion limit.
    """
    cursor = tree.walk()
    while True:
        yield cursor.node

        # Descend first, then move sideways.
        if cursor.goto_first_child():
            continue
        if cursor.goto_next_sibling():
            continue

        # Dead end: climb until some ancestor has an unvisited sibling.
        while True:
            if not cursor.goto_parent():
                # Back at the starting node: the walk is complete.
                # The sibling probe is kept so the cursor sees the same
                # sequence of calls as before.
                cursor.goto_next_sibling()
                return
            if cursor.goto_next_sibling():
                break

if False:

    # debug (disabled): print one line per AST node of parser.c, then exit.
    # Flip the condition above to True to inspect node types and kind ids.

    max_len = 30  # cut the JSON-quoted node text after this many characters

    for node_idx, node in enumerate(walk_tree(parser_c_tree.root_node)):

        node_text = json.dumps(node.text.decode("utf8"))
        if len(node_text) > max_len:
            node_text = node_text[:max_len] + "..."

        #pfx = "# " if is_compound else "  "
        pfx = ""
        print(pfx + f"node {node.kind_id:2d} = {node.type:25s} : {node_text:30s}")

        #if node_idx >= 100: break

    sys.exit()

# Single pass over the parser.c syntax tree that fills two tables:
#   enum_ts_symbol_identifiers: C enum identifier -> numeric symbol id
#   char_ts_symbol_names:       C enum identifier -> symbol name string
#
# The pass is a small state machine: a flag is raised when the node that
# names a table is seen, entries are collected while the flag is up, and the
# flag is dropped at the first "}" token that follows.
in_enum_ts_symbol_identifiers = False
in_char_ts_symbol_names = False
enum_name = None  # not used by the visible script; kept for compatibility
current_identifier = None  # key waiting for its value in the active table
enum_ts_symbol_identifiers = dict()
char_ts_symbol_names = dict()


for node in walk_tree(parser_c_tree.root_node):

    node_type = node.type

    # Node text is decoded only in the branches that need it. Decoding it for
    # every node would also decode the root node, i.e. the whole file.

    # "enum ts_symbol_identifiers {": the entries follow this node.
    if node_type == "type_identifier" and node.text == b"ts_symbol_identifiers":
        in_enum_ts_symbol_identifiers = True
        continue

    # "static const char * const ts_symbol_names[] = {": the entries follow.
    # NOTE: this is an exact text match, so it depends on the spacing of the
    # declarator in the generated file.
    if node_type == "pointer_declarator" and node.text == b"* const ts_symbol_names[]":
        in_char_ts_symbol_names = True
        continue

    if in_enum_ts_symbol_identifiers:

        if node_type == "identifier":
            # left-hand side of "anon_sym_GT = 3"
            current_identifier = node.text.decode("utf8")

        elif node_type == "number_literal":
            # right-hand side: pair it with the identifier seen just before
            enum_ts_symbol_identifiers[current_identifier] = (
                int(node.text.decode("utf8"))
            )
            current_identifier = None

        elif node_type == "}":
            # end of the enum body
            current_identifier = None
            in_enum_ts_symbol_identifiers = False

        continue

    if in_char_ts_symbol_names:

        if node_type == "subscript_designator":
            # "[anon_sym_GT]" -> "anon_sym_GT"
            current_identifier = node.text.decode("utf8")[1:-1]

        elif node_type == "string_literal":
            # The C string literal is evaluated as a Python literal to resolve
            # quotes and escapes; literal_eval only accepts literals.
            char_ts_symbol_names[current_identifier] = (
                ast.literal_eval(node.text.decode("utf8"))
            )
            current_identifier = None

        elif node_type == "}":
            # end of the initializer list: both tables are complete, stop.
            current_identifier = None
            in_char_ts_symbol_names = False
            break

        continue


#print("enum_ts_symbol_identifiers =", json.dumps(enum_ts_symbol_identifiers, indent=2))
#print("char_ts_symbol_names =", json.dumps(char_ts_symbol_names, indent=2))

# force user to use exact names from full_node_kind
# names can collide when a grammar uses the same name for different tokens
# example: <!doctype html>
# both the full tag and the tag_name have the token name "doctype"
#   sym_doctype = 26, // full doctype tag
#   sym__doctype = 4, // tag_name of doctype tag

from collections import Counter

# full_node_kind: C enum identifier -> symbol id, exactly as collected above
full_node_kind = enum_ts_symbol_identifiers

# Count every symbol name once up front, so the duplicate check below is a
# dict lookup instead of a rescan of all names for each symbol.
name_counts = Counter(char_ts_symbol_names.values())

# node_kind: symbol name -> symbol id. Where a name is shared by several
# symbols, the unambiguous enum identifier is used as the key instead.
node_kind = dict()
for full_name, kind_id in enum_ts_symbol_identifiers.items():
    name = char_ts_symbol_names[full_name]
    if name_counts[name] > 1:
        # duplicate name
        # force user to use full_name in full_node_kind
        # also store full_name in node_kind
        node_kind[full_name] = kind_id
    else:
        node_kind[name] = kind_id

# allow reverse lookup from id to name
# Entries are placed by id rather than by dict order, so the list stays
# correct even if ids are not listed in ascending order. Ids without an
# entry (id 0 is not part of the enum excerpt) stay None.
node_name = [None] * (max(node_kind.values(), default=0) + 1)
for key, kind_id in node_kind.items():
    node_name[kind_id] = key

#print("full_node_kind =", json.dumps(full_node_kind, indent=2))
print("node_kind =", json.dumps(node_kind, indent=2))
#print("node_kind document =", node_kind["document"])

alternative: parse a sample source that contains every possible node type
and build the mapping from the observed node.type and node.kind_id values

keywords: tree-sitter use numeric node types in scripting languages python javascript

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant