You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
grammar.json is needed to map from node.type to node.kind_id,
assuming that node.type is more stable across different versions of a parser.
The extra files should be stored in the filesystem to save memory.
# TODO better? get name-id mappings from parser binary?

import json
import os

# Load the grammar definition from the tree-sitter-html source checkout.
# TREE_SITTER_HTML_SRC must point at the root of that checkout.
with open(
    os.environ["TREE_SITTER_HTML_SRC"] + "/src/grammar.json",
    "r",
    encoding="utf-8",
) as f:
    tree_sitter_html_grammar = json.load(f)

# no. names can be ugly names like '"'
# import types
# node_kind = types.SimpleNamespace(**{
#     name: id for id, name in enumerate(tree_sitter_html_grammar["rules"])
# })
# print("node_kind.document", node_kind.document)

# Map each rule name to its position in grammar.json's "rules" object.
# NOTE: the author later found that this position is NOT the node.kind_id;
# the real ids are in ts_symbol_identifiers / ts_symbol_names in src/parser.c.
node_kind = {
    name: kind_id
    for kind_id, name in enumerate(tree_sitter_html_grammar["rules"])
}

print("node_kind document", node_kind["document"])
TODO: is there a better way, e.g. getting the name-to-id mappings from the parser binary?
Probably this should be fixed upstream in tree-sitter.
Edit: using tree_sitter_html_grammar["rules"] is wrong.
I was looking for ts_symbol_identifiers and ts_symbol_names in src/parser.c.
Parsing src/parser.c takes a bit more than json.load ...
parse_parser_c.py
import ast
import json
import os
import sys

import tree_sitter_languages

# Read the generated C parser of tree-sitter-html as raw bytes, because the
# tree-sitter parser consumes bytes.
# TREE_SITTER_HTML_SRC must point at the root of the grammar's source checkout.
with open(os.environ["TREE_SITTER_HTML_SRC"] + "/src/parser.c", "rb") as f:
    parser_c_src = f.read()

# Parse parser.c with the C grammar so the symbol tables can be read from its
# syntax tree.
tree_sitter_c = tree_sitter_languages.get_parser("c")
parser_c_tree = tree_sitter_c.parse(parser_c_src)
def walk_tree(tree):
    """Yield every node reachable from *tree* in depth-first pre-order.

    *tree* is any object whose ``walk()`` returns a cursor offering ``node``,
    ``goto_first_child()``, ``goto_next_sibling()`` and ``goto_parent()``
    (the script passes a tree-sitter root node). The traversal is iterative,
    so deep trees cannot hit the recursion limit.
    """
    cursor = tree.walk()
    while True:
        yield cursor.node
        # Descend first, then move sideways.
        if cursor.goto_first_child():
            continue
        if cursor.goto_next_sibling():
            continue
        # Leaf without a following sibling: climb until an ancestor has one.
        while True:
            if not cursor.goto_parent():
                # Back at the starting node: traversal is complete.
                return
            if cursor.goto_next_sibling():
                break


if False:
    # debug: print AST
    # Disabled by default; flip the condition to dump every node and exit.
    node_idx = 0
    max_len = 30
    for node in walk_tree(parser_c_tree.root_node):
        node_text = json.dumps(node.text.decode("utf8"))
        if len(node_text) > max_len:
            node_text = node_text[0:max_len] + "..."
        #pfx = "# " if is_compound else " "
        pfx = ""
        print(pfx + f"node {node.kind_id:2d} = {node.type:25s} : {node_text:30s}")
        node_idx += 1
        #if node_idx > 100: break
    sys.exit()
# State for the single pass over parser.c's syntax tree.
in_enum_ts_symbol_identifiers = False  # inside `enum ts_symbol_identifiers {...}`
in_char_ts_symbol_names = False  # inside the `ts_symbol_names[]` initializer
enum_name = None  # not used below
current_identifier = None  # symbol identifier waiting for its value
enum_ts_symbol_identifiers = dict()  # symbol identifier -> numeric id
char_ts_symbol_names = dict()  # symbol identifier -> node type name

for node in walk_tree(parser_c_tree.root_node):
    # Section starts. The node text is only decoded where it is recorded,
    # so the large container nodes (the root is the whole file) are never
    # decoded.
    if node.type == "type_identifier" and node.text == b"ts_symbol_identifiers":
        in_enum_ts_symbol_identifiers = True
        continue
    if node.type == "pointer_declarator" and node.text == b"* const ts_symbol_names[]":
        in_char_ts_symbol_names = True
        continue

    if in_enum_ts_symbol_identifiers:
        # Enumerators arrive as an identifier followed by its number literal.
        if node.type == "identifier":
            current_identifier = node.text.decode("utf8")
        elif node.type == "number_literal":
            enum_ts_symbol_identifiers[current_identifier] = int(
                node.text.decode("utf8")
            )
            current_identifier = None
        elif node.type == "}":
            # End of the enum body.
            current_identifier = None
            in_enum_ts_symbol_identifiers = False
        continue

    if in_char_ts_symbol_names:
        # Entries arrive as a `[identifier]` designator followed by a string.
        if node.type == "subscript_designator":
            # Strip the surrounding brackets of `[sym_name]`.
            current_identifier = node.text.decode("utf8")[1:-1]
        elif node.type == "string_literal":
            # Reuse Python's literal parser to unescape the C string literal.
            char_ts_symbol_names[current_identifier] = ast.literal_eval(
                node.text.decode("utf8")
            )
            current_identifier = None
        elif node.type == "}":
            # End of the initializer: both tables are complete.
            current_identifier = None
            in_char_ts_symbol_names = False
            break
        continue

#print("enum_ts_symbol_identifiers =", json.dumps(enum_ts_symbol_identifiers, indent=2))
#print("char_ts_symbol_names =", json.dumps(char_ts_symbol_names, indent=2))

# force user to use exact names from full_node_kind
# names can collide when grammars
# use the same names for different tokens...
# example: <!doctype html>
# both the full tag and the tag_name have the token name "doctype"
# sym_doctype = 26, // full doctype tag
# sym__doctype = 4, // tag_name of doctype tag

full_node_kind = enum_ts_symbol_identifiers

node_kind = dict()
from collections import Counter

# Count each display name once up front, so duplicate detection is a dict
# lookup instead of a scan over all names for every symbol.
name_counts = Counter(char_ts_symbol_names.values())

# Map display name -> numeric id. Where several symbols share one display
# name, the unique full identifier is used as the key instead.
node_kind = dict()
for full_name, kind_id in enum_ts_symbol_identifiers.items():
    name = char_ts_symbol_names[full_name]
    if name_counts[name] > 1:
        # duplicate name
        # force user to use full_name in full_node_kind
        # also store full_name in node_kind
        node_kind[full_name] = kind_id
        continue
    node_kind[name] = kind_id

# allow reverse lookup from id to name
# The list is indexed by the numeric id itself, so the lookup stays correct
# even if the ids are not contiguous or not in insertion order. Ids without
# a symbol (such as 0) map to None.
node_name = [None] * (max(node_kind.values(), default=0) + 1)
for name, kind_id in node_kind.items():
    node_name[kind_id] = name

#print("full_node_kind =", json.dumps(full_node_kind, indent=2))
print("node_kind =", json.dumps(node_kind, indent=2))

#print("node_kind document =", node_kind["document"])
Alternative: parse a source that contains all possible node types,
and build the mapping from the node.type and node.kind_id values.
Keywords: tree-sitter, use numeric node types in scripting languages, python, javascript
The text was updated successfully, but these errors were encountered:
grammar.json is needed to map from node.type to node.kind_id,
assuming that node.type is more stable across different versions of a parser.
The extra files should be stored in the filesystem to save memory.
Probably this should be fixed upstream in tree-sitter.
Edit: using tree_sitter_html_grammar["rules"] is wrong.
I was looking for ts_symbol_identifiers and ts_symbol_names in src/parser.c.
Parsing src/parser.c takes a bit more than json.load ...
parse_parser_c.py
Alternative: parse a source that contains all possible node types,
and build the mapping from the node.type and node.kind_id values.
Keywords: tree-sitter, use numeric node types in scripting languages, python, javascript
The text was updated successfully, but these errors were encountered: