127 changes: 103 additions & 24 deletions Parser/asdl_c.py
@@ -22,6 +22,66 @@
"constant": "PyBaseObject_Type",
}

AST_EVENT_HELPER_C = r"""
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "firmament2.h"

/* Emit one AST event line, gated by FIRMAMENT2_ENABLE. */
static void
emit_ast_event_json(const char *kind,
int lineno, int col_offset,
int end_lineno, int end_col_offset)
{
if (!_firm2_enabled()) {
return;
}

/* Envelope */
unsigned long long eid = _firm2_next_eid();
unsigned long pid = _firm2_pid();
unsigned long long tid = _firm2_tid();
long long ts = _firm2_now_ns();

/* Source scope */
const char *filename = _firm2_current_filename();
const char *source_id = _firm2_current_source_id_hex();
if (!filename) filename = "<unknown>";
if (!source_id) source_id = "";

char json_buf[640];
(void)snprintf(
json_buf,
sizeof(json_buf),
"{"
"\"type\":\"ast\","
"\"envelope\":{"
"\"event_id\":%llu,"
"\"pid\":%lu,"
"\"tid\":%llu,"
"\"ts_ns\":%lld"
"},"
"\"payload\":{"
"\"kind\":\"%s\","
"\"lineno\":%d,"
"\"col_offset\":%d,"
"\"end_lineno\":%d,"
"\"end_col_offset\":%d,"
"\"filename\":\"%s\","
"\"source_id\":\"%s\""
"}"
"}",
eid, pid, tid, ts,
kind,
lineno, col_offset, end_lineno, end_col_offset,
filename, source_id
);
printf("%s\n", json_buf);
fflush(stdout);
}
"""

def get_c_type(name):
"""Return a string for the C name of the type.

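For orientation, each call to this helper prints one self-contained NDJSON object to stdout. Going by the snprintf format string above, an emitted line has the following shape (the concrete values here are hypothetical, shown only to illustrate the layout):

{"type":"ast","envelope":{"event_id":17,"pid":4242,"tid":1,"ts_ns":1700000000123456789},"payload":{"kind":"Name","lineno":3,"col_offset":4,"end_lineno":3,"end_col_offset":7,"filename":"example.py","source_id":"ab12cd34"}}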
@@ -407,61 +467,78 @@ def visitProduct(self, prod, name):
self.get_args(prod.attributes),
union=False)


class FunctionVisitor(PrototypeVisitor):
"""Visitor to generate constructor functions for AST."""

def emit_function(self, name, ctype, args, attrs, union=True):
def emit(s, depth=0, reflow=True):
self.emit(s, depth, reflow)
argstr = ", ".join(["%s %s" % (atype, aname)
for atype, aname, opt in args + attrs])

# Build full C argument list (fields + attributes)
all_args = args + attrs
argstr = ", ".join(f"{atype} {aname}" for atype, aname, opt in all_args)
if argstr:
argstr += ", PyArena *arena"
else:
argstr = "PyArena *arena"
self.emit("%s" % ctype, 0)
emit("%s(%s)" % (ast_func_name(name), argstr))

# Function signature
self.emit(f"{ctype}", 0)
emit(f"{ast_func_name(name)}({argstr})")
emit("{")
emit("%s p;" % ctype, 1)
emit(f"{ctype} p;", 1)

# Required argument checks (non-optional, non-int)
for argtype, argname, opt in args:
if not opt and argtype != "int":
emit("if (!%s) {" % argname, 1)
emit(f"if (!{argname}) {{", 1)
emit("PyErr_SetString(PyExc_ValueError,", 2)
msg = "field '%s' is required for %s" % (argname, name)
emit(' "%s");' % msg,
2, reflow=False)
emit('return NULL;', 2)
emit('}', 1)
msg = f"field '{argname}' is required for {name}"
emit(f' "{msg}");', 2, reflow=False)
emit("return NULL;", 2)
emit("}", 1)

emit("p = (%s)_PyArena_Malloc(arena, sizeof(*p));" % ctype, 1);
# Allocate node
emit(f"p = ({ctype})_PyArena_Malloc(arena, sizeof(*p));", 1)
emit("if (!p)", 1)
emit("return NULL;", 2)
emit(" return NULL;", 2)

# Initialize node fields and attributes
if union:
self.emit_body_union(name, args, attrs)
else:
self.emit_body_struct(name, args, attrs)

# Emit JSON event for nodes with location info
attr_names = {aname for _, aname, _ in attrs}
if "lineno" in attr_names and "col_offset" in attr_names:
end_lineno_expr = "end_lineno" if "end_lineno" in attr_names else "lineno"
end_col_expr = "end_col_offset" if "end_col_offset" in attr_names else "col_offset"
emit(
f'emit_ast_event_json("{name}", lineno, col_offset, {end_lineno_expr}, {end_col_expr});',
1, reflow=False
)

emit("return p;", 1)
emit("}")
emit("")

def emit_body_union(self, name, args, attrs):
def emit(s, depth=0, reflow=True):
self.emit(s, depth, reflow)
emit("p->kind = %s_kind;" % name, 1)
emit(f"p->kind = {name}_kind;", 1)
for argtype, argname, opt in args:
emit("p->v.%s.%s = %s;" % (name, argname, argname), 1)
emit(f"p->v.{name}.{argname} = {argname};", 1)
for argtype, argname, opt in attrs:
emit("p->%s = %s;" % (argname, argname), 1)
emit(f"p->{argname} = {argname};", 1)

def emit_body_struct(self, name, args, attrs):
def emit(s, depth=0, reflow=True):
self.emit(s, depth, reflow)
for argtype, argname, opt in args:
emit("p->%s = %s;" % (argname, argname), 1)
emit(f"p->{argname} = {argname};", 1)
for argtype, argname, opt in attrs:
emit("p->%s = %s;" % (argname, argname), 1)

emit(f"p->{argname} = {argname};", 1)

class PickleVisitor(EmitVisitor):

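To make the new branch concrete, here is a rough sketch of what emit_function would now generate for a location-carrying node, using Name as an example; the exact signature, field assignments, and required-field checks come from the ASDL grammar, so treat this as an approximation rather than verbatim generator output:

expr_ty
_PyAST_Name(identifier id, expr_context_ty ctx, int lineno, int col_offset,
            int end_lineno, int end_col_offset, PyArena *arena)
{
    expr_ty p;
    if (!id) {
        PyErr_SetString(PyExc_ValueError,
                        "field 'id' is required for Name");
        return NULL;
    }
    p = (expr_ty)_PyArena_Malloc(arena, sizeof(*p));
    if (!p)
        return NULL;
    p->kind = Name_kind;
    p->v.Name.id = id;
    p->v.Name.ctx = ctx;
    p->lineno = lineno;
    p->col_offset = col_offset;
    p->end_lineno = end_lineno;
    p->end_col_offset = end_col_offset;
    /* Injected by the new emit_function branch above. */
    emit_ast_event_json("Name", lineno, col_offset, end_lineno, end_col_offset);
    return p;
}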
Expand Down Expand Up @@ -1009,7 +1086,7 @@ def visitModule(self, mod):
else {
if (PyErr_WarnFormat(
PyExc_DeprecationWarning, 1,
"Field %R is missing from %.400s._field_types. "
"Field '%U' is missing from %.400s._field_types. "
"This will become an error in Python 3.15.",
name, Py_TYPE(self)->tp_name
) < 0) {
@@ -1044,7 +1121,7 @@ def visitModule(self, mod):
// simple field (e.g., identifier)
if (PyErr_WarnFormat(
PyExc_DeprecationWarning, 1,
"%.400s.__init__ missing 1 required positional argument: %R. "
"%.400s.__init__ missing 1 required positional argument: '%U'. "
"This will become an error in Python 3.15.",
Py_TYPE(self)->tp_name, name
) < 0) {
@@ -2249,7 +2326,6 @@ def generate_ast_state(module_state, f):
f.write(' PyObject *' + s + ';\n')
f.write('};')


def generate_ast_fini(module_state, f):
f.write(textwrap.dedent("""
void _PyAST_Fini(PyInterpreterState *interp)
@@ -2266,7 +2342,6 @@ def generate_ast_fini(module_state, f):

"""))


def generate_module_def(mod, metadata, f, internal_h):
# Gather all the data needed for ModuleSpec
state_strings = {
@@ -2326,6 +2401,9 @@ def generate_module_def(mod, metadata, f, internal_h):
}
""").strip(), file=f)

# Firmament2: helper used by generated _PyAST_* constructors.
f.write(AST_EVENT_HELPER_C)

generate_ast_fini(module_state, f)

f.write('static int init_identifiers(struct ast_state *state)\n')
@@ -2337,6 +2415,7 @@ def generate_module_def(mod, metadata, f, internal_h):
f.write(' return 0;\n')
f.write('};\n\n')


def write_header(mod, metadata, f):
f.write(textwrap.dedent("""
#ifndef Py_INTERNAL_AST_H
151 changes: 136 additions & 15 deletions Parser/lexer/lexer.c
@@ -2,9 +2,144 @@
#include "pycore_token.h"
#include "pycore_unicodeobject.h"
#include "errcode.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#include "state.h"
#include "../tokenizer/helpers.h"
#include "firmament2.h" /* gate + current source info */

/* The internal lexer function is defined later in this file */
static int tok_get(struct tok_state *tok, struct token *token);

/* Optional envelope helpers (decls may also live in firmament2.h) */
extern unsigned long long _firm2_next_eid(void);
extern unsigned long _firm2_pid(void);
extern unsigned long long _firm2_tid(void);
extern long long _firm2_now_ns(void);

/* Emit one tokenizer event as JSON (guarded by FIRMAMENT2_ENABLE). */
static void
emit_tokenizer_event_json(struct tok_state *tok, struct token *token, int type)
{
if (!_firm2_enabled()) {
return;
}

/* Envelope */
unsigned long long eid = _firm2_next_eid();
unsigned long pid = _firm2_pid();
unsigned long long tid = _firm2_tid();
long long ts = _firm2_now_ns();

/* Scope (compilation unit) */
const char *filename = _firm2_current_filename();
const char *source_id = _firm2_current_source_id_hex();
if (!filename) filename = "<unknown>";
if (!source_id) source_id = "";

char kind_buf[32];
char value_buf[256];
char json_buf[800];

/* Render the numeric token type as a decimal string. */
snprintf(kind_buf, sizeof(kind_buf), "%d", type);

/* Copy the token text into value_buf, escaping it for embedding in a JSON string. */
const char *start = token->start;
const char *end = token->end;
if (start == NULL || end == NULL || end <= start) {
value_buf[0] = '\0';
} else {
int src_len = (int)(end - start);
int out_idx = 0;
for (int i = 0; i < src_len && out_idx < (int)sizeof(value_buf) - 1; i++) {
unsigned char c = (unsigned char)start[i];
if (c == '"' || c == '\\') {
if (out_idx < (int)sizeof(value_buf) - 2) {
value_buf[out_idx++] = '\\';
value_buf[out_idx++] = (char)c;
} else {
break;
}
}
else if (c == '\n' || c == '\r' || c == '\t') {
if (out_idx < (int)sizeof(value_buf) - 2) {
value_buf[out_idx++] = '\\';
value_buf[out_idx++] = (c == '\n') ? 'n' : (c == '\r' ? 'r' : 't');
} else {
break;
}
}
else if (c < 0x20) {
continue; /* skip other control chars */
}
else {
value_buf[out_idx++] = (char)c;
}
}
value_buf[out_idx] = '\0';
}

/* Line/column */
int lineno = 0;
int col_offset = 0;
if (tok != NULL) {
lineno = tok->lineno;
if (token->start != NULL && tok->line_start != NULL) {
col_offset = (int)(token->start - tok->line_start);
if (col_offset < 0) col_offset = 0;
}
}

/* Build NDJSON line */
(void)snprintf(
json_buf,
sizeof(json_buf),
"{"
"\"type\":\"tokenizer\","
"\"envelope\":{"
"\"event_id\":%llu,"
"\"pid\":%lu,"
"\"tid\":%llu,"
"\"ts_ns\":%lld"
"},"
"\"payload\":{"
"\"kind\":\"%s\","
"\"value\":\"%s\","
"\"lineno\":%d,"
"\"col_offset\":%d,"
"\"filename\":\"%s\","
"\"source_id\":\"%s\""
"}"
"}",
eid, pid, tid, ts,
kind_buf, value_buf, lineno, col_offset, filename, source_id
);

printf("%s\n", json_buf);
fflush(stdout);
}

/* Interpose on token production to emit JSON per token. */
int
_PyTokenizer_Get(struct tok_state *tok, struct token *token)
{
/* Call the real lexer */
int result = tok_get(tok, token);
if (tok->decoding_erred) {
result = ERRORTOKEN;
tok->done = E_DECODE;
}

/* Emit JSON event for every token we successfully produced (when enabled). */
if (token != NULL && token->start != NULL && token->end != NULL) {
emit_tokenizer_event_json(tok, token, result);
}
return result;
}


/* Alternate tab spacing */
#define ALTTABSIZE 1
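As with the AST helper, each token becomes one NDJSON line on stdout; the escaping loop above keeps that line valid JSON. For a hypothetical STRING token whose raw source text is "hi" (quotes included) at line 1, column 6, the emitted record would look roughly like this (all values illustrative only; kind is simply the numeric token type rendered as text, and the quotes in the token text are backslash-escaped inside value):

{"type":"tokenizer","envelope":{"event_id":18,"pid":4242,"tid":1,"ts_ns":1700000000123456999},"payload":{"kind":"3","value":"\"hi\"","lineno":1,"col_offset":6,"filename":"example.py","source_id":"ab12cd34"}}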
@@ -539,9 +674,6 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
return MAKE_TOKEN(ERRORTOKEN);
}
}
else if (c == EOF && PyErr_Occurred()) {
return MAKE_TOKEN(ERRORTOKEN);
}
else {
break;
}
@@ -1379,7 +1511,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", c));
}

if( c == '=' && INSIDE_FSTRING_EXPR_AT_TOP(current_tok)) {
if( c == '=' && INSIDE_FSTRING_EXPR(current_tok)) {
current_tok->in_debug = 1;
}

@@ -1622,14 +1754,3 @@ tok_get(struct tok_state *tok, struct token *token)
return tok_get_fstring_mode(tok, current_tok, token);
}
}

int
_PyTokenizer_Get(struct tok_state *tok, struct token *token)
{
int result = tok_get(tok, token);
if (tok->decoding_erred) {
result = ERRORTOKEN;
tok->done = E_DECODE;
}
return result;
}