From c3e0937bcfd040cafe10ad7b4cc95ffc56285639 Mon Sep 17 00:00:00 2001 From: Nico Bliss Date: Mon, 24 Jun 2024 11:24:00 -0700 Subject: [PATCH 1/6] added token tracking --- .gitignore | 3 ++- src/lexer/lex.c | 12 +++++++++++- src/lexer/lex.h | 1 + src/lexer/token.h | 8 +++++--- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index d163863..9a9082b 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -build/ \ No newline at end of file +build/ +shell.nix \ No newline at end of file diff --git a/src/lexer/lex.c b/src/lexer/lex.c index 17795bf..5d3f844 100644 --- a/src/lexer/lex.c +++ b/src/lexer/lex.c @@ -1,4 +1,6 @@ #include "lex.h" +#include "token.h" +#include #include // tassert #include @@ -66,12 +68,16 @@ int real_lex(Lexer *l, Token *t) { // Clear memory and initialize memset(t->contents, 0, TOKEN_LENGTH); + // Set sourcefile + memcpy(t->source_file, &l->current_file, TOKEN_LENGTH); + // First important check -- have we reached the end of the file? static char eof[] = "[end of file]"; if (init == EOF) { strcpy(t->contents, eof); t->length = strlen(eof); t->type = TT_EOF; + t->position_in_file = ftell(l->fp); return 0; } @@ -88,6 +94,7 @@ int real_lex(Lexer *l, Token *t) { strcpy(t->contents, nline); t->length = strlen(nline); t->type = TT_NEWLINE; + t->position_in_file = ftell(l->fp); return 0; } @@ -111,6 +118,7 @@ int real_lex(Lexer *l, Token *t) { if (in_string(init, single_char_tokens)) { t->length = pos; t->type = ttype_one_char(init); + t->position_in_file = ftell(l->fp); return 0; } @@ -118,7 +126,9 @@ int real_lex(Lexer *l, Token *t) { // If it starts with an alphanumeric character or an underscore, search // until we hit something which isn't. 
int c; + long starting_pos; if (is_valid_numeric_or_id_char(init)) { + starting_pos = ftell(l->fp); for (;;) { c = getc(l->fp); // If not alphanumeric or underscore, skip to end @@ -139,6 +149,7 @@ int real_lex(Lexer *l, Token *t) { t->contents[pos] = '\0'; t->type = ttype_many_chars(t->contents); t->length = pos; + t->position_in_file = starting_pos; return 0; } @@ -189,7 +200,6 @@ int skip_to_token(Lexer *l) { fseek(l->fp, -1, SEEK_CUR); return 0; // Token was a slash without a * or / following it } - if (!(cur == ' ' || cur == '\t' || cur == '/') && in_block == 0) { fseek(l->fp, -1, SEEK_CUR); return 0; // Token is next diff --git a/src/lexer/lex.h b/src/lexer/lex.h index 8ed0489..ad289b2 100644 --- a/src/lexer/lex.h +++ b/src/lexer/lex.h @@ -11,6 +11,7 @@ // the state of a lexer. typedef struct { FILE *fp; // The file we are reading from. + char current_file[TOKEN_LENGTH]; // The name of source file we are reading from. Token unlexed[TOKEN_PUTBACKS]; unsigned unlexed_count; } Lexer; diff --git a/src/lexer/token.h b/src/lexer/token.h index 7409706..2388199 100644 --- a/src/lexer/token.h +++ b/src/lexer/token.h @@ -97,7 +97,9 @@ typedef enum { #define TOKEN_LENGTH 256 typedef struct { - TokenType type; // What type of token this is. - char contents[TOKEN_LENGTH]; // The actual contents of the token. - unsigned length; // How long the token is. + TokenType type; // What type of token this is. + char contents[TOKEN_LENGTH]; // The actual contents of the token. + unsigned length; // How long the token is. + char source_file[TOKEN_LENGTH]; // The source file the token was in. + long position_in_file; // Where in the file the token started, as an index. 
} Token; From c49c3912028d178900e8934b34cf764432c073c0 Mon Sep 17 00:00:00 2001 From: Nico Bliss Date: Mon, 24 Jun 2024 14:03:12 -0700 Subject: [PATCH 2/6] moved to working with getc wrappers --- src/lexer/lex.c | 39 +++++++++++++++++++++++++-------------- src/lexer/lex.h | 9 +++++++++ 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/src/lexer/lex.c b/src/lexer/lex.c index 5d3f844..83f811f 100644 --- a/src/lexer/lex.c +++ b/src/lexer/lex.c @@ -18,7 +18,6 @@ int in_string(char c, char s[]) { } return 0; } - // We will need to add more of these later, for sure char single_char_tokens[] = "(){}[];"; @@ -26,6 +25,19 @@ int is_valid_numeric_or_id_char(char c) { return isalnum(c) || (c == '_') || (c == '.'); } +int lexer_getchar(Lexer* l) { + l->position++; + l->buffer[0] = getc(l->fp); + return l->buffer[0]; +} + +int lexer_ungetchar(Lexer *l) { + // if called before getchar ever is this could cause problems + l->position--; + ungetc(l->buffer[0], l->fp); + return 1; +} + int real_lex(Lexer*, Token*); /** @@ -63,7 +75,7 @@ int real_lex(Lexer *l, Token *t) { skip_to_token(l); // Get initial character - int init = getc(l->fp); + int init = lexer_getchar(l); // Clear memory and initialize memset(t->contents, 0, TOKEN_LENGTH); @@ -77,7 +89,7 @@ int real_lex(Lexer *l, Token *t) { strcpy(t->contents, eof); t->length = strlen(eof); t->type = TT_EOF; - t->position_in_file = ftell(l->fp); + t->position_in_file = l->position; return 0; } @@ -94,7 +106,7 @@ int real_lex(Lexer *l, Token *t) { strcpy(t->contents, nline); t->length = strlen(nline); t->type = TT_NEWLINE; - t->position_in_file = ftell(l->fp); + t->position_in_file = l->position; return 0; } @@ -118,7 +130,7 @@ int real_lex(Lexer *l, Token *t) { if (in_string(init, single_char_tokens)) { t->length = pos; t->type = ttype_one_char(init); - t->position_in_file = ftell(l->fp); + t->position_in_file = l->position; return 0; } @@ -128,9 +140,9 @@ int real_lex(Lexer *l, Token *t) { int c; long starting_pos; if 
(is_valid_numeric_or_id_char(init)) { - starting_pos = ftell(l->fp); + starting_pos = l->position; for (;;) { - c = getc(l->fp); + c = lexer_getchar(l); // If not alphanumeric or underscore, skip to end if (!is_valid_numeric_or_id_char(c)) break; @@ -145,7 +157,7 @@ int real_lex(Lexer *l, Token *t) { t->contents[pos++] = c; } // We've ended! - ungetc(c, l->fp); + lexer_ungetchar(l); t->contents[pos] = '\0'; t->type = ttype_many_chars(t->contents); t->length = pos; @@ -154,7 +166,6 @@ int real_lex(Lexer *l, Token *t) { } // TODO - parse character or string literal - return 0; } @@ -175,10 +186,10 @@ int skip_to_token(Lexer *l) { int in_block = 0, pass = 0; // Read the first character - if ((cur = fgetc(l->fp)) != EOF) { + if ((cur = lexer_getchar(l)) != EOF) { prev = cur; if (!(cur == ' ' || cur == '\t' || cur == '/')) { - fseek(l->fp, -1, SEEK_CUR); + lexer_ungetchar(l); return 0; // Token begins immediately } } else { @@ -186,7 +197,7 @@ int skip_to_token(Lexer *l) { } // Read each character from the file until EOF - while ((cur = fgetc(l->fp)) != EOF) { + while ((cur = lexer_getchar(l)) != EOF) { if (cur == '/' && prev == '/' && in_block == 0) { in_block = 1; // Single line comment } else if (cur == '*' && prev == '/' && in_block == 0) { @@ -197,11 +208,11 @@ int skip_to_token(Lexer *l) { in_block = 0; // Out of comment } else if (prev == '/' && !(cur == '*' || cur == '/') && in_block == 0) { - fseek(l->fp, -1, SEEK_CUR); + lexer_ungetchar(l); return 0; // Token was a slash without a * or / following it } if (!(cur == ' ' || cur == '\t' || cur == '/') && in_block == 0) { - fseek(l->fp, -1, SEEK_CUR); + lexer_ungetchar(l); return 0; // Token is next } diff --git a/src/lexer/lex.h b/src/lexer/lex.h index ad289b2..ea1bdd4 100644 --- a/src/lexer/lex.h +++ b/src/lexer/lex.h @@ -12,6 +12,8 @@ typedef struct { FILE *fp; // The file we are reading from. char current_file[TOKEN_LENGTH]; // The name of source file we are reading from. 
+ char buffer[1]; // A buffer so that chars can be "put back" + int position; // The posistion of the file pointer in the current file in characters from the start Token unlexed[TOKEN_PUTBACKS]; unsigned unlexed_count; } Lexer; @@ -20,6 +22,13 @@ typedef struct { // with the next available token from the file. int lex(Lexer *l, Token *token); +// Wrapper for getc. Takes a lexer pointer and returns the next character in the file it's holding onto. +// Updates position and stream +int lexer_getchar(Lexer *l); + +// Wrapper for ungetc. Takes a lexer pointer and back-tracks 1 character using the lexer buffer. Updates position +int lexer_ungetchar(Lexer *l); + // Put a token back to be lexed again in the future. int unlex(Lexer *l, Token *token); From 4c60de2275018ace5fde3812507aa4ed1034fb76 Mon Sep 17 00:00:00 2001 From: Nico Bliss Date: Mon, 24 Jun 2024 14:07:35 -0700 Subject: [PATCH 3/6] forgot longs not ints in position tracking --- src/lexer/lex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lexer/lex.h b/src/lexer/lex.h index ea1bdd4..591f1dc 100644 --- a/src/lexer/lex.h +++ b/src/lexer/lex.h @@ -13,7 +13,7 @@ typedef struct { FILE *fp; // The file we are reading from. char current_file[TOKEN_LENGTH]; // The name of source file we are reading from. 
char buffer[1]; // A buffer so that chars can be "put back" - int position; // The posistion of the file pointer in the current file in characters from the start + long position; // The posistion of the file pointer in the current file in characters from the start Token unlexed[TOKEN_PUTBACKS]; unsigned unlexed_count; } Lexer; From 80a6aed8f1ad1ca53852dd6d0ed7138c17cf67e8 Mon Sep 17 00:00:00 2001 From: Nico Bliss Date: Mon, 24 Jun 2024 14:16:11 -0700 Subject: [PATCH 4/6] change lexer to not fail silently --- src/lexer/lex.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lexer/lex.c b/src/lexer/lex.c index 83f811f..8848c64 100644 --- a/src/lexer/lex.c +++ b/src/lexer/lex.c @@ -166,6 +166,8 @@ int real_lex(Lexer *l, Token *t) { } // TODO - parse character or string literal + + PRINT_ERROR("lexer unable to identify token starting with: %c", init); return 0; } From b553140696615bb62bcfddd07af206a5a98b6119 Mon Sep 17 00:00:00 2001 From: Nico Bliss Date: Tue, 25 Jun 2024 11:08:27 -0700 Subject: [PATCH 5/6] add check to avoid unspecified unget's --- src/lexer/lex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lexer/lex.c b/src/lexer/lex.c index 8848c64..d2e815f 100644 --- a/src/lexer/lex.c +++ b/src/lexer/lex.c @@ -1,5 +1,6 @@ #include "lex.h" #include "token.h" +#include #include #include // tassert @@ -32,7 +33,7 @@ int lexer_getchar(Lexer* l) { } int lexer_ungetchar(Lexer *l) { - // if called before getchar ever is this could cause problems + assert(l->position >= 0); l->position--; ungetc(l->buffer[0], l->fp); return 1; From 67ca863095fccb50e9a2b71135245307fd0eefe0 Mon Sep 17 00:00:00 2001 From: Nico Bliss Date: Tue, 25 Jun 2024 13:53:20 -0700 Subject: [PATCH 6/6] changed to line/column tracking for the tokens --- src/lexer/lex.c | 29 +++++++++++++++++++++++------ src/lexer/lex.h | 3 +++ src/lexer/token.h | 3 ++- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/src/lexer/lex.c b/src/lexer/lex.c index d2e815f..34e9f8d 
100644 --- a/src/lexer/lex.c +++ b/src/lexer/lex.c @@ -28,13 +28,24 @@ int is_valid_numeric_or_id_char(char c) { int lexer_getchar(Lexer* l) { l->position++; + l->last_column = l->column; l->buffer[0] = getc(l->fp); + if (l->buffer[0] == '\n') { + l->line++; + l->column = 0; + } else { + l->column++; + } return l->buffer[0]; } int lexer_ungetchar(Lexer *l) { assert(l->position >= 0); l->position--; + l->column = l->last_column; + if (l->buffer[0] == '\n') { + l->line--; + } ungetc(l->buffer[0], l->fp); return 1; } @@ -90,7 +101,8 @@ int real_lex(Lexer *l, Token *t) { strcpy(t->contents, eof); t->length = strlen(eof); t->type = TT_EOF; - t->position_in_file = l->position; + t->line = l->line; + t->column = l->column; return 0; } @@ -107,7 +119,8 @@ int real_lex(Lexer *l, Token *t) { strcpy(t->contents, nline); t->length = strlen(nline); t->type = TT_NEWLINE; - t->position_in_file = l->position; + t->line = l->line; + t->column = l->column; return 0; } @@ -131,7 +144,8 @@ int real_lex(Lexer *l, Token *t) { if (in_string(init, single_char_tokens)) { t->length = pos; t->type = ttype_one_char(init); - t->position_in_file = l->position; + t->line = l->line; + t->column = l->column; return 0; } @@ -139,9 +153,11 @@ int real_lex(Lexer *l, Token *t) { // If it starts with an alphanumeric character or an underscore, search // until we hit something which isn't. 
int c; - long starting_pos; + int starting_line; + int starting_col; if (is_valid_numeric_or_id_char(init)) { - starting_pos = l->position; + starting_line = l->line; + starting_col = l->column; for (;;) { c = lexer_getchar(l); // If not alphanumeric or underscore, skip to end @@ -162,7 +178,8 @@ int real_lex(Lexer *l, Token *t) { t->contents[pos] = '\0'; t->type = ttype_many_chars(t->contents); t->length = pos; - t->position_in_file = starting_pos; + t->line = starting_line; + t->column = starting_col; return 0; } diff --git a/src/lexer/lex.h b/src/lexer/lex.h index 591f1dc..9f63a5b 100644 --- a/src/lexer/lex.h +++ b/src/lexer/lex.h @@ -14,6 +14,9 @@ typedef struct { char current_file[TOKEN_LENGTH]; // The name of source file we are reading from. char buffer[1]; // A buffer so that chars can be "put back" long position; // The posistion of the file pointer in the current file in characters from the start + int last_column; + int column; // The number of characters down whichever line it's on + int line; // The number of lines it has passed so far Token unlexed[TOKEN_PUTBACKS]; unsigned unlexed_count; } Lexer; diff --git a/src/lexer/token.h b/src/lexer/token.h index 2388199..f42c28b 100644 --- a/src/lexer/token.h +++ b/src/lexer/token.h @@ -101,5 +101,6 @@ typedef struct { char contents[TOKEN_LENGTH]; // The actual contents of the token. unsigned length; // How long the token is. char source_file[TOKEN_LENGTH]; // The source file the token was in. - long position_in_file; // Where in the file the token started, as an index. + int line; // Which line in the file the token was found + int column; // Where in that line the token was found } Token;