From c3e0937bcfd040cafe10ad7b4cc95ffc56285639 Mon Sep 17 00:00:00 2001
From: Nico Bliss <nico.bliss.carrascosa@gmail.com>
Date: Mon, 24 Jun 2024 11:24:00 -0700
Subject: [PATCH 1/6] added token tracking

---
 .gitignore        |  3 ++-
 src/lexer/lex.c   | 12 +++++++++++-
 src/lexer/lex.h   |  1 +
 src/lexer/token.h |  8 +++++---
 4 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/.gitignore b/.gitignore
index d163863..9a9082b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-build/
\ No newline at end of file
+build/
+shell.nix
\ No newline at end of file
diff --git a/src/lexer/lex.c b/src/lexer/lex.c
index 17795bf..5d3f844 100644
--- a/src/lexer/lex.c
+++ b/src/lexer/lex.c
@@ -1,4 +1,6 @@
 #include "lex.h"
+#include "token.h"
+#include <stdio.h>
 #include <testing/tassert.h> // tassert
 
 #include <ctype.h>
@@ -66,12 +68,16 @@ int real_lex(Lexer *l, Token *t) {
     // Clear memory and initialize
     memset(t->contents, 0, TOKEN_LENGTH);
 
+    // Set sourcefile
+    memcpy(t->source_file, &l->current_file, TOKEN_LENGTH);
+
     // First important check -- have we reached the end of the file?
     static char eof[] = "[end of file]";
     if (init == EOF) {
         strcpy(t->contents, eof);
         t->length = strlen(eof);
         t->type = TT_EOF;
+        t->position_in_file = ftell(l->fp);
         return 0;
     }
 
@@ -88,6 +94,7 @@ int real_lex(Lexer *l, Token *t) {
         strcpy(t->contents, nline);
         t->length = strlen(nline);
         t->type = TT_NEWLINE;
+        t->position_in_file = ftell(l->fp);
         return 0;
     }
 
@@ -111,6 +118,7 @@ int real_lex(Lexer *l, Token *t) {
     if (in_string(init, single_char_tokens)) {
         t->length = pos;
         t->type = ttype_one_char(init);
+        t->position_in_file = ftell(l->fp);
         return 0;
     }
 
@@ -118,7 +126,9 @@ int real_lex(Lexer *l, Token *t) {
     // If it starts with an alphanumeric character or an underscore, search
     // until we hit something which isn't.
     int c;
+    long starting_pos;
     if (is_valid_numeric_or_id_char(init)) {
+        starting_pos = ftell(l->fp);
         for (;;) {
             c = getc(l->fp);
             // If not alphanumeric or underscore, skip to end
@@ -139,6 +149,7 @@ int real_lex(Lexer *l, Token *t) {
         t->contents[pos] = '\0';
         t->type = ttype_many_chars(t->contents);
         t->length = pos;
+        t->position_in_file = starting_pos;
         return 0;
     }
 
@@ -189,7 +200,6 @@ int skip_to_token(Lexer *l) {
             fseek(l->fp, -1, SEEK_CUR);
             return 0; // Token was a slash without a * or / following it
         }
-
         if (!(cur == ' ' || cur == '\t' || cur == '/') && in_block == 0) {
             fseek(l->fp, -1, SEEK_CUR);
             return 0; // Token is next
diff --git a/src/lexer/lex.h b/src/lexer/lex.h
index 8ed0489..ad289b2 100644
--- a/src/lexer/lex.h
+++ b/src/lexer/lex.h
@@ -11,6 +11,7 @@
 // the state of a lexer.
 typedef struct {
     FILE *fp; // The file we are reading from.
+    char current_file[TOKEN_LENGTH]; // The name of source file we are reading from.
     Token unlexed[TOKEN_PUTBACKS];
     unsigned unlexed_count;
 } Lexer;
diff --git a/src/lexer/token.h b/src/lexer/token.h
index 7409706..2388199 100644
--- a/src/lexer/token.h
+++ b/src/lexer/token.h
@@ -97,7 +97,9 @@ typedef enum {
 #define TOKEN_LENGTH 256
 
 typedef struct {
-    TokenType type;              // What type of token this is.
-    char contents[TOKEN_LENGTH]; // The actual contents of the token.
-    unsigned length;             // How long the token is.
+    TokenType type;                 // What type of token this is.
+    char contents[TOKEN_LENGTH];    // The actual contents of the token.
+    unsigned length;                // How long the token is.
+    char source_file[TOKEN_LENGTH]; // The source file the token was in.
+    long position_in_file;           // Where in the file the token started, as an index.
 } Token;

From c49c3912028d178900e8934b34cf764432c073c0 Mon Sep 17 00:00:00 2001
From: Nico Bliss <nico.bliss.carrascosa@gmail.com>
Date: Mon, 24 Jun 2024 14:03:12 -0700
Subject: [PATCH 2/6] moved to working with getc wrappers

---
 src/lexer/lex.c | 39 +++++++++++++++++++++++++--------------
 src/lexer/lex.h |  9 +++++++++
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/src/lexer/lex.c b/src/lexer/lex.c
index 5d3f844..83f811f 100644
--- a/src/lexer/lex.c
+++ b/src/lexer/lex.c
@@ -18,7 +18,6 @@ int in_string(char c, char s[]) {
     }
     return 0;
 }
-
 // We will need to add more of these later, for sure
 char single_char_tokens[] = "(){}[];";
 
@@ -26,6 +25,19 @@ int is_valid_numeric_or_id_char(char c) {
     return isalnum(c) || (c == '_') || (c == '.');
 }
 
+int lexer_getchar(Lexer* l) {
+    l->position++;
+    l->buffer[0] = getc(l->fp);
+    return l->buffer[0];
+}
+
+int lexer_ungetchar(Lexer *l) {
+    // if called before getchar ever is this could cause problems
+    l->position--;
+    ungetc(l->buffer[0], l->fp);
+    return 1;
+}
+
 int real_lex(Lexer*, Token*);
 
 /**
@@ -63,7 +75,7 @@ int real_lex(Lexer *l, Token *t) {
 
     skip_to_token(l);
     // Get initial character
-    int init = getc(l->fp);
+    int init = lexer_getchar(l);
 
     // Clear memory and initialize
     memset(t->contents, 0, TOKEN_LENGTH);
@@ -77,7 +89,7 @@ int real_lex(Lexer *l, Token *t) {
         strcpy(t->contents, eof);
         t->length = strlen(eof);
         t->type = TT_EOF;
-        t->position_in_file = ftell(l->fp);
+        t->position_in_file = l->position;
         return 0;
     }
 
@@ -94,7 +106,7 @@ int real_lex(Lexer *l, Token *t) {
         strcpy(t->contents, nline);
         t->length = strlen(nline);
         t->type = TT_NEWLINE;
-        t->position_in_file = ftell(l->fp);
+        t->position_in_file = l->position;
         return 0;
     }
 
@@ -118,7 +130,7 @@ int real_lex(Lexer *l, Token *t) {
     if (in_string(init, single_char_tokens)) {
         t->length = pos;
         t->type = ttype_one_char(init);
-        t->position_in_file = ftell(l->fp);
+        t->position_in_file = l->position;
         return 0;
     }
 
@@ -128,9 +140,9 @@ int real_lex(Lexer *l, Token *t) {
     int c;
     long starting_pos;
     if (is_valid_numeric_or_id_char(init)) {
-        starting_pos = ftell(l->fp);
+        starting_pos = l->position;
         for (;;) {
-            c = getc(l->fp);
+            c = lexer_getchar(l);
             // If not alphanumeric or underscore, skip to end
             if (!is_valid_numeric_or_id_char(c))
                 break;
@@ -145,7 +157,7 @@ int real_lex(Lexer *l, Token *t) {
             t->contents[pos++] = c;
         }
         // We've ended!
-        ungetc(c, l->fp);
+        lexer_ungetchar(l);
         t->contents[pos] = '\0';
         t->type = ttype_many_chars(t->contents);
         t->length = pos;
@@ -154,7 +166,6 @@ int real_lex(Lexer *l, Token *t) {
     }
 
     // TODO - parse character or string literal
-
     return 0;
 }
 
@@ -175,10 +186,10 @@ int skip_to_token(Lexer *l) {
     int in_block = 0, pass = 0;
 
     // Read the first character
-    if ((cur = fgetc(l->fp)) != EOF) {
+    if ((cur = lexer_getchar(l)) != EOF) {
         prev = cur;
         if (!(cur == ' ' || cur == '\t' || cur == '/')) {
-            fseek(l->fp, -1, SEEK_CUR);
+            lexer_ungetchar(l);
             return 0; // Token begins immediately
         }
     } else {
@@ -186,7 +197,7 @@ int skip_to_token(Lexer *l) {
     }
 
     // Read each character from the file until EOF
-    while ((cur = fgetc(l->fp)) != EOF) {
+    while ((cur = lexer_getchar(l)) != EOF) {
         if (cur == '/' && prev == '/' && in_block == 0) {
             in_block = 1; // Single line comment
         } else if (cur == '*' && prev == '/' && in_block == 0) {
@@ -197,11 +208,11 @@ int skip_to_token(Lexer *l) {
             in_block = 0; // Out of comment
         } else if (prev == '/' && !(cur == '*' || cur == '/') &&
                    in_block == 0) {
-            fseek(l->fp, -1, SEEK_CUR);
+            lexer_ungetchar(l);
             return 0; // Token was a slash without a * or / following it
         }
         if (!(cur == ' ' || cur == '\t' || cur == '/') && in_block == 0) {
-            fseek(l->fp, -1, SEEK_CUR);
+            lexer_ungetchar(l);
             return 0; // Token is next
         }
 
diff --git a/src/lexer/lex.h b/src/lexer/lex.h
index ad289b2..ea1bdd4 100644
--- a/src/lexer/lex.h
+++ b/src/lexer/lex.h
@@ -12,6 +12,8 @@
 typedef struct {
     FILE *fp; // The file we are reading from.
     char current_file[TOKEN_LENGTH]; // The name of source file we are reading from.
+    char buffer[1]; // A buffer so that chars can be "put back"
+    int position; // The posistion of the file pointer in the current file in characters from the start
     Token unlexed[TOKEN_PUTBACKS];
     unsigned unlexed_count;
 } Lexer;
@@ -20,6 +22,13 @@ typedef struct {
 // with the next available token from the file.
 int lex(Lexer *l, Token *token);
 
+// Wrapper for getc. Takes a lexer pointer and returns the next character in the file its holding onto.
+// Updates position and stream
+int lexer_getchar(Lexer *l);
+
+// Wrapper for ungetc. Takes a lexer pointer and back-tracks 1 character using the lexer buffer. Updates position
+int lexer_ungetchar(Lexer *l);
+
 // Put a token back to be lexed again in the future.
 int unlex(Lexer *l, Token *token);
 

From 4c60de2275018ace5fde3812507aa4ed1034fb76 Mon Sep 17 00:00:00 2001
From: Nico Bliss <nico.bliss.carrascosa@gmail.com>
Date: Mon, 24 Jun 2024 14:07:35 -0700
Subject: [PATCH 3/6] use long instead of int for position tracking

---
 src/lexer/lex.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lexer/lex.h b/src/lexer/lex.h
index ea1bdd4..591f1dc 100644
--- a/src/lexer/lex.h
+++ b/src/lexer/lex.h
@@ -13,7 +13,7 @@ typedef struct {
     FILE *fp; // The file we are reading from.
     char current_file[TOKEN_LENGTH]; // The name of source file we are reading from.
     char buffer[1]; // A buffer so that chars can be "put back"
-    int position; // The posistion of the file pointer in the current file in characters from the start
+    long position; // The posistion of the file pointer in the current file in characters from the start
     Token unlexed[TOKEN_PUTBACKS];
     unsigned unlexed_count;
 } Lexer;

From 80a6aed8f1ad1ca53852dd6d0ed7138c17cf67e8 Mon Sep 17 00:00:00 2001
From: Nico Bliss <nico.bliss.carrascosa@gmail.com>
Date: Mon, 24 Jun 2024 14:16:11 -0700
Subject: [PATCH 4/6] change lexer to not fail silently

---
 src/lexer/lex.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/lexer/lex.c b/src/lexer/lex.c
index 83f811f..8848c64 100644
--- a/src/lexer/lex.c
+++ b/src/lexer/lex.c
@@ -166,6 +166,8 @@ int real_lex(Lexer *l, Token *t) {
     }
 
     // TODO - parse character or string literal
+
+    PRINT_ERROR("lexer unable to identify token starting with: %c", init);
     return 0;
 }
 

From b553140696615bb62bcfddd07af206a5a98b6119 Mon Sep 17 00:00:00 2001
From: Nico Bliss <nico.bliss.carrascosa@gmail.com>
Date: Tue, 25 Jun 2024 11:08:27 -0700
Subject: [PATCH 5/6] add check to avoid unspecified ungets

---
 src/lexer/lex.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/lexer/lex.c b/src/lexer/lex.c
index 8848c64..d2e815f 100644
--- a/src/lexer/lex.c
+++ b/src/lexer/lex.c
@@ -1,5 +1,6 @@
 #include "lex.h"
 #include "token.h"
+#include <assert.h>
 #include <stdio.h>
 #include <testing/tassert.h> // tassert
 
@@ -32,7 +33,7 @@ int lexer_getchar(Lexer* l) {
 }
 
 int lexer_ungetchar(Lexer *l) {
-    // if called before getchar ever is this could cause problems
+    assert(l->position >= 0);
     l->position--;
     ungetc(l->buffer[0], l->fp);
     return 1;

From 67ca863095fccb50e9a2b71135245307fd0eefe0 Mon Sep 17 00:00:00 2001
From: Nico Bliss <nico.bliss.carrascosa@gmail.com>
Date: Tue, 25 Jun 2024 13:53:20 -0700
Subject: [PATCH 6/6] changed to line/column tracking for the tokens

---
 src/lexer/lex.c   | 29 +++++++++++++++++++++++------
 src/lexer/lex.h   |  3 +++
 src/lexer/token.h |  3 ++-
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/src/lexer/lex.c b/src/lexer/lex.c
index d2e815f..34e9f8d 100644
--- a/src/lexer/lex.c
+++ b/src/lexer/lex.c
@@ -28,13 +28,24 @@ int is_valid_numeric_or_id_char(char c) {
 
 int lexer_getchar(Lexer* l) {
     l->position++;
+    l->last_column = l->column;
     l->buffer[0] = getc(l->fp);
+    if (l->buffer[0] == '\n') {
+        l->line++;
+        l->column = 0;
+    } else {
+        l->column++;
+    }
     return l->buffer[0];
 }
 
 int lexer_ungetchar(Lexer *l) {
     assert(l->position >= 0);
     l->position--;
+    l->column = l->last_column;
+    if (l->buffer[0] == '\n') {
+        l->line--;
+    }
     ungetc(l->buffer[0], l->fp);
     return 1;
 }
@@ -90,7 +101,8 @@ int real_lex(Lexer *l, Token *t) {
         strcpy(t->contents, eof);
         t->length = strlen(eof);
         t->type = TT_EOF;
-        t->position_in_file = l->position;
+        t->line = l->line;
+        t->column = l->column;
         return 0;
     }
 
@@ -107,7 +119,8 @@ int real_lex(Lexer *l, Token *t) {
         strcpy(t->contents, nline);
         t->length = strlen(nline);
         t->type = TT_NEWLINE;
-        t->position_in_file = l->position;
+        t->line = l->line;
+        t->column = l->column;
         return 0;
     }
 
@@ -131,7 +144,8 @@ int real_lex(Lexer *l, Token *t) {
     if (in_string(init, single_char_tokens)) {
         t->length = pos;
         t->type = ttype_one_char(init);
-        t->position_in_file = l->position;
+        t->line = l->line;
+        t->column = l->column;
         return 0;
     }
 
@@ -139,9 +153,11 @@ int real_lex(Lexer *l, Token *t) {
     // If it starts with an alphanumeric character or an underscore, search
     // until we hit something which isn't.
     int c;
-    long starting_pos;
+    int starting_line;
+    int starting_col;
     if (is_valid_numeric_or_id_char(init)) {
-        starting_pos = l->position;
+        starting_line = l->line;
+        starting_col = l->column;
         for (;;) {
             c = lexer_getchar(l);
             // If not alphanumeric or underscore, skip to end
@@ -162,7 +178,8 @@ int real_lex(Lexer *l, Token *t) {
         t->contents[pos] = '\0';
         t->type = ttype_many_chars(t->contents);
         t->length = pos;
-        t->position_in_file = starting_pos;
+        t->line = starting_line;
+        t->column = starting_col;
         return 0;
     }
 
diff --git a/src/lexer/lex.h b/src/lexer/lex.h
index 591f1dc..9f63a5b 100644
--- a/src/lexer/lex.h
+++ b/src/lexer/lex.h
@@ -14,6 +14,9 @@ typedef struct {
     char current_file[TOKEN_LENGTH]; // The name of source file we are reading from.
     char buffer[1]; // A buffer so that chars can be "put back"
     long position; // The posistion of the file pointer in the current file in characters from the start
+    int last_column;
+    int column; // The number of characters down whichever line its on
+    int line; // The number of lines it has passed so far
     Token unlexed[TOKEN_PUTBACKS];
     unsigned unlexed_count;
 } Lexer;
diff --git a/src/lexer/token.h b/src/lexer/token.h
index 2388199..f42c28b 100644
--- a/src/lexer/token.h
+++ b/src/lexer/token.h
@@ -101,5 +101,6 @@ typedef struct {
     char contents[TOKEN_LENGTH];    // The actual contents of the token.
     unsigned length;                // How long the token is.
     char source_file[TOKEN_LENGTH]; // The source file the token was in.
-    long position_in_file;           // Where in the file the token started, as an index.
+    int line;                       // which line in the file the token was found
+    int column;                     // Where in that line the token was found
 } Token;