Skip to content

Commit

Permalink
Merge pull request #34 from NicoBliss/main
Browse files Browse the repository at this point in the history
add token tracking
  • Loading branch information
adamhutchings authored Jun 26, 2024
2 parents d493287 + 78486dc commit b4b8657
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 13 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
build/
build/
shell.nix
60 changes: 51 additions & 9 deletions src/lexer/lex.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#include "lex.h"
#include "token.h"
#include <assert.h>
#include <stdio.h>
#include <testing/tassert.h> // tassert
#include <testing/test_utils.h>

Expand Down Expand Up @@ -83,6 +86,30 @@ int is_valid_numeric_or_id_char(char c) {
return isalnum(c) || (c == '_') || (c == '.');
}

/**
 * Read the next character from the lexer's file and update position tracking.
 *
 * The character is also saved in l->buffer so that lexer_ungetchar can push
 * it back later.
 *
 * @param l  Lexer whose stream (l->fp) is read.
 * @return   The character read, as an unsigned char converted to int, or EOF
 *           when getc reports end of file or an error.
 */
int lexer_getchar(Lexer *l) {
    // Keep the full int from getc: narrowing it to char before comparing or
    // returning makes EOF indistinguishable from the byte 0xFF (or, where
    // char is unsigned, makes EOF undetectable altogether).
    int c = getc(l->fp);

    l->position++;
    l->last_column = l->column;
    l->buffer[0] = (char)c; // remembered for lexer_ungetchar

    if (c == '\n') {
        l->line++;
        l->column = 0;
    } else {
        l->column++;
    }
    return c;
}

/**
 * Push the character most recently stored in l->buffer back onto the stream
 * and rewind the bookkeeping that lexer_getchar advanced for it.
 *
 * @param l  Lexer to rewind by one character.
 * @return   Always 1; the result of ungetc is not checked.
 */
int lexer_ungetchar(Lexer *l) {
    assert(l->position >= 0);

    const char last = l->buffer[0];

    // Undo the position/column/line updates made when `last` was read.
    l->position -= 1;
    l->column = l->last_column;
    if (last == '\n') {
        l->line -= 1;
    }

    ungetc(last, l->fp);
    return 1;
}

int real_lex(Lexer*, Token*);

/**
Expand Down Expand Up @@ -120,17 +147,22 @@ int real_lex(Lexer *l, Token *t) {

skip_to_token(l);
// Get initial character
int init = getc(l->fp);
int init = lexer_getchar(l);

// Clear memory and initialize
memset(t->contents, 0, TOKEN_LENGTH);

// Set sourcefile
memcpy(t->source_file, &l->current_file, TOKEN_LENGTH);

// First important check -- have we reached the end of the file?
static char eof[] = "[end of file]";
if (init == EOF) {
strcpy(t->contents, eof);
t->length = strlen(eof);
t->type = TT_EOF;
t->line = l->line;
t->column = l->column;
return 0;
}

Expand All @@ -147,6 +179,8 @@ int real_lex(Lexer *l, Token *t) {
strcpy(t->contents, nline);
t->length = strlen(nline);
t->type = TT_NEWLINE;
t->line = l->line;
t->column = l->column;
return 0;
}

Expand All @@ -170,16 +204,22 @@ int real_lex(Lexer *l, Token *t) {
if (in_string(init, single_char_tokens)) {
t->length = pos;
t->type = ttype_one_char(init);
t->line = l->line;
t->column = l->column;
return 0;
}

// LEXING NUMERIC LITERAL OR IDENTIFIER
// If it starts with an alphanumeric character or an underscore, search
// until we hit something which isn't.
int c;
int starting_line;
int starting_col;
if (is_valid_numeric_or_id_char(init)) {
starting_line = l->line;
starting_col = l->column;
for (;;) {
c = getc(l->fp);
c = lexer_getchar(l);
// If not alphanumeric or underscore, skip to end
if (!is_valid_numeric_or_id_char(c))
break;
Expand All @@ -194,10 +234,12 @@ int real_lex(Lexer *l, Token *t) {
t->contents[pos++] = c;
}
// We've ended!
ungetc(c, l->fp);
lexer_ungetchar(l);
t->contents[pos] = '\0';
t->type = ttype_many_chars(t->contents);
t->length = pos;
t->line = starting_line;
t->column = starting_col;
return 0;
}

Expand All @@ -219,6 +261,7 @@ int real_lex(Lexer *l, Token *t) {

// TODO - parse character or string literal

PRINT_ERROR("lexer unable to identify token starting with: %c", init);
return 0;
}

Expand All @@ -239,18 +282,18 @@ int skip_to_token(Lexer *l) {
int in_block = 0, pass = 0;

// Read the first character
if ((cur = fgetc(l->fp)) != EOF) {
if ((cur = lexer_getchar(l)) != EOF) {
prev = cur;
if (!(cur == ' ' || cur == '\t' || cur == '/')) {
fseek(l->fp, -1, SEEK_CUR);
lexer_ungetchar(l);
return 0; // Token begins immediately
}
} else {
return -1; // File done, no more tokens
}

// Read each character from the file until EOF
while ((cur = fgetc(l->fp)) != EOF) {
while ((cur = lexer_getchar(l)) != EOF) {
if (cur == '/' && prev == '/' && in_block == 0) {
in_block = 1; // Single line comment
} else if (cur == '*' && prev == '/' && in_block == 0) {
Expand All @@ -261,12 +304,11 @@ int skip_to_token(Lexer *l) {
in_block = 0; // Out of comment
} else if (prev == '/' && !(cur == '*' || cur == '/') &&
in_block == 0) {
fseek(l->fp, -1, SEEK_CUR);
lexer_ungetchar(l);
return 0; // Token was a slash without a * or / following it
}

if (!(cur == ' ' || cur == '\t' || cur == '/') && in_block == 0) {
fseek(l->fp, -1, SEEK_CUR);
lexer_ungetchar(l);
return 0; // Token is next
}

Expand Down
13 changes: 13 additions & 0 deletions src/lexer/lex.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@
// the state of a lexer.
typedef struct {
FILE *fp; // The file we are reading from.
char current_file[TOKEN_LENGTH]; // The name of the source file we are reading from.
char buffer[1]; // The character most recently read by lexer_getchar, kept so lexer_ungetchar can put it back.
long position; // Characters consumed so far: incremented by lexer_getchar, decremented by lexer_ungetchar.
int last_column; // Value of column before the most recent lexer_getchar; restored by lexer_ungetchar.
int column; // Characters read on the current line; reset to 0 when a newline is read.
int line; // Number of newlines read so far.
Token unlexed[TOKEN_PUTBACKS]; // NOTE(review): presumably tokens handed back via unlex() -- confirm against lex.c.
unsigned unlexed_count; // NOTE(review): presumably the number of valid entries in unlexed -- confirm.
} Lexer;
Expand All @@ -19,6 +25,13 @@ typedef struct {
// with the next available token from the file.
int lex(Lexer *l, Token *token);

// Wrapper for getc. Takes a lexer pointer and returns the next character from the file it is reading.
// Updates the lexer's position, line and column tracking.
int lexer_getchar(Lexer *l);

// Wrapper for ungetc. Takes a lexer pointer and backtracks one character using the lexer buffer. Updates position, line and column.
int lexer_ungetchar(Lexer *l);

// Put a token back to be lexed again in the future.
int unlex(Lexer *l, Token *token);

Expand Down
9 changes: 6 additions & 3 deletions src/lexer/token.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,10 @@ typedef enum {
#define TOKEN_LENGTH 256

typedef struct {
TokenType type; // What type of token this is.
char contents[TOKEN_LENGTH]; // The actual contents of the token.
unsigned length; // How long the token is.
TokenType type; // What type of token this is.
char contents[TOKEN_LENGTH]; // The actual contents of the token.
unsigned length; // How long the token is.
char source_file[TOKEN_LENGTH]; // The source file the token was in.
int line; // which line in the file the token was found
int column; // Where in that line the token was found
} Token;

0 comments on commit b4b8657

Please sign in to comment.