#pragma once #include "AST.h" #include "Scanner.h" #include "Program.h" #include "Terminal.h" class Parser { Program t_program; const bool is_interactive; std::vector<Token*> tokens; //Used to store extraneous, native types HashTable<std::string, ObjectType*> uncooked_types; /* Here's how this is gonna work: The Parser is going to semi-recursively iterate over the various grammar possibilities, based on the formal grammar that we have (which is loosely based on how our grammar description formats it) */ std::vector<Scanner::OperationPrecedence> lowest_ops; const Hashtable<BinaryExpression::bOps, Scanner::OperationPrecedence> bOp_to_precedence = { {BinaryExpression::bOps::Add,Scanner::OperationPrecedence::Term}, {BinaryExpression::bOps::Subtract,Scanner::OperationPrecedence::Term}, {BinaryExpression::bOps::Multiply,Scanner::OperationPrecedence::Factor}, {BinaryExpression::bOps::Divide,Scanner::OperationPrecedence::Factor}, // {BinaryExpression::bOps::FloorDivide,Scanner::OperationPrecedence::Factor}, {BinaryExpression::bOps::Exponent,Scanner::OperationPrecedence::Power}, {BinaryExpression::bOps::Modulo,Scanner::OperationPrecedence::Factor}, // {BinaryExpression::bOps::BitwiseAnd,Scanner::OperationPrecedence::Bitwise}, {BinaryExpression::bOps::BitwiseXor,Scanner::OperationPrecedence::Bitwise}, {BinaryExpression::bOps::BitwiseOr,Scanner::OperationPrecedence::Bitwise}, // {BinaryExpression::bOps::ShiftLeft,Scanner::OperationPrecedence::Bitwise}, {BinaryExpression::bOps::ShiftRight,Scanner::OperationPrecedence::Bitwise}, // {BinaryExpression::bOps::Concatenate,Scanner::OperationPrecedence::Concat}, // {BinaryExpression::bOps::LessThan,Scanner::OperationPrecedence::Comparison}, {BinaryExpression::bOps::LessEquals,Scanner::OperationPrecedence::Comparison}, {BinaryExpression::bOps::Greater,Scanner::OperationPrecedence::Comparison}, {BinaryExpression::bOps::GreaterEquals,Scanner::OperationPrecedence::Comparison}, {BinaryExpression::bOps::Equals,Scanner::OperationPrecedence::Comparison}, {BinaryExpression::bOps::NotEquals,Scanner::OperationPrecedence::Comparison}, // {BinaryExpression::bOps::LogicalAnd,Scanner::OperationPrecedence::Logical}, {BinaryExpression::bOps::LogicalOr,Scanner::OperationPrecedence::Logical}, {BinaryExpression::bOps::LogicalXor,Scanner::OperationPrecedence::Logical} }; enum class BlockType { Unknown, // ??? Function, While, If, For, Try, Catch }; int tokenheader; // Which Token # are we on? //This whole thing is one big Turing machine, reading a roll of tokens in sequence, so having a master counter is important to the iteration. //However, most functions have their own internal headers for subiteration, sometimes updating tokenheader on exit. //This is unfortunately necessary due to the recursive and multi-stroke nature of this Parser and its grammar. BinaryExpression::bOps readbOpOneChar (char* c) { switch (c[0]) { //This Switch tries to keep the binOps in the order in which they appear in JoaoGrammar.txt. case('+')://multichar needed return BinaryExpression::bOps::Add; break; case('-')://multichar needed return BinaryExpression::bOps::Subtract; break; case('*'): return BinaryExpression::bOps::Multiply; break; case('/')://multichar return BinaryExpression::bOps::Divide; break; case('^'): return BinaryExpression::bOps::Exponent; break; case('%'): return BinaryExpression::bOps::Modulo; break; case('&')://multichar needed return BinaryExpression::bOps::BitwiseAnd; break; case('~')://multichar needed return BinaryExpression::bOps::BitwiseXor; break; case('|')://multichar needed return BinaryExpression::bOps::BitwiseOr; break; case('>')://multichar needed return BinaryExpression::bOps::Greater; break; case('<')://multichar needed return BinaryExpression::bOps::LessThan; break; default: return BinaryExpression::bOps::NoOp; } return BinaryExpression::bOps::NoOp; } BinaryExpression::bOps readbOpTwoChar(char* c) { switch (c[1]) { //This Switch tries to keep the binOps in the order in which they appear in JoaoGrammar.txt. case('/'): return BinaryExpression::bOps::FloorDivide; break; case('&')://&& return BinaryExpression::bOps::LogicalAnd; break; case('~')://~~ return BinaryExpression::bOps::LogicalXor; break; case('|')://|| return BinaryExpression::bOps::LogicalOr; break; case('>')://>> return BinaryExpression::bOps::ShiftRight; break; case('<')://<< return BinaryExpression::bOps::ShiftLeft; break; case('='): switch (c[0]) { case('>'):// >= return BinaryExpression::bOps::GreaterEquals; case('<'):// <= return BinaryExpression::bOps::LessEquals; case('!'): // != return BinaryExpression::bOps::NotEquals; case('='): // == return BinaryExpression::bOps::Equals; default: return BinaryExpression::bOps::NoOp; } break; case('.'): return BinaryExpression::bOps::Concatenate; default: return BinaryExpression::bOps::NoOp; } return BinaryExpression::bOps::NoOp; } AssignmentStatement::aOps symbol_to_aOp(SymbolToken* st) { char* c = st->get_symbol(); if (st->len == 1) { if (c[0] == '=') { return AssignmentStatement::aOps::Assign; } else { return AssignmentStatement::aOps::NoOp; } } else { if (c[1] != '=') return AssignmentStatement::aOps::NoOp; switch (c[0]) { case('+'): return AssignmentStatement::aOps::AssignAdd; case('-'): return AssignmentStatement::aOps::AssignSubtract; case('*'): return AssignmentStatement::aOps::AssignMultiply; case('/'): return AssignmentStatement::aOps::AssignDivide; default: return AssignmentStatement::aOps::NoOp; } } return AssignmentStatement::aOps::NoOp; } UnaryExpression::uOps symbol_to_uOp(SymbolToken* st) { if (st->len != 1) return UnaryExpression::uOps::NoOp; switch (st->get_symbol()[0]) { case('!'): return UnaryExpression::uOps::Not; case('-'): return UnaryExpression::uOps::Negate; case('#'): return UnaryExpression::uOps::Length; case('~'): return UnaryExpression::uOps::BitwiseNot; default: return UnaryExpression::uOps::NoOp; } } ASTNode* readlvalue(int,int); ASTNode* readUnary(int, int); ASTNode* readPower(int, int); ASTNode* readBinExp(Scanner::OperationPrecedence,int,int); ASTNode* readExp(int,int); //Here-there-update; does not update if no last bracket found std::vector<Expression*> readBlock(BlockType, int, int); //This one is a little weird: It's a where-there-update. FIXME: Make it not >:( Expression* readStatement(BlockType, int&, int); //Here-there-update; does not update if no last bracket found std::vector<LocalAssignmentStatement*> readClassDef(int, int); //Reads a strongly-expected LocalAssignment (of form, "Value x = 3" or whatever). Does not consume a semicolon. LocalAssignmentStatement* readLocalAssignment(int, int); AssignmentStatement::aOps readaOp(int here = 0, bool loud = true) { #ifdef LOUD_TOKENHEADER std::cout << "readaOp starting at " << std::to_string(tokenheader) << std::endl; #endif Token* t; if (here) t = tokens[here]; else t = tokens[tokenheader]; if (t->class_enum() != Token::cEnum::SymbolToken) { if(loud) ParserError(t, "Unexpected Token when aOp was expected!"); return AssignmentStatement::aOps::NoOp; } AssignmentStatement::aOps auhp = symbol_to_aOp(static_cast<SymbolToken*>(t)); if (auhp == AssignmentStatement::aOps::NoOp) ParserError(t, "Unexpected token when assignment operator was expected!"); if(!here) ++tokenheader; #ifdef LOUD_TOKENHEADER std::cout << "readaOp setting tokenheader to " << std::to_string(tokenheader) << std::endl; #endif return auhp; } BinaryExpression::bOps readbOp(SymbolToken* st) { char* symbol = st->get_symbol(); if (st->len == 1) return readbOpOneChar(symbol); else return readbOpTwoChar(symbol); return BinaryExpression::bOps::NoOp; } //Attempts to find a func access. All instances of it are optional; so just quietly returns its var_access arg upon failure. //Here-there-update; here is the token directly following var_access //func_access ::= (property | element)['(' explist ')'][func_access] void readFuncAccess(ASTNode*& var_access, int here, int there) { int where = here; for (; where <= there; ++where) // This is all here to handle repetitive Member and Index accesses. //TODO: Make this for-loop a discrete function since it's called elsewhere as well { Token* propeller = tokens[where]; // PROPerty or ELLERment. I guess. Shut up. switch (propeller->class_enum()) { case(Token::cEnum::MemberToken): { ++where; //return new MemberAccess(scoped_access, readVarAccess(tokenheader, there)); // Doing just this would end up being right-associative, which for our purposes would be annoying to deal with interpreter-side. //So we're going to do something else. if (propeller->class_enum() != Token::cEnum::WordToken) { ParserError(propeller, "Unexpected Token when reading MemberAccess!"); } var_access = new MemberAccess(var_access, new Identifier(static_cast<WordToken*>(tokens[where])->word)); continue; } case(Token::cEnum::PairSymbolToken): { PairSymbolToken pst = *static_cast<PairSymbolToken*>(propeller); if (pst.t_pOp == PairSymbolToken::pairOp::Bracket) { int yonder = find_closing_pairlet(PairSymbolToken::pairOp::Bracket, where + 1); var_access = new IndexAccess(var_access, readExp(where + 1, yonder - 1)); where = yonder; continue; } //If it's a parenthesis then we're probably about to do a function call but, that's none of *our* business in readVarAccess() so just return tokenheader = where; return; } default: // I dunno what this is, just return what you have and hope the higher stacks know what it is tokenheader = where; return; } } //A fluke of having the init statement be above-scope of the for-loop like this is that //it actually increment once *past* what the condition is. //like if the condition were "where < 5" then it'd be 5 here. //So setting the tokenheader to where is valid here; that is genuinely the next token past what we parsed //since we *ought* to be in the case that we genuinely consumed all the tokens from here to there. tokenheader = where; return; } // VAR_ACCESS ::= // scoped_access | var_access property | var_access element // scoped_access ::= './' Name | {'../'}'../' Name | '/' Name | Name // property ::= {'.' Name } // element ::= {'[' exp ']'} // // HERE-THERE-UPDATE! ASTNode* readVarAccess( int here, int there) { #ifdef LOUD_TOKENHEADER std::cout << "readVarAccess starting at " << std::to_string(here) << std::endl; #endif ASTNode* scoped_access = nullptr; Token* t = tokens[here]; switch (t->class_enum()) { case(Token::cEnum::DirectoryToken): if (static_cast<DirectoryToken*>(t)->dir != "/" + Directory::lastword(static_cast<DirectoryToken*>(t)->dir)) ParserError(t, "Unexpected directory when variable access expected!"); scoped_access = new GlobalAccess(Directory::lastword(static_cast<DirectoryToken*>(t)->dir)); tokenheader = here + 1; break; case(Token::cEnum::WordToken): // Name scoped_access = new Identifier(static_cast<WordToken*>(t)->word); tokenheader = here + 1; break; case(Token::cEnum::ParentToken): { if (there == here || tokens[here + 1]->class_enum() != Token::cEnum::WordToken) { //ParserError(t, "ParentToken found with no corresponding Name!"); scoped_access = new ParentGet(); tokenheader = here + 1; } else { scoped_access = new ParentAccess(static_cast<WordToken*>(tokens[here + 1])->word); tokenheader = here + 2; } break; } case(Token::cEnum::GrandparentToken): { int depth = 1; for(int where = here +1; where <= there; ++where) { Token* tuk = tokens[where]; // KARH EN TUK if (tuk->class_enum() == Token::cEnum::GrandparentToken) { ++depth; } else if (tuk->class_enum() == Token::cEnum::WordToken) { scoped_access = new GrandparentAccess(depth, static_cast<WordToken*>(tuk)->word); tokenheader = where + 1; break; } else { ParserError(tuk, "Unexpected Token while reading GrandparentAccess!"); } } if(!scoped_access) ParserError(t, "GrandparentToken found with no corresponding Name!"); break; } default: ParserError(t, "Unexpected Token while reading scoped_access in readVarAccess()!"); } if (tokenheader+1 > there) // If we can't access at least 2 more tokens return scoped_access; // Just return the scoped_access find. //Now lets check for property or element int where = tokenheader; for(; where <= there; ++where) // This is all here to handle repetitive Member and Index accesses. { Token* propeller = tokens.at(where); // PROPerty or ELLERment. I guess. Shut up. switch (propeller->class_enum()) { case(Token::cEnum::MemberToken): { ++where; //return new MemberAccess(scoped_access, readVarAccess(tokenheader, there)); // Doing just this would end up being right-associative, which for our purposes would be annoying to deal with interpreter-side. //So we're going to do something else. if(tokens.size() <= static_cast<size_t>(where)) UNLIKELY // why you gotta be daft like this mang { ParserError(tokens[where-1], "Unexpected EOF when reading MemberAccess!"); return scoped_access; // I guess idfk } if (tokens[where]->class_enum() != Token::cEnum::WordToken) { ParserError(tokens[where], "Unexpected Token when reading MemberAccess!"); } scoped_access = new MemberAccess(scoped_access, new Identifier(static_cast<WordToken*>(tokens[where])->word)); continue; } case(Token::cEnum::PairSymbolToken): { PairSymbolToken pst = *static_cast<PairSymbolToken*>(propeller); if (pst.t_pOp == PairSymbolToken::pairOp::Bracket) { int yonder = find_closing_pairlet(PairSymbolToken::pairOp::Bracket, where + 1); scoped_access = new IndexAccess(scoped_access, readExp(where + 1, yonder - 1)); where = yonder; continue; } //If it's a parenthesis then we're probably about to do a function call but, that's none of *our* business in readVarAccess() so just return --where; // Make sure that hypothetical paren handler can actually see the paren goto ACCESS_END; } default: // I dunno what this is, just return what you have and hope the higher stacks know what it is --where; goto ACCESS_END; } } ACCESS_END: if (where > there) // FIXME: Stop this from happening tokenheader = there + 1; else tokenheader = where + 1; return scoped_access; } /* VARSTAT: there's four ways of doing varstats: in the Local scope, Object Scope, or Global Scope. This block implements the last three. Value x = 3; ## Set local variable to 3 /x = 3; ## Set global variable to 3 ./x = 3; ## Set property of object we're in called x to 3 x = 3; ## Ambiguous, sets lowest-scoped x available to 3 */ //Updates tokenheader via readExp(). AssignmentStatement* readAssignmentStatement(int here, int there) { ASTNode* id = readVarAccess(here, there); AssignmentStatement::aOps aesop = readaOp(); ASTNode* rvalue = readExp(tokenheader, there); return new AssignmentStatement(id, rvalue, aesop); } //A helper function for readArgs() that shouldn't really be called by anyone else int find_comma(int here, int there) { for (int where = here; where <= there; ++where) { switch (tokens[where]->class_enum()) { case(Token::cEnum::CommaToken): return where; case(Token::cEnum::PairSymbolToken): // This is function being called within this function call, I guess? Or perhaps a tableconstructor being used as a parameter. //Regardless, we need to skip over it, as it may contain comma tokens that aren't for us. where = find_closing_pairlet(static_cast<PairSymbolToken*>(tokens[where])->t_pOp, where + 1); break; default: continue; } } return 0; // Safe because it is impossible for the first token of a valid Jo�o program to be a CommaToken } //Here-there-update; //assumes here < there std::vector<ASTNode*> readArgs(int here, int there) { #ifdef LOUD_TOKENHEADER std::cout << "readVarAccess starting at " << std::to_string(here) << std::endl; #endif int comma = find_comma(here, there); if (!comma) // If comman't return { readExp(here,there) }; //exp {, exp} std::vector<ASTNode*> args{ readExp(here,comma - 1) }; tokenheader = here; while (comma) { int newcomma = find_comma(comma + 1, there); int yonder = (newcomma ? newcomma - 1 : there); args.push_back(readExp(comma + 1, yonder)); comma = newcomma; } return args; } //Updates tokenheader to be one token ahead of $here. ImmutableString readName(int here) { #ifdef LOUD_TOKENHEADER std::cout << "readName starting at " << std::to_string(here) << std::endl; #endif Token* t = tokens[here]; if (t->class_enum() != Token::cEnum::WordToken) { ParserError(t, "Unexpected token found when WordToken was expected for Name!"); } tokenheader = here + 1; #ifdef LOUD_TOKENHEADER std::cout << "readName setting tokenheader to " << std::to_string(tokenheader) << std::endl; #endif return static_cast<WordToken*>(t)->word; } // If no args, assumes tokenheader is pointing where it should. Increments tokenheader when given no args. void consume_semicolon(Token* t = nullptr) { if (!t) { t = tokens[tokenheader]; ++tokenheader; } switch (t->class_enum()) // This could be an if-statement but I've been switching like this everywhere to the point it feels like poor style to not do it here { case(Token::cEnum::EndLineToken): return; default: ParserError(t, "Unexpected Token while attempting to consume EndLineToken!"); } } void consume_paren(bool open, Token* t = nullptr) { if (!t) { t = tokens[tokenheader]; ++tokenheader; } switch (t->class_enum()) // This could be an if-statement but I've been switching like this everywhere to the point it feels like poor style to not do it here { case(Token::cEnum::PairSymbolToken): { PairSymbolToken pst = *static_cast<PairSymbolToken*>(t); if (pst.is_start != open || pst.t_pOp != PairSymbolToken::pairOp::Paren) { if(open) ParserError(t, "Unexpected PairSymbolToken while attempting to consume '('!"); else ParserError(t, "Unexpected PairSymbolToken while attempting to consume ')'!"); } return; } default: if (open) ParserError(t, "Unexpected Token while attempting to consume '('!"); else ParserError(t, "Unexpected Token while attempting to consume ')'!"); } } void consume_open_brace(int here) { Token* t = tokens[here]; if (t->class_enum() != Token::cEnum::PairSymbolToken) { ParserError(t, "Unexpected token where open-brace was expected!"); } PairSymbolToken pt = *static_cast<PairSymbolToken*>(t); if (!pt.is_start || pt.t_pOp != PairSymbolToken::pairOp::Brace) { ParserError(t, "Unexpected token where open-brace was expected!"); } } //Parsetimes if it cannot find the semicolon. size_t get_first_semicolon(int here, int there) { for (int where = here; where <= there; ++where) { Token* t = tokens[where]; if (t->class_enum() == Token::cEnum::EndLineToken) return where; } ParserError(tokens[here], "Failed to find expected semicolon!"); return tokens.size() - 1; } //Does not parsetime; used for lookahead. size_t find_first_semicolon(int here, int there) { for (int where = here; where <= there; ++where) { Token* t = tokens[where]; if (t->class_enum() == Token::cEnum::EndLineToken) return where; } return 0; // Actually works since it is impossible for the first token to be a semicolon in a valid way, weirdly enough } //Does not update tokenheader, and so is a "wanderer" sorta function. Finds the closing pairlet token of the type desired. int find_closing_pairlet(PairSymbolToken::pairOp pop, int here) { int count = 1; for (size_t where = here; where < tokens.size(); ++where) { Token* t = tokens[where]; if (t->class_enum() != Token::cEnum::PairSymbolToken) continue; PairSymbolToken* pst = static_cast<PairSymbolToken*>(t); if (pst->t_pOp != pop) continue; if (pst->is_start) // We'll also now need to find the closer for this one { ++count; continue; } --count; if (!count) { //std::cout << "I return " << std::to_string(where); return static_cast<int>(where); } } ParserError(tokens[static_cast<size_t>(here)-1], "Unable to find closing pairlet for this open pairlet!"); return 0; } int find_aOp(int here, int there) { for (int where = here; where <= there; ++where) { Token* t = tokens[where]; if (t->class_enum() != Token::cEnum::SymbolToken) continue; if (symbol_to_aOp(static_cast<SymbolToken*>(t)) != AssignmentStatement::aOps::NoOp) { return where; } } return 0; } protected: //Does the nitty-gritty of filling out the function and objecttype tables. void generate_object_tree(std::vector<ClassDefinition*>&); public: // Parser doesn't have much of an API but it does have something Parser(Scanner&t) :is_interactive(t.is_interactive) ,tokens(std::move(t.tokens)) // This steals all the tokens from Scanner. Now, we are the ones responsible for deleting all these token pointers later on. ,lowest_ops(std::move(t.lowest_ops)) { } ~Parser() { for(Token* t_ptr : tokens) { delete t_ptr; } } void ParserError(Token* t, std::string what) { //This is just a basic setup while everything else is fleshed out. Terminal::SetColor(std::cerr, Terminal::Color::Red); Terminal::SetBold(std::cerr, true); std::cerr << "Parser Error: "; Terminal::ClearFormatting(std::cerr); std::cerr << what << "\n"; if (t) std::cerr << t->dump(); std::cerr << std::endl; // This is an emscripten thing. We need to make sure this is flushed. if(is_interactive) { t_program.is_malformed = true; throw error::parser(what); } #ifdef JOAO_SAFE throw error::parser(what); #else exit(1); #endif } Program parse(); //Used for interactive mode and other similar contexts. Function* try_parse_function(); bool is_statement(); ASTNode* parse_repl_expression(Program&); //Allows outside programs to include extra "native" types. void IncludeAlienType(ObjectType* ot); };