Hack in new register syntax

Oh my god I want to die x_x
gbdev · Apr 1, 2022 · cd454d2 · cd454d2
1 parent c814a61
commit cd454d2
Show file tree

Hide file tree

Showing 3 changed files with 74 additions and 39 deletions.
diff --git a/src/asm/lexer.c b/src/asm/lexer.c
@@ -148,24 +148,29 @@ static struct KeywordMapping {
 	{"NZ", T_CC_NZ},
 	{"Z", T_CC_Z},
 	{"NC", T_CC_NC},
-	/* Handled after as T_TOKEN_C */
-	/* { "C", T_CC_C }, */
+	{"C", T_CC_C},
 
-	{"AF", T_MODE_AF},
-	{"BC", T_MODE_BC},
-	{"DE", T_MODE_DE},
-	{"HL", T_MODE_HL},
+	{"•̀A•́)𝓕𝓾𝓬𝓴", T_MODE_AF},
+	// {"BC", T_MODE_BC},
+	// {"DE", T_MODE_DE},
+	{"н∠(", T_MODE_HL_START},
 	{"SP", T_MODE_SP},
-	{"HLD", T_MODE_HL_DEC},
-	{"HLI", T_MODE_HL_INC},
-
-	{"A", T_TOKEN_A},
-	{"B", T_TOKEN_B},
-	{"C", T_TOKEN_C},
-	{"D", T_TOKEN_D},
-	{"E", T_TOKEN_E},
-	{"H", T_TOKEN_H},
-	{"L", T_TOKEN_L},
+	{"н∠( ᐛ 」∠)＿👁", T_MODE_HL_DEC},
+	{"н∠( ᐛ 」∠)＿👎", T_MODE_HL_INC},
+
+	// HACK: the full register is spelled "( •̀A•́)", but matching the parens here is annoying
+	// to special-case, so the parser supplies them (T_LPAREN T_TOKEN_A T_RPAREN).
+	{"•̀A•́", T_TOKEN_A},
+	// {"=B", T_TOKEN_B}, HACK: This begins with a non-identifier character, so we'll cheat: the '=' case in yylex_NORMAL peeks for 'B'/'b'
+	{"♥(˘⌣˘", T_TOKEN_C}, // HACK: same for "C" after the space & closing paren
+	// {";D", T_TOKEN_D}, HACK: also special-cased, in the ';' case of yylex_NORMAL (checked before the comment is discarded). God I feel dirty.
+	{"(´ε｀", T_TOKEN_E},
+	{"♡", T_TOKEN_E_HEART},
+	{"н", T_TOKEN_H},
+	{"∠(", T_TOKEN_L_ARM},
+	{"ᐛ", T_TOKEN_L_FACE},
+	{"」∠", T_TOKEN_L_BODY},
+	{"＿", T_TOKEN_L_LEG},
 
 	{"DEF", T_OP_DEF},
 
@@ -578,16 +583,16 @@ struct KeywordDictNode {
 	 * In turn, this allows greatly simplifying checking an index into this array,
 	 * which should help speed up the lexer.
 	 */
-	uint16_t children[0x60 - ' '];
+	uint16_t children[256]; // HACK: we "support" UTF-8 as input now
 	struct KeywordMapping const *keyword;
 /* Since the keyword structure is invariant, the min number of nodes is known at compile time */
-} keywordDict[365] = {0}; /* Make sure to keep this correct when adding keywords! */
+} keywordDict[690] = {0}; /* Nice */
 
 /* Convert a char into its index into the dict */
 static uint8_t dictIndex(char c)
 {
 	/* Translate uppercase to lowercase (roughly) */
-	if (c > 0x60)
+	if (c > 0x60 && c < 0x80)
 		c = c - ('a' - 'A');
 	return c - ' ';
 }
@@ -609,8 +614,9 @@ void lexer_Init(void)
 
 		/* Walk the dictionary, creating intermediate nodes for the keyword */
 		for (char const *ptr = keywords[i].name; *ptr; ptr++) {
+			unsigned char index = (unsigned char)*ptr - ' ';
 			/* We should be able to assume all entries are well-formed */
-			if (keywordDict[nodeID].children[*ptr - ' '] == 0) {
+			if (keywordDict[nodeID].children[index] == 0) {
 				/*
 				 * If this gets tripped up, set the size of keywordDict to
 				 * something high, compile with `-DPRINT_NODE_COUNT` (see below),
@@ -619,10 +625,10 @@ void lexer_Init(void)
 				assert(usedNodes < sizeof(keywordDict) / sizeof(*keywordDict));
 
 				/* There is no node at that location, grab one from the pool */
-				keywordDict[nodeID].children[*ptr - ' '] = usedNodes;
+				keywordDict[nodeID].children[index] = usedNodes;
 				usedNodes++;
 			}
-			nodeID = keywordDict[nodeID].children[*ptr - ' '];
+			nodeID = keywordDict[nodeID].children[index];
 		}
 
 		/* This assumes that no two keywords have the same name */
@@ -1289,11 +1295,15 @@ static uint32_t readGfxConstant(void)
 static bool startsIdentifier(int c)
 {
 	// Anonymous labels internally start with '!'
-	return (c <= 'Z' && c >= 'A') || (c <= 'z' && c >= 'a') || c == '.' || c == '_';
+	return (c <= 'Z' && c >= 'A') || (c <= 'z' && c >= 'a') || c == '.' || c == '_' || c >= 0x80 || c == '(';
 }
 
 static bool continuesIdentifier(int c)
 {
+	// April Fools HACK: allow UTF-8 :D (startsIdentifier accepts every byte >= 0x80)
+	// This would normally be quite unsafe (hello, RTL control codes?),
+	// but since this is for a joke I'll also make the code a joke
+	// Also, hi if you're reading this!
 	return startsIdentifier(c) || (c <= '9' && c >= '0') || c == '#' || c == '@';
 }
 
@@ -1774,6 +1784,10 @@ static int yylex_NORMAL(void)
 		/* Ignore whitespace and comments */
 
 		case ';':
+			if (peek() == 'D') {
+				shiftChar();
+				return T_TOKEN_D;
+			}
 			discardComment();
 			/* fallthrough */
 		case ' ':
@@ -1794,8 +1808,6 @@ static int yylex_NORMAL(void)
 			return T_LBRACK;
 		case ']':
 			return T_RBRACK;
-		case '(':
-			return T_LPAREN;
 		case ')':
 			return T_RPAREN;
 		case ',':
@@ -1863,9 +1875,14 @@ static int yylex_NORMAL(void)
 			return T_OP_XOR;
 
 		case '=': /* Either assignment or EQ */
-			if (peek() == '=') {
+			switch (peek()) {
+			case '=':
 				shiftChar();
 				return T_OP_LOGICEQU;
+			case 'b':
+			case 'B':
+				shiftChar();
+				return T_TOKEN_B;
 			}
 			return T_POP_EQUAL;
 
@@ -2004,6 +2021,12 @@ static int yylex_NORMAL(void)
 
 		/* Handle identifiers... or report garbage characters */
 
+		case '(':
+			if (peek() != (unsigned char)"´"[0]) {
+				return T_LPAREN;
+			}
+			// fallthrough
+
 		default:
 			if (startsIdentifier(c)) {
 				int tokenType = readIdentifier(c);

diff --git a/src/asm/main.c b/src/asm/main.c
@@ -142,6 +142,9 @@ static void print_usage(void)
 
 int main(int argc, char *argv[])
 {
+	#if YYDEBUG
+	yydebug = 1;
+	#endif
 	int ch;
 	char *ep;
 

diff --git a/src/asm/parser.y b/src/asm/parser.y
@@ -664,13 +664,13 @@ enum {
 %token	T_Z80_SWAP "swap"
 %token	T_Z80_XOR "xor"
 
-%token	T_TOKEN_A "a"
-%token	T_TOKEN_B "b" T_TOKEN_C "c"
-%token	T_TOKEN_D "d" T_TOKEN_E "e"
-%token	T_TOKEN_H "h" T_TOKEN_L "l"
-%token	T_MODE_AF "af" T_MODE_BC "bc" T_MODE_DE "de" T_MODE_SP "sp"
-%token	T_MODE_HL "hl" T_MODE_HL_DEC "hld/hl-" T_MODE_HL_INC "hli/hl+"
-%token	T_CC_NZ "nz" T_CC_Z "z" T_CC_NC "nc" // There is no T_CC_C, only T_TOKEN_C
+%token	T_TOKEN_A "( •̀A•́)" T_TOKEN_F "𝓕𝓾𝓬𝓴"
+%token	T_TOKEN_B "=B" T_TOKEN_C "♥(˘⌣˘ C)"
+%token	T_TOKEN_D ";D" T_TOKEN_E "(´ε｀ )♡" T_TOKEN_E_HEART "(´ε｀ )♡"
+%token	T_TOKEN_H "н" T_TOKEN_L_ARM "∠( ᐛ 」∠)＿" T_TOKEN_L_FACE "∠( ᐛ 」∠)＿" T_TOKEN_L_BODY "∠( ᐛ 」∠)＿" T_TOKEN_L_LEG "∠( ᐛ 」∠)＿"
+%token	T_MODE_AF "af" /* T_MODE_BC "bc" T_MODE_DE "de" */ T_MODE_SP "sp"
+%token	T_MODE_HL_START "н∠( ᐛ 」∠)＿" T_MODE_HL_DEC "hld/hl-" T_MODE_HL_INC "hli/hl+"
+%token	T_CC_NZ "nz" T_CC_Z "z" T_CC_NC "nc" T_CC_C "c"
 
 %type	<constValue>	reg_r
 %type	<constValue>	reg_ss
@@ -2177,34 +2177,43 @@ op_a_n		: reloc_8bit
 		| T_MODE_A T_COMMA reloc_8bit { $$ = $3; }
 ;
 
-T_MODE_A	: T_TOKEN_A
+T_MODE_A	: T_LPAREN T_TOKEN_A T_RPAREN
 		| T_OP_HIGH T_LPAREN T_MODE_AF T_RPAREN
 ;
 
 T_MODE_B	: T_TOKEN_B
 		| T_OP_HIGH T_LPAREN T_MODE_BC T_RPAREN
 ;
 
-T_MODE_C	: T_TOKEN_C
+T_MODE_C	: T_TOKEN_C T_CC_C T_RPAREN
 		| T_OP_LOW T_LPAREN T_MODE_BC T_RPAREN
 ;
 
 T_MODE_D	: T_TOKEN_D
 		| T_OP_HIGH T_LPAREN T_MODE_DE T_RPAREN
 ;
 
-T_MODE_E	: T_TOKEN_E
+T_MODE_E	: T_TOKEN_E T_RPAREN T_TOKEN_E_HEART
 		| T_OP_LOW T_LPAREN T_MODE_DE T_RPAREN
 ;
 
 T_MODE_H	: T_TOKEN_H
 		| T_OP_HIGH T_LPAREN T_MODE_HL T_RPAREN
 ;
 
-T_MODE_L	: T_TOKEN_L
+T_MODE_L	: T_TOKEN_L_ARM T_TOKEN_L_FACE T_TOKEN_L_BODY T_RPAREN T_TOKEN_L_LEG
 		| T_OP_LOW T_LPAREN T_MODE_HL T_RPAREN
 ;
 
+T_MODE_BC	: T_TOKEN_B T_TOKEN_C T_CC_C T_RPAREN
+;
+
+T_MODE_DE	: T_TOKEN_D T_TOKEN_E T_RPAREN T_TOKEN_E_HEART
+;
+
+T_MODE_HL	: T_MODE_HL_START T_TOKEN_L_FACE T_TOKEN_L_BODY T_RPAREN T_TOKEN_L_LEG
+;
+
 ccode_expr	: ccode
 		| T_OP_LOGICNOT ccode_expr {
 			$$ = $2 ^ 1;
@@ -2214,7 +2223,7 @@ ccode_expr	: ccode
 ccode		: T_CC_NZ { $$ = CC_NZ; }
 		| T_CC_Z { $$ = CC_Z; }
 		| T_CC_NC { $$ = CC_NC; }
-		| T_TOKEN_C { $$ = CC_C; }
+		| T_CC_C { $$ = CC_C; }
 ;
 
 reg_r		: T_MODE_B { $$ = REG_B; }
@@ -2230,7 +2239,7 @@ reg_r		: T_MODE_B { $$ = REG_B; }
 reg_tt		: T_MODE_BC { $$ = REG_BC; }
 		| T_MODE_DE { $$ = REG_DE; }
 		| T_MODE_HL { $$ = REG_HL; }
-		| T_MODE_AF { $$ = REG_AF; }
+		| T_LPAREN T_TOKEN_A T_RPAREN T_TOKEN_F { $$ = REG_AF; }
 ;
 
 reg_ss		: T_MODE_BC { $$ = REG_BC; }