Commit d1470f7
created base tokenizer functionality
EthanThatOneKid committed Nov 24, 2021
1 parent b0540c3 commit d1470f7
Showing 5 changed files with 168 additions and 17 deletions.
23 changes: 14 additions & 9 deletions lib/tokenize/lexicon.ts
@@ -14,17 +14,21 @@ export enum Lexicon {
   Comment,
   CommentOpener,
   CommentCloser,
+  Whitespace,
   Unknown,
   EOF,
 }

-export const LEXICON = new Map<Lexicon, string | string[] | null>([
+export const LEXICON: ReadonlyMap<Lexicon, string | string[] | null> = new Map<
+  Lexicon,
+  string | string[] | null
+>([
   [Lexicon.Identifier, null],
   [Lexicon.StructOpener, "{"],
   [Lexicon.StructCloser, "}"],
   [Lexicon.TupleOpener, "("],
   [Lexicon.TupleCloser, ")"],
-  [Lexicon.TypeDefiner, ["type", "struct", "interface"]],
+  [Lexicon.TypeDefiner, ["type", "spec"]],
   [Lexicon.PropertyDefiner, ":"],
   [Lexicon.PropertyOptionalMarker, "?"],
   [Lexicon.PropertyOptionalDefiner, "?:"],
@@ -34,17 +38,18 @@ export const LEXICON = new Map<Lexicon, string | string[] | null>([
   [Lexicon.Comment, [";", "//"]],
   [Lexicon.CommentOpener, "/*"],
   [Lexicon.CommentCloser, "*/"],
+  [Lexicon.Whitespace, " "],
   [Lexicon.Unknown, null],
-  [Lexicon.EOF, null],
+  [Lexicon.EOF, "\n"],
 ]);

-// freezing LEXICON map into place, courtesy of https://stackoverflow.com/a/35776333
-LEXICON.set = function (key) {
-  throw new Error("Can't add property " + key + ", map is not extensible");
+// force-freezing LEXICON map into place, courtesy of https://stackoverflow.com/a/35776333
+(LEXICON as Map<unknown, unknown>).set = function (key) {
+  throw new Error(`Can't add property ${key}, map is not extensible`);
 };
-LEXICON.delete = function (key) {
-  throw new Error("Can't delete property " + key + ", map is frozen");
+(LEXICON as Map<unknown, unknown>).delete = function (key) {
+  throw new Error(`Can't delete property ${key}, map is frozen`);
 };
-LEXICON.clear = function () {
+(LEXICON as Map<unknown, unknown>).clear = function () {
   throw new Error("Can't clear map, map is frozen");
 };
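
For context on the freeze trick above: LEXICON is now exported as a ReadonlyMap, so the mutators disappear at compile time, and the overridden set/delete/clear make mutation fail at runtime as well. A minimal sketch of what a caller now sees (import path as in this commit; the exact error text comes from the overrides above):

import { LEXICON, Lexicon } from "./lexicon.ts";

// Reads behave as before.
console.log(LEXICON.get(Lexicon.StructOpener)); // "{"

// Writes throw; the cast is needed because ReadonlyMap hides `set`.
try {
  (LEXICON as Map<unknown, unknown>).set(Lexicon.Unknown, "?");
} catch (error) {
  console.log((error as Error).message); // Can't add property ..., map is not extensible
}
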
4 changes: 2 additions & 2 deletions lib/tokenize/token.ts
@@ -1,4 +1,4 @@
-import { Lexicon } from "./lexicon.ts";
+import { LEXICON, Lexicon } from "./lexicon.ts";
 import {
   checkIsIdentifier,
   checkIsTextLiteral,
@@ -38,7 +38,7 @@ export class Token {
   }

   static getKindOf(raw: string): Lexicon {
-    const matchingKind = findInLexicon(raw);
+    const matchingKind = findInLexicon(raw, LEXICON);
     if (matchingKind !== null) return matchingKind;
     if (checkIsIdentifier(raw)) return Lexicon.Identifier;
     if (checkIsTextLiteral(raw)) return Lexicon.TextLiteral;
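
The behavioral change here is small: Token.getKindOf now passes the default LEXICON to findInLexicon explicitly instead of relying on a closed-over global. A usage sketch (the Identifier case assumes checkIsIdentifier accepts plain names, per the fallbacks in the hunk above):

import { Token } from "./token.ts";
import { Lexicon } from "./lexicon.ts";

console.log(Token.getKindOf("{") === Lexicon.StructOpener); // true
console.log(Token.getKindOf("type") === Lexicon.TypeDefiner); // true
console.log(Token.getKindOf("Example") === Lexicon.Identifier); // true (fallback)
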
37 changes: 36 additions & 1 deletion lib/tokenize/tokenize.test.ts
@@ -1 +1,36 @@
-// https://github.com/EthanThatOneKid/fart/blob/main/lib/tokenize/tokenize.test.ts
+import { T } from "./t.ts";
+import { Token } from "./token.ts";
+import { tokenize } from "./tokenize.ts";
+import { assertEquals } from "../../deps/std/testing.ts";
+// import { LEXICON, Lexicon } from "./lexicon.ts";
+
+Deno.test("yields no tokens given an empty string", () => {
+  const input = "";
+  const expectation: Token[] = [];
+  const reality = [...tokenize(input)];
+  assertEquals(expectation, reality);
+});
+
+Deno.test("yields a single token `type`", () => {
+  const input = "type";
+  const expectation = [T.type(1, 1)];
+  const reality = [...tokenize(input)];
+  assertEquals(expectation, reality);
+});
+
+Deno.test("yields a full `type` definition", () => {
+  const input = `type Example {
+  testProperty: string
+}`;
+  const expectation = [
+    T.type(1, 1),
+    T.id("Example", 1, 6),
+    T.nest(1, 14),
+    T.id("testProperty", 2, 3),
+    T.setter_1(2, 15),
+    T.id("string", 2, 17),
+    T.denest(3, 1),
+  ];
+  const reality = [...tokenize(input)];
+  assertEquals(expectation, reality);
+});
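
The T helper comes from ./t.ts, which is not part of this diff; judging by the assertions, T.type(line, column), T.id(name, line, column), T.nest, T.denest, and T.setter_1 look like shorthand constructors for positioned tokens. Under that assumption, the single-token test above is equivalent to this unsugared version, runnable with `deno test lib/tokenize/`:

import { Token } from "./token.ts";
import { tokenize } from "./tokenize.ts";
import { assertEquals } from "../../deps/std/testing.ts";

Deno.test("yields a single token `type` (no T shorthand)", () => {
  const expectation = [new Token("type", 1, 1)];
  assertEquals(expectation, [...tokenize("type")]);
});
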
110 changes: 109 additions & 1 deletion lib/tokenize/tokenize.ts
@@ -1 +1,109 @@
-// https://github.com/EthanThatOneKid/fart/blob/main/lib/tokenize/tokenize.ts
+import { LEXICON, Lexicon } from "./lexicon.ts";
+import { Token } from "./token.ts";
+import { findInLexicon } from "./utils.ts";
+
+/**
+ * Object used to memoize the process of properly tokenizing
+ * Fart syntax.
+ */
+interface TokenizationState {
+  char: null | string;
+  prevChar: null | string;
+  substr: string;
+  prevSubstr: string;
+  line: number;
+  column: number;
+  yieldingChar: boolean; // if true, yields character as token at end of iteration
+  yieldingSubstr: boolean; // if true, yields substring as token at end of iteration
+  breakingLine: boolean; // if true, updates line and column counts at end of iteration
+}
+
+type FartTokenGenerator = Generator<Token, undefined, undefined | Token>;
+
+const INITIAL_TOKENIZATION_STATE: Readonly<TokenizationState> = Object.freeze({
+  char: null,
+  prevChar: null,
+  substr: "",
+  prevSubstr: "",
+  line: 1,
+  column: 1,
+  yieldingChar: false,
+  yieldingSubstr: false,
+  breakingLine: false,
+});
+
+export function* tokenize(
+  input: string,
+  lex: ReadonlyMap<Lexicon, string | string[] | null> = LEXICON,
+): FartTokenGenerator {
+  const memo = { ...INITIAL_TOKENIZATION_STATE };
+
+  while (input.length > 0) {
+    memo.char = input[0];
+    memo.yieldingChar = INITIAL_TOKENIZATION_STATE.yieldingChar;
+    memo.yieldingSubstr = INITIAL_TOKENIZATION_STATE.yieldingSubstr;
+    memo.breakingLine = INITIAL_TOKENIZATION_STATE.breakingLine;
+
+    switch (findInLexicon(memo.char, lex)) {
+      // when a line break occurs, increment the line count, set the column
+      // back to its initial value, and yield the current substring as a token.
+      case Lexicon.EOF: {
+        memo.breakingLine = true;
+        memo.yieldingSubstr = true;
+        break;
+      }
+      case Lexicon.StructOpener:
+      case Lexicon.StructCloser:
+      case Lexicon.TupleOpener:
+      case Lexicon.TupleCloser:
+      case Lexicon.PropertyDefiner: {
+        memo.yieldingChar = true;
+        memo.yieldingSubstr = true;
+        break;
+      }
+      case Lexicon.PropertyOptionalMarker:
+      case Lexicon.Whitespace: {
+        memo.yieldingSubstr = true;
+        break;
+      }
+      default: {
+        memo.substr += memo.char;
+        break;
+      }
+    }
+
+    // yield and reset the substring if it is due to be yielded
+    if (memo.yieldingSubstr && memo.substr.length > 0) {
+      yield new Token(memo.substr, memo.line, memo.column - memo.substr.length);
+      memo.prevSubstr = memo.substr;
+      memo.substr = INITIAL_TOKENIZATION_STATE.substr;
+    }
+
+    // if the current character is to be yielded, it must be yielded
+    // _after_ the substring.
+    if (memo.yieldingChar && memo.char !== null) {
+      yield new Token(memo.char, memo.line, memo.column);
+    }
+
+    // when a line is broken, set the column count to its initial
+    // value and increment the line count by one.
+    if (memo.breakingLine) {
+      memo.column = INITIAL_TOKENIZATION_STATE.column - 1;
+      memo.line++;
+    }
+
+    // the column count is incremented on every iteration
+    memo.column++;
+
+    // the current character is discarded but stored as the previous one
+    memo.prevChar = memo.char;
+    input = input.slice(1);
+  }
+
+  // yield any substring left unresolved at end of input
+  if (memo.substr.length > 0) {
+    yield new Token(memo.substr, memo.line, memo.column - memo.substr.length);
+  }
+
+  return;
+}
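
For reference, a minimal way to drive the new generator, mirroring the spread pattern the tests use:

import { tokenize } from "./tokenize.ts";

// Stream tokens lazily, one per iteration...
for (const token of tokenize("type Example { testProperty: string }")) {
  console.log(token);
}

// ...or collect them eagerly.
const tokens = [...tokenize("type Example")];
console.log(tokens.length); // 2: the `type` keyword and the identifier

Because the lex parameter defaults to LEXICON, callers can inject an alternative lexicon without touching the generator logic.
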
11 changes: 7 additions & 4 deletions lib/tokenize/utils.ts
@@ -1,8 +1,11 @@
-import { LEXICON, Lexicon } from "./lexicon.ts";
+import { Lexicon } from "./lexicon.ts";

-export const findInLexicon = (raw: string): Lexicon | null => {
-  for (const [kind, value] of LEXICON) {
-    if ((Array.isArray(value) && value.includes(raw) || (raw === value))) {
+export const findInLexicon = (
+  raw: string,
+  lex: ReadonlyMap<Lexicon, string | string[] | null>,
+): Lexicon | null => {
+  for (const [kind, value] of lex) {
+    if (Array.isArray(value) && value.includes(raw) || (raw === value)) {
       return kind;
     }
   }
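
Threading the lexicon through as a parameter, rather than importing the LEXICON global here, is what lets tokenize accept a custom map. A sketch with a hypothetical alternate lexicon (the bracket mapping is invented for illustration):

import { Lexicon } from "./lexicon.ts";
import { findInLexicon } from "./utils.ts";

// Hypothetical lexicon that swaps braces for brackets.
const altLex: ReadonlyMap<Lexicon, string | string[] | null> = new Map([
  [Lexicon.StructOpener, "["],
  [Lexicon.StructCloser, "]"],
]);

console.log(findInLexicon("[", altLex)); // Lexicon.StructOpener
console.log(findInLexicon("{", altLex)); // null, since "{" is absent here
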

1 comment on commit d1470f7

deno-deploy bot commented on Nov 24, 2021:

Failed to deploy:

failed to fetch 'https://raw.githubusercontent.com/EthanThatOneKid/fart/d1470f788c81d2c657e80659826631fe8674bfd1/std/server/worker.ts': HTTP status client error (404 Not Found) for url (https://raw.githubusercontent.com/EthanThatOneKid/fart/d1470f788c81d2c657e80659826631fe8674bfd1/std/server/worker.ts)
