diff --git a/lib/tokenize/lexicon.ts b/lib/tokenize/lexicon.ts
index 92bab70..d96efe5 100644
--- a/lib/tokenize/lexicon.ts
+++ b/lib/tokenize/lexicon.ts
@@ -14,17 +14,21 @@ export enum Lexicon {
   Comment,
   CommentOpener,
   CommentCloser,
+  Whitespace,
   Unknown,
   EOF,
 }
 
-export const LEXICON = new Map([
+export const LEXICON: ReadonlyMap<Lexicon, string | string[] | null> = new Map<
+  Lexicon,
+  string | string[] | null
+>([
   [Lexicon.Identifier, null],
   [Lexicon.StructOpener, "{"],
   [Lexicon.StructCloser, "}"],
   [Lexicon.TupleOpener, "("],
   [Lexicon.TupleCloser, ")"],
-  [Lexicon.TypeDefiner, ["type", "struct", "interface"]],
+  [Lexicon.TypeDefiner, ["type", "spec"]],
   [Lexicon.PropertyDefiner, ":"],
   [Lexicon.PropertyOptionalMarker, "?"],
   [Lexicon.PropertyOptionalDefiner, "?:"],
@@ -34,17 +38,18 @@ export const LEXICON = new Map([
   [Lexicon.Comment, [";", "//"]],
   [Lexicon.CommentOpener, "/*"],
   [Lexicon.CommentCloser, "*/"],
+  [Lexicon.Whitespace, " "],
   [Lexicon.Unknown, null],
-  [Lexicon.EOF, null],
+  [Lexicon.EOF, "\n"],
 ]);
 
-// freezing LEXICON map into place, courtesy of https://stackoverflow.com/a/35776333
-LEXICON.set = function (key) {
-  throw new Error("Can't add property " + key + ", map is not extensible");
+// force-freezing LEXICON map into place, courtesy of https://stackoverflow.com/a/35776333
+(LEXICON as Map<Lexicon, string | string[] | null>).set = function (key) {
+  throw new Error(`Can't add property ${key}, map is not extensible`);
 };
-LEXICON.delete = function (key) {
-  throw new Error("Can't delete property " + key + ", map is frozen");
+(LEXICON as Map<Lexicon, string | string[] | null>).delete = function (key) {
+  throw new Error(`Can't delete property ${key}, map is frozen`);
 };
-LEXICON.clear = function () {
+(LEXICON as Map<Lexicon, string | string[] | null>).clear = function () {
   throw new Error("Can't clear map, map is frozen");
 };
diff --git a/lib/tokenize/token.ts b/lib/tokenize/token.ts
index 3145932..9db3210 100644
--- a/lib/tokenize/token.ts
+++ b/lib/tokenize/token.ts
@@ -1,4 +1,4 @@
-import { Lexicon } from "./lexicon.ts";
+import { LEXICON, Lexicon } from "./lexicon.ts";
 import {
   checkIsIdentifier,
   checkIsTextLiteral,
@@ -38,7 +38,7 @@ export class Token {
   }
 
   static getKindOf(raw: string): Lexicon {
-    const matchingKind = findInLexicon(raw);
+    const matchingKind = findInLexicon(raw, LEXICON);
     if (matchingKind !== null) return matchingKind;
     if (checkIsIdentifier(raw)) return Lexicon.Identifier;
     if (checkIsTextLiteral(raw)) return Lexicon.TextLiteral;
diff --git a/lib/tokenize/tokenize.test.ts b/lib/tokenize/tokenize.test.ts
index 951ca4b..5906522 100644
--- a/lib/tokenize/tokenize.test.ts
+++ b/lib/tokenize/tokenize.test.ts
@@ -1 +1,36 @@
-// https://github.com/EthanThatOneKid/fart/blob/main/lib/tokenize/tokenize.test.ts
+import { T } from "./t.ts";
+import { Token } from "./token.ts";
+import { tokenize } from "./tokenize.ts";
+import { assertEquals } from "../../deps/std/testing.ts";
+// import { LEXICON, Lexicon } from "./lexicon.ts";
+
+Deno.test("yields no tokens given an empty string", () => {
+  const input = "";
+  const expectation: Token[] = [];
+  const reality = [...tokenize(input)];
+  assertEquals(expectation, reality);
+});
+
+Deno.test("yields a single token `type`", () => {
+  const input = "type";
+  const expectation = [T.type(1, 1)];
+  const reality = [...tokenize(input)];
+  assertEquals(expectation, reality);
+});
+
+Deno.test("yields a full `type` definition", () => {
+  const input = `type Example {
+  testProperty: string
+}`;
+  const expectation = [
+    T.type(1, 1),
+    T.id("Example", 1, 6),
+    T.nest(1, 14),
+    T.id("testProperty", 2, 3),
+    T.setter_1(2, 15),
+    T.id("string", 2, 17),
+    T.denest(3, 1),
+  ];
+  const reality = [...tokenize(input)];
+  assertEquals(expectation, reality);
+});
diff --git a/lib/tokenize/tokenize.ts b/lib/tokenize/tokenize.ts
index 8c2a4d8..f55741f 100644
--- a/lib/tokenize/tokenize.ts
+++ b/lib/tokenize/tokenize.ts
@@ -1 +1,109 @@
-// https://github.com/EthanThatOneKid/fart/blob/main/lib/tokenize/tokenize.ts
+import { LEXICON, Lexicon } from "./lexicon.ts";
+import { Token } from "./token.ts";
+import { findInLexicon } from "./utils.ts";
+
+/**
+ * Object used to memoize the process of properly tokenizing
+ * Fart syntax.
+ */
+interface TokenizationState {
+  char: null | string;
+  prevChar: null | string;
+  substr: string;
+  prevSubstr: string;
+  line: number;
+  column: number;
+  yieldingChar: boolean; // if true, yields character as token at end of iteration
+  yieldingSubstr: boolean; // if true, yields substring as token at end of iteration
+  breakingLine: boolean; // if true, updates line and column counts at end of iteration
+}
+
+type FartTokenGenerator = Generator<Token, void, undefined>;
+
+const INITIAL_TOKENIZATION_STATE: Readonly<TokenizationState> = Object.freeze({
+  char: null,
+  prevChar: null,
+  substr: "",
+  prevSubstr: "",
+  line: 1,
+  column: 1,
+  yieldingChar: false,
+  yieldingSubstr: false,
+  breakingLine: false,
+});
+
+export function* tokenize(
+  input: string,
+  lex: ReadonlyMap<Lexicon, string | string[] | null> = LEXICON,
+): FartTokenGenerator {
+  const memo = { ...INITIAL_TOKENIZATION_STATE };
+
+  while (input.length > 0) {
+    memo.char = input[0];
+    memo.yieldingChar = INITIAL_TOKENIZATION_STATE.yieldingChar;
+    memo.yieldingSubstr = INITIAL_TOKENIZATION_STATE.yieldingSubstr;
+    memo.breakingLine = INITIAL_TOKENIZATION_STATE.breakingLine;
+
+    switch (findInLexicon(memo.char, lex)) {
+      // when a line break occurs, increment the line count, set column back to initial,
+      // and the current substring should become a token.
+      case Lexicon.EOF: {
+        memo.breakingLine = true;
+        memo.yieldingSubstr = true;
+        break;
+      }
+      case Lexicon.StructOpener:
+      case Lexicon.StructCloser:
+      case Lexicon.TupleOpener:
+      case Lexicon.TupleCloser:
+      case Lexicon.PropertyDefiner: {
+        memo.yieldingChar = true;
+        memo.yieldingSubstr = true;
+        break;
+      }
+      case Lexicon.PropertyOptionalMarker:
+      case Lexicon.Whitespace: {
+        memo.yieldingSubstr = true;
+        break;
+      }
+      default: {
+        memo.substr += memo.char;
+        break;
+      }
+    }
+
+    // yield and reset substring if substring is to be yielded
+    if (memo.yieldingSubstr && memo.substr.length > 0) {
+      yield new Token(memo.substr, memo.line, memo.column - memo.substr.length);
+      memo.prevSubstr = memo.substr;
+      memo.substr = INITIAL_TOKENIZATION_STATE.substr;
+    }
+
+    // if the current character is to be yielded, it must be yielded
+    // _after_ the substring.
+    if (memo.yieldingChar && memo.char !== null) {
+      yield new Token(memo.char, memo.line, memo.column);
+    }
+
+    // when a line is broken, set the column count to its initial
+    // value and increment the line count by one.
+    if (memo.breakingLine) {
+      memo.column = INITIAL_TOKENIZATION_STATE.column - 1;
+      memo.line++;
+    }
+
+    // column count is incremented per iteration
+    memo.column++;
+
+    // current character is discarded but set as previous.
+    memo.prevChar = memo.char;
+    input = input.slice(1);
+  }
+
+  // yield substring if one is left unresolved
+  if (memo.substr.length > 0) {
+    yield new Token(memo.substr, memo.line, memo.column - memo.substr.length);
+  }
+
+  return;
+}
diff --git a/lib/tokenize/utils.ts b/lib/tokenize/utils.ts
index 00eeadd..0a019a0 100644
--- a/lib/tokenize/utils.ts
+++ b/lib/tokenize/utils.ts
@@ -1,8 +1,11 @@
-import { LEXICON, Lexicon } from "./lexicon.ts";
+import { Lexicon } from "./lexicon.ts";
 
-export const findInLexicon = (raw: string): Lexicon | null => {
-  for (const [kind, value] of LEXICON) {
-    if ((Array.isArray(value) && value.includes(raw) || (raw === value))) {
+export const findInLexicon = (
+  raw: string,
+  lex: ReadonlyMap<Lexicon, string | string[] | null>,
+): Lexicon | null => {
+  for (const [kind, value] of lex) {
+    if (Array.isArray(value) && value.includes(raw) || (raw === value)) {
       return kind;
     }
   }
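Usage note: a minimal consumer sketch of the reworked tokenize() generator, assuming the module layout shown above; the sample source string and logging are illustrative only and not part of the change.

// Illustrative sketch; assumed to live in a sibling module under lib/tokenize/.
import { tokenize } from "./tokenize.ts";

const source = `type Example {
  testProperty: string
}`;

// tokenize() lazily yields Token objects carrying the raw text along with
// 1-based line and column positions, matching the expectations in tokenize.test.ts.
for (const token of tokenize(source)) {
  console.log(token);
}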