From 22bf241bbe260c40c99c7a5b8807737af44a63c4 Mon Sep 17 00:00:00 2001 From: Mathias Rieder Date: Wed, 29 Dec 2021 12:24:40 +0000 Subject: [PATCH 1/2] added architecture documentation --- book/book.toml | 2 +- book/highlight.js | 2564 +++++++++++++++++ book/llvm.min.js | 19 + book/src/SUMMARY.md | 6 + book/src/arch/architecture.md | 39 + book/src/arch/codegen.md | 146 + book/src/arch/indexer.md | 108 + book/src/arch/linker.md | 287 ++ book/src/arch/parser.md | 91 + book/src/arch/validation.md | 19 + src/codegen/tests/code_gen_tests.rs | 53 +- ...n_tests__function_block_instance_call.snap | 39 + ...es_generates_void_function_and_struct.snap | 19 + ...code_gen_tests__structs_are_generated.snap | 2 +- ...sts__string_tests__simple_string_test.snap | 11 + src/codegen/tests/string_tests.rs | 14 + .../tests/resolve_expressions_tests.rs | 2 +- 17 files changed, 3371 insertions(+), 50 deletions(-) create mode 100644 book/highlight.js create mode 100644 book/llvm.min.js create mode 100644 book/src/arch/architecture.md create mode 100644 book/src/arch/codegen.md create mode 100644 book/src/arch/indexer.md create mode 100644 book/src/arch/linker.md create mode 100644 book/src/arch/parser.md create mode 100644 book/src/arch/validation.md create mode 100644 src/codegen/tests/snapshots/rusty__codegen__tests__code_gen_tests__function_block_instance_call.snap create mode 100644 src/codegen/tests/snapshots/rusty__codegen__tests__code_gen_tests__program_with_variables_generates_void_function_and_struct.snap create mode 100644 src/codegen/tests/snapshots/rusty__codegen__tests__string_tests__simple_string_test.snap diff --git a/book/book.toml b/book/book.toml index a1d8a3025a..f5e9df188c 100644 --- a/book/book.toml +++ b/book/book.toml @@ -5,4 +5,4 @@ multilingual = false src = "src" title = "RuSTy User Documentation" [output.html] -additional-js = ["iecst.min.js", "custom.js"] +additional-js = ["iecst.min.js", "llvm.min.js", "custom.js", "highlight.js"] diff --git a/book/highlight.js b/book/highlight.js new file mode 100644 index 0000000000..0cd0445db1 --- /dev/null +++ b/book/highlight.js @@ -0,0 +1,2564 @@ +/*! 
+ Highlight.js v11.3.1 (git: 2a972d8658) + (c) 2006-2021 Ivan Sagalaev and other contributors + License: BSD-3-Clause + */ +var hljs = (function () { + 'use strict'; + + var deepFreezeEs6 = {exports: {}}; + + function deepFreeze(obj) { + if (obj instanceof Map) { + obj.clear = obj.delete = obj.set = function () { + throw new Error('map is read-only'); + }; + } else if (obj instanceof Set) { + obj.add = obj.clear = obj.delete = function () { + throw new Error('set is read-only'); + }; + } + + // Freeze self + Object.freeze(obj); + + Object.getOwnPropertyNames(obj).forEach(function (name) { + var prop = obj[name]; + + // Freeze prop if it is an object + if (typeof prop == 'object' && !Object.isFrozen(prop)) { + deepFreeze(prop); + } + }); + + return obj; + } + + deepFreezeEs6.exports = deepFreeze; + deepFreezeEs6.exports.default = deepFreeze; + + var deepFreeze$1 = deepFreezeEs6.exports; + + /** @typedef {import('highlight.js').CallbackResponse} CallbackResponse */ + /** @typedef {import('highlight.js').CompiledMode} CompiledMode */ + /** @implements CallbackResponse */ + + class Response { + /** + * @param {CompiledMode} mode + */ + constructor(mode) { + // eslint-disable-next-line no-undefined + if (mode.data === undefined) mode.data = {}; + + this.data = mode.data; + this.isMatchIgnored = false; + } + + ignoreMatch() { + this.isMatchIgnored = true; + } + } + + /** + * @param {string} value + * @returns {string} + */ + function escapeHTML(value) { + return value + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"') + .replace(/'/g, '''); + } + + /** + * performs a shallow merge of multiple objects into one + * + * @template T + * @param {T} original + * @param {Record[]} objects + * @returns {T} a single new object + */ + function inherit$1(original, ...objects) { + /** @type Record */ + const result = Object.create(null); + + for (const key in original) { + result[key] = original[key]; + } + objects.forEach(function(obj) { + for (const key in obj) { + result[key] = obj[key]; + } + }); + return /** @type {T} */ (result); + } + + /** + * @typedef {object} Renderer + * @property {(text: string) => void} addText + * @property {(node: Node) => void} openNode + * @property {(node: Node) => void} closeNode + * @property {() => string} value + */ + + /** @typedef {{kind?: string, sublanguage?: boolean}} Node */ + /** @typedef {{walk: (r: Renderer) => void}} Tree */ + /** */ + + const SPAN_CLOSE = ''; + + /** + * Determines if a node needs to be wrapped in + * + * @param {Node} node */ + const emitsWrappingTags = (node) => { + return !!node.kind; + }; + + /** + * + * @param {string} name + * @param {{prefix:string}} options + */ + const expandScopeName = (name, { prefix }) => { + if (name.includes(".")) { + const pieces = name.split("."); + return [ + `${prefix}${pieces.shift()}`, + ...(pieces.map((x, i) => `${x}${"_".repeat(i + 1)}`)) + ].join(" "); + } + return `${prefix}${name}`; + }; + + /** @type {Renderer} */ + class HTMLRenderer { + /** + * Creates a new HTMLRenderer + * + * @param {Tree} parseTree - the parse tree (must support `walk` API) + * @param {{classPrefix: string}} options + */ + constructor(parseTree, options) { + this.buffer = ""; + this.classPrefix = options.classPrefix; + parseTree.walk(this); + } + + /** + * Adds texts to the output stream + * + * @param {string} text */ + addText(text) { + this.buffer += escapeHTML(text); + } + + /** + * Adds a node open to the output stream (if needed) + * + * @param {Node} node */ + openNode(node) { + if 
(!emitsWrappingTags(node)) return; + + let scope = node.kind; + if (node.sublanguage) { + scope = `language-${scope}`; + } else { + scope = expandScopeName(scope, { prefix: this.classPrefix }); + } + this.span(scope); + } + + /** + * Adds a node close to the output stream (if needed) + * + * @param {Node} node */ + closeNode(node) { + if (!emitsWrappingTags(node)) return; + + this.buffer += SPAN_CLOSE; + } + + /** + * returns the accumulated buffer + */ + value() { + return this.buffer; + } + + // helpers + + /** + * Builds a span element + * + * @param {string} className */ + span(className) { + this.buffer += ``; + } + } + + /** @typedef {{kind?: string, sublanguage?: boolean, children: Node[]} | string} Node */ + /** @typedef {{kind?: string, sublanguage?: boolean, children: Node[]} } DataNode */ + /** @typedef {import('highlight.js').Emitter} Emitter */ + /** */ + + class TokenTree { + constructor() { + /** @type DataNode */ + this.rootNode = { children: [] }; + this.stack = [this.rootNode]; + } + + get top() { + return this.stack[this.stack.length - 1]; + } + + get root() { return this.rootNode; } + + /** @param {Node} node */ + add(node) { + this.top.children.push(node); + } + + /** @param {string} kind */ + openNode(kind) { + /** @type Node */ + const node = { kind, children: [] }; + this.add(node); + this.stack.push(node); + } + + closeNode() { + if (this.stack.length > 1) { + return this.stack.pop(); + } + // eslint-disable-next-line no-undefined + return undefined; + } + + closeAllNodes() { + while (this.closeNode()); + } + + toJSON() { + return JSON.stringify(this.rootNode, null, 4); + } + + /** + * @typedef { import("./html_renderer").Renderer } Renderer + * @param {Renderer} builder + */ + walk(builder) { + // this does not + return this.constructor._walk(builder, this.rootNode); + // this works + // return TokenTree._walk(builder, this.rootNode); + } + + /** + * @param {Renderer} builder + * @param {Node} node + */ + static _walk(builder, node) { + if (typeof node === "string") { + builder.addText(node); + } else if (node.children) { + builder.openNode(node); + node.children.forEach((child) => this._walk(builder, child)); + builder.closeNode(node); + } + return builder; + } + + /** + * @param {Node} node + */ + static _collapse(node) { + if (typeof node === "string") return; + if (!node.children) return; + + if (node.children.every(el => typeof el === "string")) { + // node.text = node.children.join(""); + // delete node.children; + node.children = [node.children.join("")]; + } else { + node.children.forEach((child) => { + TokenTree._collapse(child); + }); + } + } + } + + /** + Currently this is all private API, but this is the minimal API necessary + that an Emitter must implement to fully support the parser. 
+ + Minimal interface: + + - addKeyword(text, kind) + - addText(text) + - addSublanguage(emitter, subLanguageName) + - finalize() + - openNode(kind) + - closeNode() + - closeAllNodes() + - toHTML() + + */ + + /** + * @implements {Emitter} + */ + class TokenTreeEmitter extends TokenTree { + /** + * @param {*} options + */ + constructor(options) { + super(); + this.options = options; + } + + /** + * @param {string} text + * @param {string} kind + */ + addKeyword(text, kind) { + if (text === "") { return; } + + this.openNode(kind); + this.addText(text); + this.closeNode(); + } + + /** + * @param {string} text + */ + addText(text) { + if (text === "") { return; } + + this.add(text); + } + + /** + * @param {Emitter & {root: DataNode}} emitter + * @param {string} name + */ + addSublanguage(emitter, name) { + /** @type DataNode */ + const node = emitter.root; + node.kind = name; + node.sublanguage = true; + this.add(node); + } + + toHTML() { + const renderer = new HTMLRenderer(this, this.options); + return renderer.value(); + } + + finalize() { + return true; + } + } + + /** + * @param {string} value + * @returns {RegExp} + * */ + + /** + * @param {RegExp | string } re + * @returns {string} + */ + function source(re) { + if (!re) return null; + if (typeof re === "string") return re; + + return re.source; + } + + /** + * @param {RegExp | string } re + * @returns {string} + */ + function lookahead(re) { + return concat('(?=', re, ')'); + } + + /** + * @param {RegExp | string } re + * @returns {string} + */ + function anyNumberOfTimes(re) { + return concat('(?:', re, ')*'); + } + + /** + * @param {RegExp | string } re + * @returns {string} + */ + function optional(re) { + return concat('(?:', re, ')?'); + } + + /** + * @param {...(RegExp | string) } args + * @returns {string} + */ + function concat(...args) { + const joined = args.map((x) => source(x)).join(""); + return joined; + } + + /** + * @param { Array } args + * @returns {object} + */ + function stripOptionsFromArgs(args) { + const opts = args[args.length - 1]; + + if (typeof opts === 'object' && opts.constructor === Object) { + args.splice(args.length - 1, 1); + return opts; + } else { + return {}; + } + } + + /** + * Any of the passed expresssions may match + * + * Creates a huge this | this | that | that match + * @param {(RegExp | string)[] } args + * @returns {string} + */ + function either(...args) { + /** @type { object & {capture?: boolean} } */ + const opts = stripOptionsFromArgs(args); + const joined = '(' + + (opts.capture ? "" : "?:") + + args.map((x) => source(x)).join("|") + ")"; + return joined; + } + + /** + * @param {RegExp | string} re + * @returns {number} + */ + function countMatchGroups(re) { + return (new RegExp(re.toString() + '|')).exec('').length - 1; + } + + /** + * Does lexeme start with a regular expression match at the beginning + * @param {RegExp} re + * @param {string} lexeme + */ + function startsWith(re, lexeme) { + const match = re && re.exec(lexeme); + return match && match.index === 0; + } + + // BACKREF_RE matches an open parenthesis or backreference. To avoid + // an incorrect parse, it additionally matches the following: + // - [...] elements, where the meaning of parentheses and escapes change + // - other escape sequences, so we do not misparse escape sequences as + // interesting elements + // - non-matching or lookahead parentheses, which do not capture. These + // follow the '(' with a '?'. 
+ const BACKREF_RE = /\[(?:[^\\\]]|\\.)*\]|\(\??|\\([1-9][0-9]*)|\\./; + + // **INTERNAL** Not intended for outside usage + // join logically computes regexps.join(separator), but fixes the + // backreferences so they continue to match. + // it also places each individual regular expression into it's own + // match group, keeping track of the sequencing of those match groups + // is currently an exercise for the caller. :-) + /** + * @param {(string | RegExp)[]} regexps + * @param {{joinWith: string}} opts + * @returns {string} + */ + function _rewriteBackreferences(regexps, { joinWith }) { + let numCaptures = 0; + + return regexps.map((regex) => { + numCaptures += 1; + const offset = numCaptures; + let re = source(regex); + let out = ''; + + while (re.length > 0) { + const match = BACKREF_RE.exec(re); + if (!match) { + out += re; + break; + } + out += re.substring(0, match.index); + re = re.substring(match.index + match[0].length); + if (match[0][0] === '\\' && match[1]) { + // Adjust the backreference. + out += '\\' + String(Number(match[1]) + offset); + } else { + out += match[0]; + if (match[0] === '(') { + numCaptures++; + } + } + } + return out; + }).map(re => `(${re})`).join(joinWith); + } + + /** @typedef {import('highlight.js').Mode} Mode */ + /** @typedef {import('highlight.js').ModeCallback} ModeCallback */ + + // Common regexps + const MATCH_NOTHING_RE = /\b\B/; + const IDENT_RE = '[a-zA-Z]\\w*'; + const UNDERSCORE_IDENT_RE = '[a-zA-Z_]\\w*'; + const NUMBER_RE = '\\b\\d+(\\.\\d+)?'; + const C_NUMBER_RE = '(-?)(\\b0[xX][a-fA-F0-9]+|(\\b\\d+(\\.\\d*)?|\\.\\d+)([eE][-+]?\\d+)?)'; // 0x..., 0..., decimal, float + const BINARY_NUMBER_RE = '\\b(0b[01]+)'; // 0b... + const RE_STARTERS_RE = '!|!=|!==|%|%=|&|&&|&=|\\*|\\*=|\\+|\\+=|,|-|-=|/=|/|:|;|<<|<<=|<=|<|===|==|=|>>>=|>>=|>=|>>>|>>|>|\\?|\\[|\\{|\\(|\\^|\\^=|\\||\\|=|\\|\\||~'; + + /** + * @param { Partial & {binary?: string | RegExp} } opts + */ + const SHEBANG = (opts = {}) => { + const beginShebang = /^#![ ]*\//; + if (opts.binary) { + opts.begin = concat( + beginShebang, + /.*\b/, + opts.binary, + /\b.*/); + } + return inherit$1({ + scope: 'meta', + begin: beginShebang, + end: /$/, + relevance: 0, + /** @type {ModeCallback} */ + "on:begin": (m, resp) => { + if (m.index !== 0) resp.ignoreMatch(); + } + }, opts); + }; + + // Common modes + const BACKSLASH_ESCAPE = { + begin: '\\\\[\\s\\S]', relevance: 0 + }; + const APOS_STRING_MODE = { + scope: 'string', + begin: '\'', + end: '\'', + illegal: '\\n', + contains: [BACKSLASH_ESCAPE] + }; + const QUOTE_STRING_MODE = { + scope: 'string', + begin: '"', + end: '"', + illegal: '\\n', + contains: [BACKSLASH_ESCAPE] + }; + const PHRASAL_WORDS_MODE = { + begin: /\b(a|an|the|are|I'm|isn't|don't|doesn't|won't|but|just|should|pretty|simply|enough|gonna|going|wtf|so|such|will|you|your|they|like|more)\b/ + }; + /** + * Creates a comment mode + * + * @param {string | RegExp} begin + * @param {string | RegExp} end + * @param {Mode | {}} [modeOptions] + * @returns {Partial} + */ + const COMMENT = function(begin, end, modeOptions = {}) { + const mode = inherit$1( + { + scope: 'comment', + begin, + end, + contains: [] + }, + modeOptions + ); + mode.contains.push({ + scope: 'doctag', + // hack to avoid the space from being included. 
the space is necessary to + // match here to prevent the plain text rule below from gobbling up doctags + begin: '[ ]*(?=(TODO|FIXME|NOTE|BUG|OPTIMIZE|HACK|XXX):)', + end: /(TODO|FIXME|NOTE|BUG|OPTIMIZE|HACK|XXX):/, + excludeBegin: true, + relevance: 0 + }); + const ENGLISH_WORD = either( + // list of common 1 and 2 letter words in English + "I", + "a", + "is", + "so", + "us", + "to", + "at", + "if", + "in", + "it", + "on", + // note: this is not an exhaustive list of contractions, just popular ones + /[A-Za-z]+['](d|ve|re|ll|t|s|n)/, // contractions - can't we'd they're let's, etc + /[A-Za-z]+[-][a-z]+/, // `no-way`, etc. + /[A-Za-z][a-z]{2,}/ // allow capitalized words at beginning of sentences + ); + // looking like plain text, more likely to be a comment + mode.contains.push( + { + // TODO: how to include ", (, ) without breaking grammars that use these for + // comment delimiters? + // begin: /[ ]+([()"]?([A-Za-z'-]{3,}|is|a|I|so|us|[tT][oO]|at|if|in|it|on)[.]?[()":]?([.][ ]|[ ]|\))){3}/ + // --- + + // this tries to find sequences of 3 english words in a row (without any + // "programming" type syntax) this gives us a strong signal that we've + // TRULY found a comment - vs perhaps scanning with the wrong language. + // It's possible to find something that LOOKS like the start of the + // comment - but then if there is no readable text - good chance it is a + // false match and not a comment. + // + // for a visual example please see: + // https://github.com/highlightjs/highlight.js/issues/2827 + + begin: concat( + /[ ]+/, // necessary to prevent us gobbling up doctags like /* @author Bob Mcgill */ + '(', + ENGLISH_WORD, + /[.]?[:]?([.][ ]|[ ])/, + '){3}') // look for 3 words in a row + } + ); + return mode; + }; + const C_LINE_COMMENT_MODE = COMMENT('//', '$'); + const C_BLOCK_COMMENT_MODE = COMMENT('/\\*', '\\*/'); + const HASH_COMMENT_MODE = COMMENT('#', '$'); + const NUMBER_MODE = { + scope: 'number', + begin: NUMBER_RE, + relevance: 0 + }; + const C_NUMBER_MODE = { + scope: 'number', + begin: C_NUMBER_RE, + relevance: 0 + }; + const BINARY_NUMBER_MODE = { + scope: 'number', + begin: BINARY_NUMBER_RE, + relevance: 0 + }; + const REGEXP_MODE = { + // this outer rule makes sure we actually have a WHOLE regex and not simply + // an expression such as: + // + // 3 / something + // + // (which will then blow up when regex's `illegal` sees the newline) + begin: /(?=\/[^/\n]*\/)/, + contains: [{ + scope: 'regexp', + begin: /\//, + end: /\/[gimuy]*/, + illegal: /\n/, + contains: [ + BACKSLASH_ESCAPE, + { + begin: /\[/, + end: /\]/, + relevance: 0, + contains: [BACKSLASH_ESCAPE] + } + ] + }] + }; + const TITLE_MODE = { + scope: 'title', + begin: IDENT_RE, + relevance: 0 + }; + const UNDERSCORE_TITLE_MODE = { + scope: 'title', + begin: UNDERSCORE_IDENT_RE, + relevance: 0 + }; + const METHOD_GUARD = { + // excludes method names from keyword processing + begin: '\\.\\s*' + UNDERSCORE_IDENT_RE, + relevance: 0 + }; + + /** + * Adds end same as begin mechanics to a mode + * + * Your mode must include at least a single () match group as that first match + * group is what is used for comparison + * @param {Partial} mode + */ + const END_SAME_AS_BEGIN = function(mode) { + return Object.assign(mode, + { + /** @type {ModeCallback} */ + 'on:begin': (m, resp) => { resp.data._beginMatch = m[1]; }, + /** @type {ModeCallback} */ + 'on:end': (m, resp) => { if (resp.data._beginMatch !== m[1]) resp.ignoreMatch(); } + }); + }; + + var MODES = /*#__PURE__*/Object.freeze({ + __proto__: null, + 
MATCH_NOTHING_RE: MATCH_NOTHING_RE, + IDENT_RE: IDENT_RE, + UNDERSCORE_IDENT_RE: UNDERSCORE_IDENT_RE, + NUMBER_RE: NUMBER_RE, + C_NUMBER_RE: C_NUMBER_RE, + BINARY_NUMBER_RE: BINARY_NUMBER_RE, + RE_STARTERS_RE: RE_STARTERS_RE, + SHEBANG: SHEBANG, + BACKSLASH_ESCAPE: BACKSLASH_ESCAPE, + APOS_STRING_MODE: APOS_STRING_MODE, + QUOTE_STRING_MODE: QUOTE_STRING_MODE, + PHRASAL_WORDS_MODE: PHRASAL_WORDS_MODE, + COMMENT: COMMENT, + C_LINE_COMMENT_MODE: C_LINE_COMMENT_MODE, + C_BLOCK_COMMENT_MODE: C_BLOCK_COMMENT_MODE, + HASH_COMMENT_MODE: HASH_COMMENT_MODE, + NUMBER_MODE: NUMBER_MODE, + C_NUMBER_MODE: C_NUMBER_MODE, + BINARY_NUMBER_MODE: BINARY_NUMBER_MODE, + REGEXP_MODE: REGEXP_MODE, + TITLE_MODE: TITLE_MODE, + UNDERSCORE_TITLE_MODE: UNDERSCORE_TITLE_MODE, + METHOD_GUARD: METHOD_GUARD, + END_SAME_AS_BEGIN: END_SAME_AS_BEGIN + }); + + /** + @typedef {import('highlight.js').CallbackResponse} CallbackResponse + @typedef {import('highlight.js').CompilerExt} CompilerExt + */ + + // Grammar extensions / plugins + // See: https://github.com/highlightjs/highlight.js/issues/2833 + + // Grammar extensions allow "syntactic sugar" to be added to the grammar modes + // without requiring any underlying changes to the compiler internals. + + // `compileMatch` being the perfect small example of now allowing a grammar + // author to write `match` when they desire to match a single expression rather + // than being forced to use `begin`. The extension then just moves `match` into + // `begin` when it runs. Ie, no features have been added, but we've just made + // the experience of writing (and reading grammars) a little bit nicer. + + // ------ + + // TODO: We need negative look-behind support to do this properly + /** + * Skip a match if it has a preceding dot + * + * This is used for `beginKeywords` to prevent matching expressions such as + * `bob.keyword.do()`. 
The mode compiler automatically wires this up as a + * special _internal_ 'on:begin' callback for modes with `beginKeywords` + * @param {RegExpMatchArray} match + * @param {CallbackResponse} response + */ + function skipIfHasPrecedingDot(match, response) { + const before = match.input[match.index - 1]; + if (before === ".") { + response.ignoreMatch(); + } + } + + /** + * + * @type {CompilerExt} + */ + function scopeClassName(mode, _parent) { + // eslint-disable-next-line no-undefined + if (mode.className !== undefined) { + mode.scope = mode.className; + delete mode.className; + } + } + + /** + * `beginKeywords` syntactic sugar + * @type {CompilerExt} + */ + function beginKeywords(mode, parent) { + if (!parent) return; + if (!mode.beginKeywords) return; + + // for languages with keywords that include non-word characters checking for + // a word boundary is not sufficient, so instead we check for a word boundary + // or whitespace - this does no harm in any case since our keyword engine + // doesn't allow spaces in keywords anyways and we still check for the boundary + // first + mode.begin = '\\b(' + mode.beginKeywords.split(' ').join('|') + ')(?!\\.)(?=\\b|\\s)'; + mode.__beforeBegin = skipIfHasPrecedingDot; + mode.keywords = mode.keywords || mode.beginKeywords; + delete mode.beginKeywords; + + // prevents double relevance, the keywords themselves provide + // relevance, the mode doesn't need to double it + // eslint-disable-next-line no-undefined + if (mode.relevance === undefined) mode.relevance = 0; + } + + /** + * Allow `illegal` to contain an array of illegal values + * @type {CompilerExt} + */ + function compileIllegal(mode, _parent) { + if (!Array.isArray(mode.illegal)) return; + + mode.illegal = either(...mode.illegal); + } + + /** + * `match` to match a single expression for readability + * @type {CompilerExt} + */ + function compileMatch(mode, _parent) { + if (!mode.match) return; + if (mode.begin || mode.end) throw new Error("begin & end are not supported with match"); + + mode.begin = mode.match; + delete mode.match; + } + + /** + * provides the default 1 relevance to all modes + * @type {CompilerExt} + */ + function compileRelevance(mode, _parent) { + // eslint-disable-next-line no-undefined + if (mode.relevance === undefined) mode.relevance = 1; + } + + // allow beforeMatch to act as a "qualifier" for the match + // the full match begin must be [beforeMatch][begin] + const beforeMatchExt = (mode, parent) => { + if (!mode.beforeMatch) return; + // starts conflicts with endsParent which we need to make sure the child + // rule is not matched multiple times + if (mode.starts) throw new Error("beforeMatch cannot be used with starts"); + + const originalMode = Object.assign({}, mode); + Object.keys(mode).forEach((key) => { delete mode[key]; }); + + mode.keywords = originalMode.keywords; + mode.begin = concat(originalMode.beforeMatch, lookahead(originalMode.begin)); + mode.starts = { + relevance: 0, + contains: [ + Object.assign(originalMode, { endsParent: true }) + ] + }; + mode.relevance = 0; + + delete originalMode.beforeMatch; + }; + + // keywords that should have no default relevance value + const COMMON_KEYWORDS = [ + 'of', + 'and', + 'for', + 'in', + 'not', + 'or', + 'if', + 'then', + 'parent', // common variable name + 'list', // common variable name + 'value' // common variable name + ]; + + const DEFAULT_KEYWORD_SCOPE = "keyword"; + + /** + * Given raw keywords from a language definition, compile them. 
+ * + * @param {string | Record | Array} rawKeywords + * @param {boolean} caseInsensitive + */ + function compileKeywords(rawKeywords, caseInsensitive, scopeName = DEFAULT_KEYWORD_SCOPE) { + /** @type KeywordDict */ + const compiledKeywords = Object.create(null); + + // input can be a string of keywords, an array of keywords, or a object with + // named keys representing scopeName (which can then point to a string or array) + if (typeof rawKeywords === 'string') { + compileList(scopeName, rawKeywords.split(" ")); + } else if (Array.isArray(rawKeywords)) { + compileList(scopeName, rawKeywords); + } else { + Object.keys(rawKeywords).forEach(function(scopeName) { + // collapse all our objects back into the parent object + Object.assign( + compiledKeywords, + compileKeywords(rawKeywords[scopeName], caseInsensitive, scopeName) + ); + }); + } + return compiledKeywords; + + // --- + + /** + * Compiles an individual list of keywords + * + * Ex: "for if when while|5" + * + * @param {string} scopeName + * @param {Array} keywordList + */ + function compileList(scopeName, keywordList) { + if (caseInsensitive) { + keywordList = keywordList.map(x => x.toLowerCase()); + } + keywordList.forEach(function(keyword) { + const pair = keyword.split('|'); + compiledKeywords[pair[0]] = [scopeName, scoreForKeyword(pair[0], pair[1])]; + }); + } + } + + /** + * Returns the proper score for a given keyword + * + * Also takes into account comment keywords, which will be scored 0 UNLESS + * another score has been manually assigned. + * @param {string} keyword + * @param {string} [providedScore] + */ + function scoreForKeyword(keyword, providedScore) { + // manual scores always win over common keywords + // so you can force a score of 1 if you really insist + if (providedScore) { + return Number(providedScore); + } + + return commonKeyword(keyword) ? 0 : 1; + } + + /** + * Determines if a given keyword is common or not + * + * @param {string} keyword */ + function commonKeyword(keyword) { + return COMMON_KEYWORDS.includes(keyword.toLowerCase()); + } + + /* + + For the reasoning behind this please see: + https://github.com/highlightjs/highlight.js/issues/2880#issuecomment-747275419 + + */ + + /** + * @type {Record} + */ + const seenDeprecations = {}; + + /** + * @param {string} message + */ + const error = (message) => { + console.error(message); + }; + + /** + * @param {string} message + * @param {any} args + */ + const warn = (message, ...args) => { + console.log(`WARN: ${message}`, ...args); + }; + + /** + * @param {string} version + * @param {string} message + */ + const deprecated = (version, message) => { + if (seenDeprecations[`${version}/${message}`]) return; + + console.log(`Deprecated as of ${version}. ${message}`); + seenDeprecations[`${version}/${message}`] = true; + }; + + /* eslint-disable no-throw-literal */ + + /** + @typedef {import('highlight.js').CompiledMode} CompiledMode + */ + + const MultiClassError = new Error(); + + /** + * Renumbers labeled scope names to account for additional inner match + * groups that otherwise would break everything. + * + * Lets say we 3 match scopes: + * + * { 1 => ..., 2 => ..., 3 => ... } + * + * So what we need is a clean match like this: + * + * (a)(b)(c) => [ "a", "b", "c" ] + * + * But this falls apart with inner match groups: + * + * (a)(((b)))(c) => ["a", "b", "b", "b", "c" ] + * + * Our scopes are now "out of alignment" and we're repeating `b` 3 times. + * What needs to happen is the numbers are remapped: + * + * { 1 => ..., 2 => ..., 5 => ... 
} + * + * We also need to know that the ONLY groups that should be output + * are 1, 2, and 5. This function handles this behavior. + * + * @param {CompiledMode} mode + * @param {Array} regexes + * @param {{key: "beginScope"|"endScope"}} opts + */ + function remapScopeNames(mode, regexes, { key }) { + let offset = 0; + const scopeNames = mode[key]; + /** @type Record */ + const emit = {}; + /** @type Record */ + const positions = {}; + + for (let i = 1; i <= regexes.length; i++) { + positions[i + offset] = scopeNames[i]; + emit[i + offset] = true; + offset += countMatchGroups(regexes[i - 1]); + } + // we use _emit to keep track of which match groups are "top-level" to avoid double + // output from inside match groups + mode[key] = positions; + mode[key]._emit = emit; + mode[key]._multi = true; + } + + /** + * @param {CompiledMode} mode + */ + function beginMultiClass(mode) { + if (!Array.isArray(mode.begin)) return; + + if (mode.skip || mode.excludeBegin || mode.returnBegin) { + error("skip, excludeBegin, returnBegin not compatible with beginScope: {}"); + throw MultiClassError; + } + + if (typeof mode.beginScope !== "object" || mode.beginScope === null) { + error("beginScope must be object"); + throw MultiClassError; + } + + remapScopeNames(mode, mode.begin, { key: "beginScope" }); + mode.begin = _rewriteBackreferences(mode.begin, { joinWith: "" }); + } + + /** + * @param {CompiledMode} mode + */ + function endMultiClass(mode) { + if (!Array.isArray(mode.end)) return; + + if (mode.skip || mode.excludeEnd || mode.returnEnd) { + error("skip, excludeEnd, returnEnd not compatible with endScope: {}"); + throw MultiClassError; + } + + if (typeof mode.endScope !== "object" || mode.endScope === null) { + error("endScope must be object"); + throw MultiClassError; + } + + remapScopeNames(mode, mode.end, { key: "endScope" }); + mode.end = _rewriteBackreferences(mode.end, { joinWith: "" }); + } + + /** + * this exists only to allow `scope: {}` to be used beside `match:` + * Otherwise `beginScope` would necessary and that would look weird + + { + match: [ /def/, /\w+/ ] + scope: { 1: "keyword" , 2: "title" } + } + + * @param {CompiledMode} mode + */ + function scopeSugar(mode) { + if (mode.scope && typeof mode.scope === "object" && mode.scope !== null) { + mode.beginScope = mode.scope; + delete mode.scope; + } + } + + /** + * @param {CompiledMode} mode + */ + function MultiClass(mode) { + scopeSugar(mode); + + if (typeof mode.beginScope === "string") { + mode.beginScope = { _wrap: mode.beginScope }; + } + if (typeof mode.endScope === "string") { + mode.endScope = { _wrap: mode.endScope }; + } + + beginMultiClass(mode); + endMultiClass(mode); + } + + /** + @typedef {import('highlight.js').Mode} Mode + @typedef {import('highlight.js').CompiledMode} CompiledMode + @typedef {import('highlight.js').Language} Language + @typedef {import('highlight.js').HLJSPlugin} HLJSPlugin + @typedef {import('highlight.js').CompiledLanguage} CompiledLanguage + */ + + // compilation + + /** + * Compiles a language definition result + * + * Given the raw result of a language definition (Language), compiles this so + * that it is ready for highlighting code. + * @param {Language} language + * @returns {CompiledLanguage} + */ + function compileLanguage(language) { + /** + * Builds a regex with the case sensitivity of the current language + * + * @param {RegExp | string} value + * @param {boolean} [global] + */ + function langRe(value, global) { + return new RegExp( + source(value), + 'm' + + (language.case_insensitive ? 
'i' : '') + + (language.unicodeRegex ? 'u' : '') + + (global ? 'g' : '') + ); + } + + /** + Stores multiple regular expressions and allows you to quickly search for + them all in a string simultaneously - returning the first match. It does + this by creating a huge (a|b|c) regex - each individual item wrapped with () + and joined by `|` - using match groups to track position. When a match is + found checking which position in the array has content allows us to figure + out which of the original regexes / match groups triggered the match. + + The match object itself (the result of `Regex.exec`) is returned but also + enhanced by merging in any meta-data that was registered with the regex. + This is how we keep track of which mode matched, and what type of rule + (`illegal`, `begin`, end, etc). + */ + class MultiRegex { + constructor() { + this.matchIndexes = {}; + // @ts-ignore + this.regexes = []; + this.matchAt = 1; + this.position = 0; + } + + // @ts-ignore + addRule(re, opts) { + opts.position = this.position++; + // @ts-ignore + this.matchIndexes[this.matchAt] = opts; + this.regexes.push([opts, re]); + this.matchAt += countMatchGroups(re) + 1; + } + + compile() { + if (this.regexes.length === 0) { + // avoids the need to check length every time exec is called + // @ts-ignore + this.exec = () => null; + } + const terminators = this.regexes.map(el => el[1]); + this.matcherRe = langRe(_rewriteBackreferences(terminators, { joinWith: '|' }), true); + this.lastIndex = 0; + } + + /** @param {string} s */ + exec(s) { + this.matcherRe.lastIndex = this.lastIndex; + const match = this.matcherRe.exec(s); + if (!match) { return null; } + + // eslint-disable-next-line no-undefined + const i = match.findIndex((el, i) => i > 0 && el !== undefined); + // @ts-ignore + const matchData = this.matchIndexes[i]; + // trim off any earlier non-relevant match groups (ie, the other regex + // match groups that make up the multi-matcher) + match.splice(0, i); + + return Object.assign(match, matchData); + } + } + + /* + Created to solve the key deficiently with MultiRegex - there is no way to + test for multiple matches at a single location. Why would we need to do + that? In the future a more dynamic engine will allow certain matches to be + ignored. An example: if we matched say the 3rd regex in a large group but + decided to ignore it - we'd need to started testing again at the 4th + regex... but MultiRegex itself gives us no real way to do that. + + So what this class creates MultiRegexs on the fly for whatever search + position they are needed. + + NOTE: These additional MultiRegex objects are created dynamically. For most + grammars most of the time we will never actually need anything more than the + first MultiRegex - so this shouldn't have too much overhead. + + Say this is our search group, and we match regex3, but wish to ignore it. + + regex1 | regex2 | regex3 | regex4 | regex5 ' ie, startAt = 0 + + What we need is a new MultiRegex that only includes the remaining + possibilities: + + regex4 | regex5 ' ie, startAt = 3 + + This class wraps all that complexity up in a simple API... `startAt` decides + where in the array of expressions to start doing the matching. It + auto-increments, so if a match is found at position 2, then startAt will be + set to 3. If the end is reached startAt will return to 0. + + MOST of the time the parser will be setting startAt manually to 0. 
+ */ + class ResumableMultiRegex { + constructor() { + // @ts-ignore + this.rules = []; + // @ts-ignore + this.multiRegexes = []; + this.count = 0; + + this.lastIndex = 0; + this.regexIndex = 0; + } + + // @ts-ignore + getMatcher(index) { + if (this.multiRegexes[index]) return this.multiRegexes[index]; + + const matcher = new MultiRegex(); + this.rules.slice(index).forEach(([re, opts]) => matcher.addRule(re, opts)); + matcher.compile(); + this.multiRegexes[index] = matcher; + return matcher; + } + + resumingScanAtSamePosition() { + return this.regexIndex !== 0; + } + + considerAll() { + this.regexIndex = 0; + } + + // @ts-ignore + addRule(re, opts) { + this.rules.push([re, opts]); + if (opts.type === "begin") this.count++; + } + + /** @param {string} s */ + exec(s) { + const m = this.getMatcher(this.regexIndex); + m.lastIndex = this.lastIndex; + let result = m.exec(s); + + // The following is because we have no easy way to say "resume scanning at the + // existing position but also skip the current rule ONLY". What happens is + // all prior rules are also skipped which can result in matching the wrong + // thing. Example of matching "booger": + + // our matcher is [string, "booger", number] + // + // ....booger.... + + // if "booger" is ignored then we'd really need a regex to scan from the + // SAME position for only: [string, number] but ignoring "booger" (if it + // was the first match), a simple resume would scan ahead who knows how + // far looking only for "number", ignoring potential string matches (or + // future "booger" matches that might be valid.) + + // So what we do: We execute two matchers, one resuming at the same + // position, but the second full matcher starting at the position after: + + // /--- resume first regex match here (for [number]) + // |/---- full match here for [string, "booger", number] + // vv + // ....booger.... + + // Which ever results in a match first is then used. So this 3-4 step + // process essentially allows us to say "match at this position, excluding + // a prior rule that was ignored". + // + // 1. Match "booger" first, ignore. Also proves that [string] does non match. + // 2. Resume matching for [number] + // 3. Match at index + 1 for [string, "booger", number] + // 4. If #2 and #3 result in matches, which came first? + if (this.resumingScanAtSamePosition()) { + if (result && result.index === this.lastIndex) ; else { // use the second matcher result + const m2 = this.getMatcher(0); + m2.lastIndex = this.lastIndex + 1; + result = m2.exec(s); + } + } + + if (result) { + this.regexIndex += result.position + 1; + if (this.regexIndex === this.count) { + // wrap-around to considering all matches again + this.considerAll(); + } + } + + return result; + } + } + + /** + * Given a mode, builds a huge ResumableMultiRegex that can be used to walk + * the content and find matches. + * + * @param {CompiledMode} mode + * @returns {ResumableMultiRegex} + */ + function buildModeRegex(mode) { + const mm = new ResumableMultiRegex(); + + mode.contains.forEach(term => mm.addRule(term.begin, { rule: term, type: "begin" })); + + if (mode.terminatorEnd) { + mm.addRule(mode.terminatorEnd, { type: "end" }); + } + if (mode.illegal) { + mm.addRule(mode.illegal, { type: "illegal" }); + } + + return mm; + } + + /** skip vs abort vs ignore + * + * @skip - The mode is still entered and exited normally (and contains rules apply), + * but all content is held and added to the parent buffer rather than being + * output when the mode ends. 
Mostly used with `sublanguage` to build up + * a single large buffer than can be parsed by sublanguage. + * + * - The mode begin ands ends normally. + * - Content matched is added to the parent mode buffer. + * - The parser cursor is moved forward normally. + * + * @abort - A hack placeholder until we have ignore. Aborts the mode (as if it + * never matched) but DOES NOT continue to match subsequent `contains` + * modes. Abort is bad/suboptimal because it can result in modes + * farther down not getting applied because an earlier rule eats the + * content but then aborts. + * + * - The mode does not begin. + * - Content matched by `begin` is added to the mode buffer. + * - The parser cursor is moved forward accordingly. + * + * @ignore - Ignores the mode (as if it never matched) and continues to match any + * subsequent `contains` modes. Ignore isn't technically possible with + * the current parser implementation. + * + * - The mode does not begin. + * - Content matched by `begin` is ignored. + * - The parser cursor is not moved forward. + */ + + /** + * Compiles an individual mode + * + * This can raise an error if the mode contains certain detectable known logic + * issues. + * @param {Mode} mode + * @param {CompiledMode | null} [parent] + * @returns {CompiledMode | never} + */ + function compileMode(mode, parent) { + const cmode = /** @type CompiledMode */ (mode); + if (mode.isCompiled) return cmode; + + [ + scopeClassName, + // do this early so compiler extensions generally don't have to worry about + // the distinction between match/begin + compileMatch, + MultiClass, + beforeMatchExt + ].forEach(ext => ext(mode, parent)); + + language.compilerExtensions.forEach(ext => ext(mode, parent)); + + // __beforeBegin is considered private API, internal use only + mode.__beforeBegin = null; + + [ + beginKeywords, + // do this later so compiler extensions that come earlier have access to the + // raw array if they wanted to perhaps manipulate it, etc. + compileIllegal, + // default to 1 relevance if not specified + compileRelevance + ].forEach(ext => ext(mode, parent)); + + mode.isCompiled = true; + + let keywordPattern = null; + if (typeof mode.keywords === "object" && mode.keywords.$pattern) { + // we need a copy because keywords might be compiled multiple times + // so we can't go deleting $pattern from the original on the first + // pass + mode.keywords = Object.assign({}, mode.keywords); + keywordPattern = mode.keywords.$pattern; + delete mode.keywords.$pattern; + } + keywordPattern = keywordPattern || /\w+/; + + if (mode.keywords) { + mode.keywords = compileKeywords(mode.keywords, language.case_insensitive); + } + + cmode.keywordPatternRe = langRe(keywordPattern, true); + + if (parent) { + if (!mode.begin) mode.begin = /\B|\b/; + cmode.beginRe = langRe(cmode.begin); + if (!mode.end && !mode.endsWithParent) mode.end = /\B|\b/; + if (mode.end) cmode.endRe = langRe(cmode.end); + cmode.terminatorEnd = source(cmode.end) || ''; + if (mode.endsWithParent && parent.terminatorEnd) { + cmode.terminatorEnd += (mode.end ? '|' : '') + parent.terminatorEnd; + } + } + if (mode.illegal) cmode.illegalRe = langRe(/** @type {RegExp | string} */ (mode.illegal)); + if (!mode.contains) mode.contains = []; + + mode.contains = [].concat(...mode.contains.map(function(c) { + return expandOrCloneMode(c === 'self' ? 
mode : c); + })); + mode.contains.forEach(function(c) { compileMode(/** @type Mode */ (c), cmode); }); + + if (mode.starts) { + compileMode(mode.starts, parent); + } + + cmode.matcher = buildModeRegex(cmode); + return cmode; + } + + if (!language.compilerExtensions) language.compilerExtensions = []; + + // self is not valid at the top-level + if (language.contains && language.contains.includes('self')) { + throw new Error("ERR: contains `self` is not supported at the top-level of a language. See documentation."); + } + + // we need a null object, which inherit will guarantee + language.classNameAliases = inherit$1(language.classNameAliases || {}); + + return compileMode(/** @type Mode */ (language)); + } + + /** + * Determines if a mode has a dependency on it's parent or not + * + * If a mode does have a parent dependency then often we need to clone it if + * it's used in multiple places so that each copy points to the correct parent, + * where-as modes without a parent can often safely be re-used at the bottom of + * a mode chain. + * + * @param {Mode | null} mode + * @returns {boolean} - is there a dependency on the parent? + * */ + function dependencyOnParent(mode) { + if (!mode) return false; + + return mode.endsWithParent || dependencyOnParent(mode.starts); + } + + /** + * Expands a mode or clones it if necessary + * + * This is necessary for modes with parental dependenceis (see notes on + * `dependencyOnParent`) and for nodes that have `variants` - which must then be + * exploded into their own individual modes at compile time. + * + * @param {Mode} mode + * @returns {Mode | Mode[]} + * */ + function expandOrCloneMode(mode) { + if (mode.variants && !mode.cachedVariants) { + mode.cachedVariants = mode.variants.map(function(variant) { + return inherit$1(mode, { variants: null }, variant); + }); + } + + // EXPAND + // if we have variants then essentially "replace" the mode with the variants + // this happens in compileMode, where this function is called from + if (mode.cachedVariants) { + return mode.cachedVariants; + } + + // CLONE + // if we have dependencies on parents then we need a unique + // instance of ourselves, so we can be reused with many + // different parents without issue + if (dependencyOnParent(mode)) { + return inherit$1(mode, { starts: mode.starts ? inherit$1(mode.starts) : null }); + } + + if (Object.isFrozen(mode)) { + return inherit$1(mode); + } + + // no special dependency issues, just return ourselves + return mode; + } + + var version = "11.3.1"; + + class HTMLInjectionError extends Error { + constructor(reason, html) { + super(reason); + this.name = "HTMLInjectionError"; + this.html = html; + } + } + + /* + Syntax highlighting with language autodetection. 
+ https://highlightjs.org/ + */ + + /** + @typedef {import('highlight.js').Mode} Mode + @typedef {import('highlight.js').CompiledMode} CompiledMode + @typedef {import('highlight.js').CompiledScope} CompiledScope + @typedef {import('highlight.js').Language} Language + @typedef {import('highlight.js').HLJSApi} HLJSApi + @typedef {import('highlight.js').HLJSPlugin} HLJSPlugin + @typedef {import('highlight.js').PluginEvent} PluginEvent + @typedef {import('highlight.js').HLJSOptions} HLJSOptions + @typedef {import('highlight.js').LanguageFn} LanguageFn + @typedef {import('highlight.js').HighlightedHTMLElement} HighlightedHTMLElement + @typedef {import('highlight.js').BeforeHighlightContext} BeforeHighlightContext + @typedef {import('highlight.js/private').MatchType} MatchType + @typedef {import('highlight.js/private').KeywordData} KeywordData + @typedef {import('highlight.js/private').EnhancedMatch} EnhancedMatch + @typedef {import('highlight.js/private').AnnotatedError} AnnotatedError + @typedef {import('highlight.js').AutoHighlightResult} AutoHighlightResult + @typedef {import('highlight.js').HighlightOptions} HighlightOptions + @typedef {import('highlight.js').HighlightResult} HighlightResult + */ + + + const escape = escapeHTML; + const inherit = inherit$1; + const NO_MATCH = Symbol("nomatch"); + const MAX_KEYWORD_HITS = 7; + + /** + * @param {any} hljs - object that is extended (legacy) + * @returns {HLJSApi} + */ + const HLJS = function(hljs) { + // Global internal variables used within the highlight.js library. + /** @type {Record} */ + const languages = Object.create(null); + /** @type {Record} */ + const aliases = Object.create(null); + /** @type {HLJSPlugin[]} */ + const plugins = []; + + // safe/production mode - swallows more errors, tries to keep running + // even if a single syntax or parse hits a fatal error + let SAFE_MODE = true; + const LANGUAGE_NOT_FOUND = "Could not find the language '{}', did you forget to load/include a language module?"; + /** @type {Language} */ + const PLAINTEXT_LANGUAGE = { disableAutodetect: true, name: 'Plain text', contains: [] }; + + // Global options used when within external APIs. This is modified when + // calling the `hljs.configure` function. + /** @type HLJSOptions */ + let options = { + ignoreUnescapedHTML: false, + throwUnescapedHTML: false, + noHighlightRe: /^(no-?highlight)$/i, + languageDetectRe: /\blang(?:uage)?-([\w-]+)\b/i, + classPrefix: 'hljs-', + cssSelector: 'pre code', + languages: null, + // beta configuration options, subject to change, welcome to discuss + // https://github.com/highlightjs/highlight.js/issues/1086 + __emitter: TokenTreeEmitter + }; + + /* Utility functions */ + + /** + * Tests a language name to see if highlighting should be skipped + * @param {string} languageName + */ + function shouldNotHighlight(languageName) { + return options.noHighlightRe.test(languageName); + } + + /** + * @param {HighlightedHTMLElement} block - the HTML element to determine language for + */ + function blockLanguage(block) { + let classes = block.className + ' '; + + classes += block.parentNode ? block.parentNode.className : ''; + + // language-* takes precedence over non-prefixed class names. + const match = options.languageDetectRe.exec(classes); + if (match) { + const language = getLanguage(match[1]); + if (!language) { + warn(LANGUAGE_NOT_FOUND.replace("{}", match[1])); + warn("Falling back to no-highlight mode for this block.", block); + } + return language ? 
match[1] : 'no-highlight'; + } + + return classes + .split(/\s+/) + .find((_class) => shouldNotHighlight(_class) || getLanguage(_class)); + } + + /** + * Core highlighting function. + * + * OLD API + * highlight(lang, code, ignoreIllegals, continuation) + * + * NEW API + * highlight(code, {lang, ignoreIllegals}) + * + * @param {string} codeOrLanguageName - the language to use for highlighting + * @param {string | HighlightOptions} optionsOrCode - the code to highlight + * @param {boolean} [ignoreIllegals] - whether to ignore illegal matches, default is to bail + * + * @returns {HighlightResult} Result - an object that represents the result + * @property {string} language - the language name + * @property {number} relevance - the relevance score + * @property {string} value - the highlighted HTML code + * @property {string} code - the original raw code + * @property {CompiledMode} top - top of the current mode stack + * @property {boolean} illegal - indicates whether any illegal matches were found + */ + function highlight(codeOrLanguageName, optionsOrCode, ignoreIllegals) { + let code = ""; + let languageName = ""; + if (typeof optionsOrCode === "object") { + code = codeOrLanguageName; + ignoreIllegals = optionsOrCode.ignoreIllegals; + languageName = optionsOrCode.language; + } else { + // old API + deprecated("10.7.0", "highlight(lang, code, ...args) has been deprecated."); + deprecated("10.7.0", "Please use highlight(code, options) instead.\nhttps://github.com/highlightjs/highlight.js/issues/2277"); + languageName = codeOrLanguageName; + code = optionsOrCode; + } + + // https://github.com/highlightjs/highlight.js/issues/3149 + // eslint-disable-next-line no-undefined + if (ignoreIllegals === undefined) { ignoreIllegals = true; } + + /** @type {BeforeHighlightContext} */ + const context = { + code, + language: languageName + }; + // the plugin can change the desired language or the code to be highlighted + // just be changing the object it was passed + fire("before:highlight", context); + + // a before plugin can usurp the result completely by providing it's own + // in which case we don't even need to call highlight + const result = context.result + ? 
context.result + : _highlight(context.language, context.code, ignoreIllegals); + + result.code = context.code; + // the plugin can change anything in result to suite it + fire("after:highlight", result); + + return result; + } + + /** + * private highlight that's used internally and does not fire callbacks + * + * @param {string} languageName - the language to use for highlighting + * @param {string} codeToHighlight - the code to highlight + * @param {boolean?} [ignoreIllegals] - whether to ignore illegal matches, default is to bail + * @param {CompiledMode?} [continuation] - current continuation mode, if any + * @returns {HighlightResult} - result of the highlight operation + */ + function _highlight(languageName, codeToHighlight, ignoreIllegals, continuation) { + const keywordHits = Object.create(null); + + /** + * Return keyword data if a match is a keyword + * @param {CompiledMode} mode - current mode + * @param {string} matchText - the textual match + * @returns {KeywordData | false} + */ + function keywordData(mode, matchText) { + return mode.keywords[matchText]; + } + + function processKeywords() { + if (!top.keywords) { + emitter.addText(modeBuffer); + return; + } + + let lastIndex = 0; + top.keywordPatternRe.lastIndex = 0; + let match = top.keywordPatternRe.exec(modeBuffer); + let buf = ""; + + while (match) { + buf += modeBuffer.substring(lastIndex, match.index); + const word = language.case_insensitive ? match[0].toLowerCase() : match[0]; + const data = keywordData(top, word); + if (data) { + const [kind, keywordRelevance] = data; + emitter.addText(buf); + buf = ""; + + keywordHits[word] = (keywordHits[word] || 0) + 1; + if (keywordHits[word] <= MAX_KEYWORD_HITS) relevance += keywordRelevance; + if (kind.startsWith("_")) { + // _ implied for relevance only, do not highlight + // by applying a class name + buf += match[0]; + } else { + const cssClass = language.classNameAliases[kind] || kind; + emitter.addKeyword(match[0], cssClass); + } + } else { + buf += match[0]; + } + lastIndex = top.keywordPatternRe.lastIndex; + match = top.keywordPatternRe.exec(modeBuffer); + } + buf += modeBuffer.substr(lastIndex); + emitter.addText(buf); + } + + function processSubLanguage() { + if (modeBuffer === "") return; + /** @type HighlightResult */ + let result = null; + + if (typeof top.subLanguage === 'string') { + if (!languages[top.subLanguage]) { + emitter.addText(modeBuffer); + return; + } + result = _highlight(top.subLanguage, modeBuffer, true, continuations[top.subLanguage]); + continuations[top.subLanguage] = /** @type {CompiledMode} */ (result._top); + } else { + result = highlightAuto(modeBuffer, top.subLanguage.length ? top.subLanguage : null); + } + + // Counting embedded language score towards the host language may be disabled + // with zeroing the containing mode relevance. Use case in point is Markdown that + // allows XML everywhere and makes every XML snippet to have a much larger Markdown + // score. 
+ if (top.relevance > 0) { + relevance += result.relevance; + } + emitter.addSublanguage(result._emitter, result.language); + } + + function processBuffer() { + if (top.subLanguage != null) { + processSubLanguage(); + } else { + processKeywords(); + } + modeBuffer = ''; + } + + /** + * @param {CompiledScope} scope + * @param {RegExpMatchArray} match + */ + function emitMultiClass(scope, match) { + let i = 1; + // eslint-disable-next-line no-undefined + while (match[i] !== undefined) { + if (!scope._emit[i]) { i++; continue; } + const klass = language.classNameAliases[scope[i]] || scope[i]; + const text = match[i]; + if (klass) { + emitter.addKeyword(text, klass); + } else { + modeBuffer = text; + processKeywords(); + modeBuffer = ""; + } + i++; + } + } + + /** + * @param {CompiledMode} mode - new mode to start + * @param {RegExpMatchArray} match + */ + function startNewMode(mode, match) { + if (mode.scope && typeof mode.scope === "string") { + emitter.openNode(language.classNameAliases[mode.scope] || mode.scope); + } + if (mode.beginScope) { + // beginScope just wraps the begin match itself in a scope + if (mode.beginScope._wrap) { + emitter.addKeyword(modeBuffer, language.classNameAliases[mode.beginScope._wrap] || mode.beginScope._wrap); + modeBuffer = ""; + } else if (mode.beginScope._multi) { + // at this point modeBuffer should just be the match + emitMultiClass(mode.beginScope, match); + modeBuffer = ""; + } + } + + top = Object.create(mode, { parent: { value: top } }); + return top; + } + + /** + * @param {CompiledMode } mode - the mode to potentially end + * @param {RegExpMatchArray} match - the latest match + * @param {string} matchPlusRemainder - match plus remainder of content + * @returns {CompiledMode | void} - the next mode, or if void continue on in current mode + */ + function endOfMode(mode, match, matchPlusRemainder) { + let matched = startsWith(mode.endRe, matchPlusRemainder); + + if (matched) { + if (mode["on:end"]) { + const resp = new Response(mode); + mode["on:end"](match, resp); + if (resp.isMatchIgnored) matched = false; + } + + if (matched) { + while (mode.endsParent && mode.parent) { + mode = mode.parent; + } + return mode; + } + } + // even if on:end fires an `ignore` it's still possible + // that we might trigger the end node because of a parent mode + if (mode.endsWithParent) { + return endOfMode(mode.parent, match, matchPlusRemainder); + } + } + + /** + * Handle matching but then ignoring a sequence of text + * + * @param {string} lexeme - string containing full match text + */ + function doIgnore(lexeme) { + if (top.matcher.regexIndex === 0) { + // no more regexes to potentially match here, so we move the cursor forward one + // space + modeBuffer += lexeme[0]; + return 1; + } else { + // no need to move the cursor, we still have additional regexes to try and + // match at this very spot + resumeScanAtSamePosition = true; + return 0; + } + } + + /** + * Handle the start of a new potential mode match + * + * @param {EnhancedMatch} match - the current match + * @returns {number} how far to advance the parse cursor + */ + function doBeginMatch(match) { + const lexeme = match[0]; + const newMode = match.rule; + + const resp = new Response(newMode); + // first internal before callbacks, then the public ones + const beforeCallbacks = [newMode.__beforeBegin, newMode["on:begin"]]; + for (const cb of beforeCallbacks) { + if (!cb) continue; + cb(match, resp); + if (resp.isMatchIgnored) return doIgnore(lexeme); + } + + if (newMode.skip) { + modeBuffer += lexeme; + } 
else { + if (newMode.excludeBegin) { + modeBuffer += lexeme; + } + processBuffer(); + if (!newMode.returnBegin && !newMode.excludeBegin) { + modeBuffer = lexeme; + } + } + startNewMode(newMode, match); + return newMode.returnBegin ? 0 : lexeme.length; + } + + /** + * Handle the potential end of mode + * + * @param {RegExpMatchArray} match - the current match + */ + function doEndMatch(match) { + const lexeme = match[0]; + const matchPlusRemainder = codeToHighlight.substr(match.index); + + const endMode = endOfMode(top, match, matchPlusRemainder); + if (!endMode) { return NO_MATCH; } + + const origin = top; + if (top.endScope && top.endScope._wrap) { + processBuffer(); + emitter.addKeyword(lexeme, top.endScope._wrap); + } else if (top.endScope && top.endScope._multi) { + processBuffer(); + emitMultiClass(top.endScope, match); + } else if (origin.skip) { + modeBuffer += lexeme; + } else { + if (!(origin.returnEnd || origin.excludeEnd)) { + modeBuffer += lexeme; + } + processBuffer(); + if (origin.excludeEnd) { + modeBuffer = lexeme; + } + } + do { + if (top.scope) { + emitter.closeNode(); + } + if (!top.skip && !top.subLanguage) { + relevance += top.relevance; + } + top = top.parent; + } while (top !== endMode.parent); + if (endMode.starts) { + startNewMode(endMode.starts, match); + } + return origin.returnEnd ? 0 : lexeme.length; + } + + function processContinuations() { + const list = []; + for (let current = top; current !== language; current = current.parent) { + if (current.scope) { + list.unshift(current.scope); + } + } + list.forEach(item => emitter.openNode(item)); + } + + /** @type {{type?: MatchType, index?: number, rule?: Mode}}} */ + let lastMatch = {}; + + /** + * Process an individual match + * + * @param {string} textBeforeMatch - text preceding the match (since the last match) + * @param {EnhancedMatch} [match] - the match itself + */ + function processLexeme(textBeforeMatch, match) { + const lexeme = match && match[0]; + + // add non-matched text to the current mode buffer + modeBuffer += textBeforeMatch; + + if (lexeme == null) { + processBuffer(); + return 0; + } + + // we've found a 0 width match and we're stuck, so we need to advance + // this happens when we have badly behaved rules that have optional matchers to the degree that + // sometimes they can end up matching nothing at all + // Ref: https://github.com/highlightjs/highlight.js/issues/2140 + if (lastMatch.type === "begin" && match.type === "end" && lastMatch.index === match.index && lexeme === "") { + // spit the "skipped" character that our regex choked on back into the output sequence + modeBuffer += codeToHighlight.slice(match.index, match.index + 1); + if (!SAFE_MODE) { + /** @type {AnnotatedError} */ + const err = new Error(`0 width match regex (${languageName})`); + err.languageName = languageName; + err.badRule = lastMatch.rule; + throw err; + } + return 1; + } + lastMatch = match; + + if (match.type === "begin") { + return doBeginMatch(match); + } else if (match.type === "illegal" && !ignoreIllegals) { + // illegal match, we do not continue processing + /** @type {AnnotatedError} */ + const err = new Error('Illegal lexeme "' + lexeme + '" for mode "' + (top.scope || '') + '"'); + err.mode = top; + throw err; + } else if (match.type === "end") { + const processed = doEndMatch(match); + if (processed !== NO_MATCH) { + return processed; + } + } + + // edge case for when illegal matches $ (end of line) which is technically + // a 0 width match but not a begin/end match so it's not caught by the + // first 
handler (when ignoreIllegals is true) + if (match.type === "illegal" && lexeme === "") { + // advance so we aren't stuck in an infinite loop + return 1; + } + + // infinite loops are BAD, this is a last ditch catch all. if we have a + // decent number of iterations yet our index (cursor position in our + // parsing) still 3x behind our index then something is very wrong + // so we bail + if (iterations > 100000 && iterations > match.index * 3) { + const err = new Error('potential infinite loop, way more iterations than matches'); + throw err; + } + + /* + Why might be find ourselves here? An potential end match that was + triggered but could not be completed. IE, `doEndMatch` returned NO_MATCH. + (this could be because a callback requests the match be ignored, etc) + + This causes no real harm other than stopping a few times too many. + */ + + modeBuffer += lexeme; + return lexeme.length; + } + + const language = getLanguage(languageName); + if (!language) { + error(LANGUAGE_NOT_FOUND.replace("{}", languageName)); + throw new Error('Unknown language: "' + languageName + '"'); + } + + const md = compileLanguage(language); + let result = ''; + /** @type {CompiledMode} */ + let top = continuation || md; + /** @type Record */ + const continuations = {}; // keep continuations for sub-languages + const emitter = new options.__emitter(options); + processContinuations(); + let modeBuffer = ''; + let relevance = 0; + let index = 0; + let iterations = 0; + let resumeScanAtSamePosition = false; + + try { + top.matcher.considerAll(); + + for (;;) { + iterations++; + if (resumeScanAtSamePosition) { + // only regexes not matched previously will now be + // considered for a potential match + resumeScanAtSamePosition = false; + } else { + top.matcher.considerAll(); + } + top.matcher.lastIndex = index; + + const match = top.matcher.exec(codeToHighlight); + // console.log("match", match[0], match.rule && match.rule.begin) + + if (!match) break; + + const beforeMatch = codeToHighlight.substring(index, match.index); + const processedCount = processLexeme(beforeMatch, match); + index = match.index + processedCount; + } + processLexeme(codeToHighlight.substr(index)); + emitter.closeAllNodes(); + emitter.finalize(); + result = emitter.toHTML(); + + return { + language: languageName, + value: result, + relevance: relevance, + illegal: false, + _emitter: emitter, + _top: top + }; + } catch (err) { + if (err.message && err.message.includes('Illegal')) { + return { + language: languageName, + value: escape(codeToHighlight), + illegal: true, + relevance: 0, + _illegalBy: { + message: err.message, + index: index, + context: codeToHighlight.slice(index - 100, index + 100), + mode: err.mode, + resultSoFar: result + }, + _emitter: emitter + }; + } else if (SAFE_MODE) { + return { + language: languageName, + value: escape(codeToHighlight), + illegal: false, + relevance: 0, + errorRaised: err, + _emitter: emitter, + _top: top + }; + } else { + throw err; + } + } + } + + /** + * returns a valid highlight result, without actually doing any actual work, + * auto highlight starts with this and it's possible for small snippets that + * auto-detection may not find a better match + * @param {string} code + * @returns {HighlightResult} + */ + function justTextHighlightResult(code) { + const result = { + value: escape(code), + illegal: false, + relevance: 0, + _top: PLAINTEXT_LANGUAGE, + _emitter: new options.__emitter(options) + }; + result._emitter.addText(code); + return result; + } + + /** + Highlighting with language 
detection. Accepts a string with the code to + highlight. Returns an object with the following properties: + + - language (detected language) + - relevance (int) + - value (an HTML string with highlighting markup) + - secondBest (object with the same structure for second-best heuristically + detected language, may be absent) + + @param {string} code + @param {Array} [languageSubset] + @returns {AutoHighlightResult} + */ + function highlightAuto(code, languageSubset) { + languageSubset = languageSubset || options.languages || Object.keys(languages); + const plaintext = justTextHighlightResult(code); + + const results = languageSubset.filter(getLanguage).filter(autoDetection).map(name => + _highlight(name, code, false) + ); + results.unshift(plaintext); // plaintext is always an option + + const sorted = results.sort((a, b) => { + // sort base on relevance + if (a.relevance !== b.relevance) return b.relevance - a.relevance; + + // always award the tie to the base language + // ie if C++ and Arduino are tied, it's more likely to be C++ + if (a.language && b.language) { + if (getLanguage(a.language).supersetOf === b.language) { + return 1; + } else if (getLanguage(b.language).supersetOf === a.language) { + return -1; + } + } + + // otherwise say they are equal, which has the effect of sorting on + // relevance while preserving the original ordering - which is how ties + // have historically been settled, ie the language that comes first always + // wins in the case of a tie + return 0; + }); + + const [best, secondBest] = sorted; + + /** @type {AutoHighlightResult} */ + const result = best; + result.secondBest = secondBest; + + return result; + } + + /** + * Builds new class name for block given the language name + * + * @param {HTMLElement} element + * @param {string} [currentLang] + * @param {string} [resultLang] + */ + function updateClassName(element, currentLang, resultLang) { + const language = (currentLang && aliases[currentLang]) || resultLang; + + element.classList.add("hljs"); + element.classList.add(`language-${language}`); + } + + /** + * Applies highlighting to a DOM node containing code. + * + * @param {HighlightedHTMLElement} element - the HTML element to highlight + */ + function highlightElement(element) { + /** @type HTMLElement */ + let node = null; + const language = blockLanguage(element); + + if (shouldNotHighlight(language)) return; + + fire("before:highlightElement", + { el: element, language: language }); + + // we should be all text, no child nodes (unescaped HTML) - this is possibly + // an HTML injection attack - it's likely too late if this is already in + // production (the code has likely already done its damage by the time + // we're seeing it)... but we yell loudly about this so that hopefully it's + // more likely to be caught in development before making it to production + if (element.children.length > 0) { + if (!options.ignoreUnescapedHTML) { + console.warn("One of your code blocks includes unescaped HTML. This is a potentially serious security risk."); + console.warn("https://github.com/highlightjs/highlight.js/issues/2886"); + console.warn(element); + } + if (options.throwUnescapedHTML) { + const err = new HTMLInjectionError( + "One of your code blocks includes unescaped HTML.", + element.innerHTML + ); + throw err; + } + } + + node = element; + const text = node.textContent; + const result = language ? 
highlight(text, { language, ignoreIllegals: true }) : highlightAuto(text); + + element.innerHTML = result.value; + updateClassName(element, language, result.language); + element.result = { + language: result.language, + // TODO: remove with version 11.0 + re: result.relevance, + relevance: result.relevance + }; + if (result.secondBest) { + element.secondBest = { + language: result.secondBest.language, + relevance: result.secondBest.relevance + }; + } + + fire("after:highlightElement", { el: element, result, text }); + } + + /** + * Updates highlight.js global options with the passed options + * + * @param {Partial} userOptions + */ + function configure(userOptions) { + options = inherit(options, userOptions); + } + + // TODO: remove v12, deprecated + const initHighlighting = () => { + highlightAll(); + deprecated("10.6.0", "initHighlighting() deprecated. Use highlightAll() now."); + }; + + // TODO: remove v12, deprecated + function initHighlightingOnLoad() { + highlightAll(); + deprecated("10.6.0", "initHighlightingOnLoad() deprecated. Use highlightAll() now."); + } + + let wantsHighlight = false; + + /** + * auto-highlights all pre>code elements on the page + */ + function highlightAll() { + // if we are called too early in the loading process + if (document.readyState === "loading") { + wantsHighlight = true; + return; + } + + const blocks = document.querySelectorAll(options.cssSelector); + blocks.forEach(highlightElement); + } + + function boot() { + // if a highlight was requested before DOM was loaded, do now + if (wantsHighlight) highlightAll(); + } + + // make sure we are in the browser environment + if (typeof window !== 'undefined' && window.addEventListener) { + window.addEventListener('DOMContentLoaded', boot, false); + } + + /** + * Register a language grammar module + * + * @param {string} languageName + * @param {LanguageFn} languageDefinition + */ + function registerLanguage(languageName, languageDefinition) { + let lang = null; + try { + lang = languageDefinition(hljs); + } catch (error$1) { + error("Language definition for '{}' could not be registered.".replace("{}", languageName)); + // hard or soft error + if (!SAFE_MODE) { throw error$1; } else { error(error$1); } + // languages that have serious errors are replaced with essentially a + // "plaintext" stand-in so that the code blocks will still get normal + // css classes applied to them - and one bad language won't break the + // entire highlighter + lang = PLAINTEXT_LANGUAGE; + } + // give it a temporary name if it doesn't have one in the meta-data + if (!lang.name) lang.name = languageName; + languages[languageName] = lang; + lang.rawDefinition = languageDefinition.bind(null, hljs); + + if (lang.aliases) { + registerAliases(lang.aliases, { languageName }); + } + } + + /** + * Remove a language grammar module + * + * @param {string} languageName + */ + function unregisterLanguage(languageName) { + delete languages[languageName]; + for (const alias of Object.keys(aliases)) { + if (aliases[alias] === languageName) { + delete aliases[alias]; + } + } + } + + /** + * @returns {string[]} List of language internal names + */ + function listLanguages() { + return Object.keys(languages); + } + + /** + * @param {string} name - name of the language to retrieve + * @returns {Language | undefined} + */ + function getLanguage(name) { + name = (name || '').toLowerCase(); + return languages[name] || languages[aliases[name]]; + } + + /** + * + * @param {string|string[]} aliasList - single alias or list of aliases + * @param 
{{languageName: string}} opts + */ + function registerAliases(aliasList, { languageName }) { + if (typeof aliasList === 'string') { + aliasList = [aliasList]; + } + aliasList.forEach(alias => { aliases[alias.toLowerCase()] = languageName; }); + } + + /** + * Determines if a given language has auto-detection enabled + * @param {string} name - name of the language + */ + function autoDetection(name) { + const lang = getLanguage(name); + return lang && !lang.disableAutodetect; + } + + /** + * Upgrades the old highlightBlock plugins to the new + * highlightElement API + * @param {HLJSPlugin} plugin + */ + function upgradePluginAPI(plugin) { + // TODO: remove with v12 + if (plugin["before:highlightBlock"] && !plugin["before:highlightElement"]) { + plugin["before:highlightElement"] = (data) => { + plugin["before:highlightBlock"]( + Object.assign({ block: data.el }, data) + ); + }; + } + if (plugin["after:highlightBlock"] && !plugin["after:highlightElement"]) { + plugin["after:highlightElement"] = (data) => { + plugin["after:highlightBlock"]( + Object.assign({ block: data.el }, data) + ); + }; + } + } + + /** + * @param {HLJSPlugin} plugin + */ + function addPlugin(plugin) { + upgradePluginAPI(plugin); + plugins.push(plugin); + } + + /** + * + * @param {PluginEvent} event + * @param {any} args + */ + function fire(event, args) { + const cb = event; + plugins.forEach(function(plugin) { + if (plugin[cb]) { + plugin[cb](args); + } + }); + } + + /** + * DEPRECATED + * @param {HighlightedHTMLElement} el + */ + function deprecateHighlightBlock(el) { + deprecated("10.7.0", "highlightBlock will be removed entirely in v12.0"); + deprecated("10.7.0", "Please use highlightElement now."); + + return highlightElement(el); + } + + /* Interface definition */ + Object.assign(hljs, { + highlight, + highlightAuto, + highlightAll, + highlightElement, + // TODO: Remove with v12 API + highlightBlock: deprecateHighlightBlock, + configure, + initHighlighting, + initHighlightingOnLoad, + registerLanguage, + unregisterLanguage, + listLanguages, + getLanguage, + registerAliases, + autoDetection, + inherit, + addPlugin + }); + + hljs.debugMode = function() { SAFE_MODE = false; }; + hljs.safeMode = function() { SAFE_MODE = true; }; + hljs.versionString = version; + + hljs.regex = { + concat: concat, + lookahead: lookahead, + either: either, + optional: optional, + anyNumberOfTimes: anyNumberOfTimes + }; + + for (const key in MODES) { + // @ts-ignore + if (typeof MODES[key] === "object") { + // @ts-ignore + deepFreeze$1(MODES[key]); + } + } + + // merge all the modes/regexes into our main object + Object.assign(hljs, MODES); + + return hljs; + }; + + // export an "instance" of the highlighter + var highlight = HLJS({}); + + return highlight; + +})(); +if (typeof exports === 'object' && typeof module !== 'undefined') { module.exports = hljs; } diff --git a/book/llvm.min.js b/book/llvm.min.js new file mode 100644 index 0000000000..d9a279c4c3 --- /dev/null +++ b/book/llvm.min.js @@ -0,0 +1,19 @@ +/*! 
`llvm` grammar compiled for Highlight.js 11.3.1 */ + function concat(...args) { + const joined = args.map((x) => source(x)).join(""); + return joined; + } + +(()=>{var e=(()=>{"use strict";return e=>{ +const a=e.regex,n=/([-a-zA-Z$._][\w$.-]*)/,t={className:"variable",variants:[{ +begin:a.concat(/%/,n)},{begin:/%\d+/},{begin:/#\d+/}]},i={className:"title", +variants:[{begin:a.concat(/@/,n)},{begin:/@\d+/},{begin:a.concat(/!/,n)},{ +begin:a.concat(/!\d+/,n)},{begin:/!\d+/}]};return{name:"LLVM IR", +keywords:"begin end true false declare define global constant private linker_private internal available_externally linkonce linkonce_odr weak weak_odr appending dllimport dllexport common default hidden protected extern_weak external thread_local zeroinitializer undef null to tail target triple datalayout volatile nuw nsw nnan ninf nsz arcp fast exact inbounds align addrspace section alias module asm sideeffect gc dbg linker_private_weak attributes blockaddress initialexec localdynamic localexec prefix unnamed_addr ccc fastcc coldcc x86_stdcallcc x86_fastcallcc arm_apcscc arm_aapcscc arm_aapcs_vfpcc ptx_device ptx_kernel intel_ocl_bicc msp430_intrcc spir_func spir_kernel x86_64_sysvcc x86_64_win64cc x86_thiscallcc cc c signext zeroext inreg sret nounwind noreturn noalias nocapture byval nest readnone readonly inlinehint noinline alwaysinline optsize ssp sspreq noredzone noimplicitfloat naked builtin cold nobuiltin noduplicate nonlazybind optnone returns_twice sanitize_address sanitize_memory sanitize_thread sspstrong uwtable returned type opaque eq ne slt sgt sle sge ult ugt ule uge oeq one olt ogt ole oge ord uno ueq une x acq_rel acquire alignstack atomic catch cleanup filter inteldialect max min monotonic nand personality release seq_cst singlethread umax umin unordered xchg add fadd sub fsub mul fmul udiv sdiv fdiv urem srem frem shl lshr ashr and or xor icmp fcmp phi call trunc zext sext fptrunc fpext uitofp sitofp fptoui fptosi inttoptr ptrtoint bitcast addrspacecast select va_arg ret br switch invoke unwind unreachable indirectbr landingpad resume malloc alloca free load store getelementptr extractelement insertelement shufflevector getresult extractvalue insertvalue atomicrmw cmpxchg fence argmemonly double", +contains:[{className:"type",begin:/\bi\d+(?=\s|\b)/},e.COMMENT(/;\s*$/,null,{ +relevance:0}),e.COMMENT(/;/,/$/),e.QUOTE_STRING_MODE,{className:"string", +variants:[{begin:/"/,end:/[^\\]"/}]},i,{className:"punctuation",relevance:0, +begin:/,/},{className:"operator",relevance:0,begin:/=/},t,{className:"symbol", +variants:[{begin:/^\s*[a-z]+:/}],relevance:0},{className:"number",variants:[{ +begin:/0[xX][a-fA-F0-9]+/},{begin:/-?\d+(?:[.]\d+)?(?:[eE][-+]?\d+(?:[.]\d+)?)?/ +}],relevance:0}]}}})();hljs.registerLanguage("llvm",e)})(); \ No newline at end of file diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md index 05c522095f..ce863ff4be 100644 --- a/book/src/SUMMARY.md +++ b/book/src/SUMMARY.md @@ -12,3 +12,9 @@ - [Variables](./variables.md) - [Datatypes](./datatypes.md) - [Direct Bit Access](./direct_variables.md) +- [Architecture](./arch/architecture.md) + - [Parser](./arch/parser.md) + - [Indexer & Symbol-Table](./arch/indexer.md) + - [Linker](./arch/linker.md) + - [Validation](./arch/validation.md) + - [Codegen](./arch/codegen.md) diff --git a/book/src/arch/architecture.md b/book/src/arch/architecture.md new file mode 100644 index 0000000000..e827deb099 --- /dev/null +++ b/book/src/arch/architecture.md @@ -0,0 +1,39 @@ +# Architecture + +## Overview + +Rusty is a Compiler for 
Structured Text. It utilizes the llvm compiler infrastructurue and contributes a [Structured Text](https://en.wikipedia.org/wiki/Structured_text) Frontend that translates Structured Text into llvm's language independent intermediate representatin (IR). The Further optimization and native code generation is performed by the existing LLVM infrastructure, namely llvm's common optimizer and the platform specific backend (see [here](https://www.aosabook.org/en/llvm.html)). + + +```ignore + ┌──────────────────┐ ┌───────────────┐ ┌────────────────┐ + │ │ │ │ │ │ + │ Rusty │ │ LLVM Common │ │ LLVM Backend │ + │ ├───►│ ├───►│ │ + │ LLVM Frontend │ │ Optimizer │ │ (e.g Clang) │ + │ │ │ │ │ │ + └──────────────────┘ └───────────────┘ └────────────────┘ +``` + +So Rusty consists of the frontend part of the llvm compiler-infrastructure. This means that this compiler can benefit from llvm's existing compiler-optimizations, as well as all backend target platforms available. + +## Rusty Frontend Architecture + +Ultimately the goal of a compiler frontend, is to translate the original source code into the infrastructure's intermediate representation (in this case we're talking about [LLVM IR](https://llvm.org/docs/LangRef.html)). Rusty treats this task as a compilation step of its own. While a fully fledged compiler generates machine code as a last step, rusty generates LLVM IR assembly code. + +```ignore + ┌────────┐ ┌────────┐ + │ Source │ │ LLVM │ + │ │ │ IR │ + │ Files │ │ │ + └───┬────┘ └────────┘ + │ ▲ + ▼ │ + ┌────────────┐ ┌────────────┐ ┌────────────┐ ┌────────────┐ ┌──────┴─────┐ + │ │ │ │ │ │ │ │ │ │ + │ │ │ │ │ │ │ │ │ │ + │ Parser ├──►│ Indexer ├──►│ Linker ├──►│ Validation ├──►│ Codegen │ + │ │ │ │ │ │ │ │ │ │ + │ │ │ │ │ │ │ │ │ │ + └────────────┘ └────────────┘ └────────────┘ └────────────┘ └────────────┘ +``` diff --git a/book/src/arch/codegen.md b/book/src/arch/codegen.md new file mode 100644 index 0000000000..86c9fcf25f --- /dev/null +++ b/book/src/arch/codegen.md @@ -0,0 +1,146 @@ +# Code-Generation +The codegen module contains all code that turns the parsed and verified code represented as an AST into [llvm-ir](https://llvm.org/docs/LangRef.html) code. To generate the *IR* we use a crate that wraps the native llvm [C-API](https://github.com/TheDan64/inkwell). + +The code-generator is basically a transformation from the ST-AST into an IR-Tree representation. Therefore the AST is traversed ina visitor-like way and transformed simultaneously. The code generation is split into specialized sub-generators for different tasks: + +| Generator | Responsibilities | +|--------------------|------------------| +| pou_generator | The pou-generator takes care of generating the programming organization units (Programs, FunctionBlocks, Functions) including their signature and body. More specialized tasks are delegated to other generators. | +| data_type_generator | Generates complex datatypes like Structs, Arrays, Enums, Strings, etc. | +| variable_generator | Generates global variables and their initialization | +| statement_generator | Generates everything of the body of a POU except expressions. Non-expressions include: IFs, Loops, Assignments, etc. | +| expression_generator | Generates expressions (everything that *possibly* resolves to a value) including: call-statements, references, array-access, etc. | + +## Generating POUs +Generating POUs (Programs, Function-Blocks, Functions) must generate the POU's body itself, as well as the POU's interface (or state) variables. 
In this segment we focus on generating the interface for a POU. Further information about generating a POU's body can be found [here]. +### Programs +A program is *static* POU with some code attached. This means that there is exactly one instance. So wherever from it is called, every caller uses the exact same instance which means that you may see the residuals of the laster caller in the program's variables when you call it yourself. + +```iecst +PROGRAM prg + VAR + x : DINT; + y : DINT; + END_VAR + +END_PROGRAM +``` + +The program's interface is persistent across calls, so we store it in a global variable. Therefore the code-generator creates a dedicated struct-type called `prg_interface`. A global variable called `prg_instance` is generated to store the program's state across calls. This global instance variable is passed as a `this` pointer to calls to the `prg` function. + +```llvm +%prg_interface = type { i32, i32 } + +@prg_instance = global %prg_interface zeroinitializer + +define void @prg(%prg_interface* %this) { +entry: + ret void +} +``` + +### FunctionBlocks +A FunctionBlock is an POU that is instantiated in a declaration. So in contrast to Programs, a FunctionBlock can have multiple instances. Nevertheless the code-generator uses a very similar strategy. A struct-type for the FunctionBlock's interface is created but no global instance-variable is allocated. Instead the function block can be used as a DataType to declare instances like in the following example: + +```iecst +FUNCTION_BLOCK foo + VAR_INPUT + x, y : INT; + END_VAR +END_FUNCTION_BLOCK + +PROGRAM prg + VAR + f : foo; + END_VAR +END_PROGRAM +``` + +So for the given example, we see the code-generator creating a type for the FunctionBlock's state (`foo_interface`). The declared instance of foo, in `prg's` interface is seen in the program's generated interface struct-type (`prg_interface`). + +```llvm +; ModuleID = 'main' +source_filename = "main" + +%prg_interface = type { %foo_interface } +%foo_interface = type { i16, i16 } + +@prg_instance = global %prg_interface zeroinitializer + +define void @foo(%foo_interface* %0) { +entry: + ret void +} + +define void @prg(%prg_interface* %0) { +entry: + ret void +} +``` + +### Functions +Functions generate very similar to program's and function_block's. The main difference is, that no instance-global is allocated and the function's interface-type cannot be used as a datatype to declare your own instances. Instances of the program's interface-type are allocated whenever the function is called for the lifetime of a single call. Otherwise the code generated for functions is comparable to the code presented above for programs and function-blocks. + +## Generating Data Types +IEC61131-3 languages offer a wide range of data types. Next to the built-in intrinsic data types, we support following user defined data types: +### Range Types +For range types we don't generate special code. Internally the new data type just becomes an alias for the derived type. +### Pointer Types +For pointer types we don't generate special code. Internally the new data type just becomes an alias for the pointer-type. +### Struct Types +Struct types translate direclty to llvm struct datatypes. We generate a new datatype with the user-type's name for the struct. +```iecst +TYPE MyStruct: + STRUCT + a: DINT; + b: INT; + END_STRUCT +END_TYPE +``` +This struct simply generates a llvm struct type: +```llvm +%MyStruct = type { i32, i16 } +``` +### Enum Types +Enumerations are represented as `DINT`. 
+```iecst +TYPE MyEnum: (red, yellow, green); +END_TYPE +``` +For every enum's element we generate a global variable with the element's value. +```llvm +@red = global i32 0 +@yellow = global i32 1 +@green = global i32 2 +``` + +### Array Types +Array types are generated as fixed sized llvm vector types - note that Array types must be fixed sized in *ST*: +```iecst +TYPE MyArray: ARRAY[0..9] OF INT; +END_TYPE + +VAR_GLOBAL + x : MyArray; + y : ARRAY[0..5] OF REAL; +END_VAR +``` +Custom array data types are not reflected as dedicated types on the llvm-level. +```llvm +@x = global [10 x i16] zeroinitializer +@y = global [6 x float] zeroinitializer +``` +### String Types +String types are generated as fixed sized vector types. +```iecst +VAR_GLOBAL + str : STRING[20]; + wstr : WSTRING[20]; +END_VAR +``` +Strings can be represented in two different encodings: *UTF-8 (STRING)* or *UTF-16 (WSTRING)*. +```llvm +@str = global [21 x i8] zeroinitializer +@wstr = global [21 x i16] zeroinitializer +``` + diff --git a/book/src/arch/indexer.md b/book/src/arch/indexer.md new file mode 100644 index 0000000000..6401a92ac8 --- /dev/null +++ b/book/src/arch/indexer.md @@ -0,0 +1,108 @@ +# Indexer + +The indexing step is responsible of building and maintaining the Symbol-Table (also called *Index*). The *Index* contains all known referable objects such as *variables*, *data-types*, *POUs*, *Functions*, etc. The Symbol-Table also maintains additional information about every referable object such as: the object's type, the objects' datatype, etc. + +Indexing is performed by the *index* module. It contains the index itself (a.k.a. Symbol Table), the *visitor* which collects all global names and their additional information as well as a data structure that handles compile time constant expressions (*constant_expressions*). + +## The Index (Symbol Table) +The index stores information about all referable elements of the program. Depending on the type of element, we store different meta-information alongside the name of the element. + +| Index Field | Description | +|---------------------------|------------------------------------------| +| global_variables | All global variables accessible via their name. | +| enum_global_variables | All enum elements accessible via their name (as if they were global variables, e.g. 'RED') | +| enum_qualified_variables | All enum elements accessible via their qualified name (e.g. 'COLOR.RED'). | +| member_variables | Member variables of structured types (Structs,Functionblocks, etc. This map allows to query all members of a container by name.) | +| implementations | All callable implementations (Programs, Functions, Actions, Functionblocks) accessible by their name. | +| type_index | All data-types (intrinsic and complex) accessible via their name | +| constant_expressions | The results of constant expressions that can be evaluated at compile time (e.g. the initializer of a constant: `VAR_GLOBAL CONST TAU := 3.1415 * 2; END_VAR`) | + +There are 3 different type of entries in the index: +- **VariableIndexEntry** +The VariableIndexEntry holds information about every *Variable* in the source code and offers additional information relevant for linking, validation and code-generation. 
+ +```ignore + ┌─────────────────────────────┐ ┌─────────────────┐ + │ VariableIndexEntry │ │ │ + │ │ │ VariableType │ + ├─────────────────────────────┤ var_type ├─────────────────┤ + │ │ │ - Local │ + │ - name: String ├─────────────►│ - Temp │ + │ - qualified_name: String │ │ - Input │ + │ - is_constant: bool │ │ - Output │ + │ - location_in_parent: u32 │ │ - InOut │ + │ - data_type_name: String │ │ - Global │ + │ │ │ - Return │ + └───────────┬─────────────────┘ └─────────────────┘ + │ + │initial_value + │ + │ + │ ┌──────────────────┐ + │ │ ConstExpression │ + │ 0..1 ├──────────────────┤ + └───────────►│ │ + │ ... │ + │ │ + └──────────────────┘ +``` + +- **ImplementationIndexEntry** +The ImplementationIndexEntry offers information about any callable implementation (Program, Functionblock, Function, etc.). It also offers metadata about the implementation type, the name of the method to call and the name of the parameter-struct (this-struct) to pass to the function. + +```ignore + ┌───────────────────────┐ + ┌──────────────────────────┐ │ │ + │ ImplementationIndexEntry │ │ ImplementationType │ + ├──────────────────────────┤ type │ │ + │ ├─────────────►├───────────────────────┤ + │ - call_name: String │ │ - Program │ + │ - type_name: String │ │ - Function │ + │ │ │ - FunctionBlock │ + └──────────────────────────┘ │ - Action │ + │ - Class │ + │ - Method │ + │ │ + └───────────────────────┘ +``` + +- **DataType** +The entry for a DataType offers information about any data-type supported by the program to be compiled (internal data types as well as user defined data types). For each data-type we offer additional information such as it's initial value, its type-nature (in terms of generic functions - e.g: ANY_INT) and some additional information about the type's internal structure and size (e.g. is it a number/array/struct/etc). + +```ignore + ┌─────────────┐ ┌────────────────────┐ + │ DataType │ │ ConstantExpression │ + ├─────────────┤ initial_value ├────────────────────┤ + │ ├──────────────────►│ │ + │ - name │ │ ... │ + │ ├─────────┐ │ │ + └──────┬──────┘ │ └────────────────────┘ + │ │ + │ │ ┌────────────────────┐ + │ │ │ TypeNature │ + │ │ ├────────────────────┤ + │ information │ │ - Any │ + │ └────────►│ - Derived │ + │ nature │ - Elementary │ + │ │ - Num │ + ▼ │ - Int │ + ┌───────────────────────┐ │ - Signed │ + │ │ │ - ... │ + │ DataTypeInformation │ └────────────────────┘ + ├───────────────────────┤ + │ │ + └───────────────────────┘ + ▲ + │ + │ + │ + ┌────────────────┬───────┴───────┬──────────────┬──────────────┐ + │ │ │ │ │ +┌────────┴───────┐ ┌──────┴──────┐ ┌──────┴─────┐ ┌─────┴──────┐ ┌────┴─────┐ +│ Struct │ │ Array │ │ Integer │ │ String │ │ ... │ +├────────────────┤ ├─────────────┤ ├────────────┤ ├────────────┤ ├──────────┤ +│ - name │ │- name │ │ - name │ │ - size │ │ ... │ +│ - member_names │ │- inner_type │ │ - signed │ │ - encoding │ │ │ +│ │ │- dimensions │ │ - size │ │ │ │ │ +└────────────────┘ └─────────────┘ └────────────┘ └────────────┘ └──────────┘ +``` diff --git a/book/src/arch/linker.md b/book/src/arch/linker.md new file mode 100644 index 0000000000..30074844cc --- /dev/null +++ b/book/src/arch/linker.md @@ -0,0 +1,287 @@ +# Linker +The linker's task is to decide where all references in the source code point to. There are different references in Structured Text: +- variable references +`x := 4` where *x* is a reference to the variable x. 
+- type references +`i : MyFunctionBlock` where *MyFunctionBlock* is a reference to the declared FunctionBlock +- Program references +`PLC_PRG.x := 4` where *PLC_PRG* is a reference to a Program-POU called *PLC_PRG* +- Function references +`max(a, b)` where *max* is a reference to a Function-POU called *max*. + +So the linker decides where a reference points to. A reference has a corresponding declaration that matches the reference's name: +```iecst + PROGRAM PLC_PRG + VAR + + ┌──────► x : INT; + │ + │ END_VAR + │ + └────┐ + │ + x := 3; + END_PROGRAM +``` + +The linker's results will be used by the semantic validation step and by the code-generation. + +The validator decides whether the name you put at a certain location is valid or not. In order to decide whether a certain reference is valid or not, we need to know where it is pointing to, so whether we expect a variable, a datatype or something different. + +The code-generation needs to know what certain names mean, in order to successfully generate the IR-code that reflects the behavior of your program. + +## Annotated Syntax Tree +The AST generated by the parser is a pretty static data-structure. So where should we store the linking information for a reference? Even if we would add fields for potential linking-information to the AST, the ownership concepts of Rust would give us a hard time to fill this information piece by piece during linking. So what we end up doing, is to use the [arena-pattern](https://en.wikipedia.org/wiki/Region-based_memory_management) to handle the different lifetimes of the parts of an AST (the AST itself is constructed very early in the compilation process, where the linking information is allocated later). We don't store the linking information directly in the AST, but we store it inside the mentioned arena-data-structure and link it with certain AST-elements. + +The rusty linker stores the linking information in an arena called AnnotationMap. The AnnotationMap can store two type of annotations for any AST-element. So the first step is that we need a way to uniquely identify every single AST-node so we can use this ID as a key for the annotations stored in the AnnotationMap to automatically associate it with the given AST-Node. The parser assigns a unique ID to every Statement-Tree-Node (Note that we only assign IDs to Statements, not every AST-Node). + +So the expression `a + 3` now looks like this: + +```ignore + ┌─────────────────┐ + │ BinaryOperation │ + ├─────────────────┤ + │ operator: Plus │ + │ ID: 1 │ + └──────┬──┬───────┘ + │ │ + left │ │ right + ┌───────────┘ └──────────┐ + │ │ + │ │ + ▼ ▼ + ┌──────────────────┐ ┌──────────────────┐ + │ Reference │ │ LiteralInteger │ + ├──────────────────┤ ├──────────────────┤ + │ name: 'a' │ │ value: '3' │ + │ ID: 2 │ │ ID: 3 │ + └──────────────────┘ └──────────────────┘ +``` + +The AnnotationMap stores 5 different types of annotation: +- `Value` +The Value-annotation indicates that this AST-Element resolves to a value with the given resulting datatype. So for Example the LiteralInteger(3) node gets a Value-Annotation with a resulting type of `DINT`. +```ignore + ┌─────────────────────────┐ + │ Value │ + ├─────────────────────────┤ + │ │ + │ resulting_type: String │ + │ │ + └─────────────────────────┘ +``` + +- `Variable` +The Variable-annotation indicates that this AST-Element resolves to a variable with the given qualified name (and some comfort-information like whether it is a constant and whether it is an auto-deref pointer). 
Similar to the value-Annotation it also saves the resulting datatype. +```ignore + ┌─────────────────────────┐ + │ Variable │ + ├─────────────────────────┤ + │ │ + │ resulting_type: String │ + │ qualified_name: String │ + │ constant: bool │ + │ is_auto_deref: bool │ + │ │ + └─────────────────────────┘ +``` + +- `Function` +The Function-annotation indicates that this AST-Element resolves to a Function-POU (a call-statement) with the given qualified name. Similar to the value-Annotation it also saves the resulting datatype but this time as the function's return type (*return_type*). +```ignore + ┌─────────────────────────┐ + │ Function │ + ├─────────────────────────┤ + │ │ + │ return_type: String │ + │ qualified_name: String │ + │ │ + └─────────────────────────┘ +``` + +- `Type` +The Type-annotation indicates that this AST-Element resolves to a DataType (e.g. a Declaration: `x: INT`) with the given name. + +```ignore + ┌─────────────────────────┐ + │ Type │ + ├─────────────────────────┤ + │ │ + │ type_name : String │ + │ │ + └─────────────────────────┘ +``` + +- `Program` +The Program-annotation is very similar to the Function-annotation. Since a Program has no return-value it also offers no return-type information. +```ignore + ┌─────────────────────────┐ + │ Program │ + ├─────────────────────────┤ + │ │ + │ qualified_name: String │ + │ │ + └─────────────────────────┘ +``` + +So the example expression from above `a + 3* will be annotated like this: Note that the resulting type of the Binary-Operation must be calculated by the linker by determining the bigger of both types. + +```ignore + ┌─────────────────┐ + │ BinaryOperation │ + ├─────────────────┤ + │ operator: Plus │ + │ ID: 1 │ + └──────┬──┬───────┘ + │ │ + left │ │ right + ┌───────────┘ └──────────┐ + │ │ + │ │ + ▼ ▼ + ┌──────────────────┐ ┌──────────────────┐ + │ Reference │ │ LiteralInteger │ + ├──────────────────┤ ├──────────────────┤ + │ name: 'a' │ │ value: '3' │ + │ ID: 2 │ │ ID: 3 │ + └──────────────────┘ └──────────────────┘ + + + + ┌────────────────────────────┐ + │ Value │ +┌───────────────────┐ ├────────────────────────────┤ +│ AnnotationMap │ ┌───►│ resulting_type: DINT │ +│ │ │ │ │ +├───────┬───────────┤ │ └────────────────────────────┘ +│ ID: 1 │ Value ├───┘ +├───────┼───────────┤ ┌────────────────────────────┐ +│ ID: 2 │ Variable ├────┐ │ Variable │ +├───────┼───────────┤ │ ├────────────────────────────┤ +│ ID: 3 │ Value ├──┐ │ │ resulting_type: SINT │ +└───────┴───────────┘ │ └──►│ qualified_name: PLC_PRG.a │ + │ │ constant: false │ + │ │ is_auto_deref: false │ + │ └────────────────────────────┘ + │ + │ ┌────────────────────────────┐ + │ │ Value │ + │ ├────────────────────────────┤ + └────►│ resulting_type: DINT │ + │ │ + └────────────────────────────┘ +``` + +Another example where the annotated AST carries a lot of useful information is with complex expressions like array-expressions or qualified references. 
Lets consider the following statement: + +```iecst +PLC_PRG.a.b[2] +``` +It is annotated in the following way: +```ignore + ┌────────────────────┐ + │ QualifiedReference │ + ├────────────────────┤ + │ ID: 1 │ + └─────────┬──────────┘ + │ elements: Vec + ┌─────────┴──────────┬─────────────────────┐ + │ │ │ + ▼ ▼ ▼ + ┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ + │ Reference │ │ Reference │ │ ArrayAccess │ + ├──────────────────┤ ├──────────────────┤ ├──────────────────┤ + │ name: 'PLC_PRG' │ │ name: 'a' │ │ │ + │ ID: 2 │ │ ID: 3 │ │ ID: 4 │ + └──────────────────┘ └──────────────────┘ └─────┬──────┬─────┘ + │ │ + reference │ │ access + ┌────────┘ └─────────┐ + ▼ ▼ + ┌──────────────────┐ ┌──────────────────┐ + │ Reference │ │ LiteralInteger │ + ├──────────────────┤ ├──────────────────┤ + │ name: 'b' │ │ value: '2' │ + │ ID: 5 │ │ ID: 6 │ + └──────────────────┘ └──────────────────┘ + + + ┌────────────────────────────┐ + │ Value │ + ┌───►├────────────────────────────┤ + │ │ resulting_type: INT │ + │ │ │ + │ └────────────────────────────┘ + │ + │ ┌────────────────────────────┐ + ┌───────────────────┐ │ │ Program │ + │ AnnotationMap │ │ ┌─►├────────────────────────────┤ + │ │ │ │ │ qualified_name: PLC_PRG │ + ├───────┬───────────┤ │ │ │ │ + │ ID: 1 │ Value ├───┘ │ └────────────────────────────┘ + ├───────┼───────────┤ │ + │ ID: 2 │ Program ├─────┘ ┌────────────────────────────┐ + ├───────┼───────────┤ │ Variable │ + │ ID: 3 │ Variable ├───────►├────────────────────────────┤ + ├───────┼───────────┤ │ resulting_type: MyStruct │ + │ ID: 4 │ Value ├─────┐ │ qualified_name: PLC_PRG.a │ + ├───────┼───────────┤ │ └────────────────────────────┘ + │ ID: 5 │ Variable ├───┐ │ + ├───────┼───────────┤ │ │ ┌────────────────────────────┐ + │ ID: 6 │ Value ├─┐ │ │ │ Value │ + └───────┴───────────┘ │ │ └─►├────────────────────────────┤ + │ │ │ resulting_type: INT │ + │ │ │ │ + │ │ └────────────────────────────┘ + │ │ + │ │ ┌─────────────────────────────────┐ + │ │ │ Variable │ + │ └───►├─────────────────────────────────┤ + │ │ resulting_type : ARRAY[] OF INT │ + │ │ qualified_name : MyStruct.b │ + │ └─────────────────────────────────┘ + │ + │ ┌────────────────────────────┐ + │ │ Value │ + │ ├────────────────────────────┤ + └─────►│ resulting_type: DINT │ + │ │ + └────────────────────────────┘ +``` + +## Type vs. Type-Hint +The AnnotationMap not only offers annotations regarding the AST-node's type, but it also offers a second type of annotation. + +Consider the following snippet: +```iecst +PROGRAM PLC_PRG + VAR + x : SINT; + y : INT; + z : BYTE; + END_VAR + + z := x + y; + +END_PROGRAM +``` + +The assignment `z := x + y` is loaded with different types: + +- `x` is annotated as *Variable* of type *SINT* and will be auto-upgraded to *DINT* +- `y` is annotated as *Variable* of type *INT* and will be auto-upgraded to *DINT* +- `z` is annotated as *Variable* of type *BYTE* +- `x + y` is annotated as *Value* of type *DINT* (the bigger of both) + +In order to make life easier for validation and code-generation we add an additional annotation to `x + y` to indicate, that while it technically results in a *DINT*, it should rather be treated as a *BYTE* since it is going to be assigned to `z`. This second annotation is called the *type-hint*. It indicates that while it technically is not the real type of this expression, the program's semantic wants the compiler to treat it as this type. 
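+
+To make the idea concrete, the following is a minimal sketch of how a consumer of the AnnotationMap (for example the code-generator) could compare a node's type annotation with its type-hint annotation to decide whether a cast has to be emitted. The struct layout and the method `required_cast` shown here are illustrative assumptions for this example only, not RuSTy's actual API:
+
+```rs
+// Illustrative sketch only - the shapes of TypeAnnotation, AnnotationMap and
+// required_cast are assumptions for this example, not RuSTy's real implementation.
+use std::collections::HashMap;
+
+#[derive(Debug, PartialEq)]
+struct TypeAnnotation {
+    resulting_type: String,
+}
+
+struct AnnotationMap {
+    // AST-node ID -> (type annotation, optional type-hint annotation)
+    annotations: HashMap<usize, (TypeAnnotation, Option<TypeAnnotation>)>,
+}
+
+impl AnnotationMap {
+    /// Returns the type to cast to if the type-hint differs from the
+    /// annotated type, or None if the value can be used as it is.
+    fn required_cast(&self, node_id: usize) -> Option<&str> {
+        let (annotated, hint) = self.annotations.get(&node_id)?;
+        match hint {
+            Some(hint) if hint.resulting_type != annotated.resulting_type => {
+                Some(hint.resulting_type.as_str())
+            }
+            _ => None,
+        }
+    }
+}
+```
+
+For the `x + y` node in the snippet above such a lookup would return `BYTE`, telling the code-generator that the `DINT` result has to be truncated to `BYTE` before the assignment.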
+ +The expression `z := x + y` is annotated like this: + +| expression | type annotation | type-hint annotation | explanation | +|-------------|-----------------|-----------------------|-----------------------| +| `x` | SINT | DINT | auto-upgraded to DINT | +| `y` | INT | DINT | auto-upgraded to DINT | +| `z` | BYTE | - | | +| `x + y` | DINT | BYTE | type-hint indicates that the resulting DINT needs to be cast to BYTE | + +With the help of the type-hint annotations the validation can decide whether certain type-cast operations are valid very easily. The code-generation steps can easily decide when to generate casts, by simply comparing a node's type annotation and it's type-hint annotation. \ No newline at end of file diff --git a/book/src/arch/parser.md b/book/src/arch/parser.md new file mode 100644 index 0000000000..4c489ab456 --- /dev/null +++ b/book/src/arch/parser.md @@ -0,0 +1,91 @@ +# Parser + + The role of the parser is to turn source-code which is fed as a string (in the form of files) into a tree-representation of that source-code. This tree is typically called the *Abstract Syntax Tree (AST)*. The step of parsing consists of two distinct stages. + The first one is the *lexical analysis (Lexer)* which is performed by a lexer. After lexing we perform the *syntactical analysis (Parser)* to construct the syntax tree. + +```ignore + ┌──┐ + ┌──────────────┐ │ │ + │ │ └──┘ + │ Source Code │ / \ + │ │ ┌─────────┐ ┌──────────┐ / \ + │ ────────── │ │ │ │ │ ┌──┐ ┌──┐ + │ ├───► Lexer │ │ Parser ├────►│ │ │ │ + │ ───────── │ │ │ │ │ └──┘ └──┘ + │ │ └────┬────┘ └──────────┘ /\ /\ + │ ──── │ │ ▲ / \ / \ + │ │ │ │ ┌──┐ ┌──┐ ┌──┐ ┌──┐ + │ ──────── │ ▼ │ │ │ │ │ │ │ │ │ + │ │ ┌───────────────────────┴──┐ └──┘ └──┘ └──┘ └──┘ + │ │ │ │ + └──────────────┘ │ ┌───┐ ┌───┐ ┌───┐ ┌───┐ │ Abstract Syntax + │ │ T │ │ T │ │ T │ │...│ │ Tree + │ └───┘ └───┘ └───┘ └───┘ │ + │ │ + └──────────────────────────┘ + Token-Stream +``` + +## Lexer +The lexer performs the lexical analysis. This step turns the source-string into a sequence of well known tokens. The Lexer (or sometimes also called *tokenizer*) splits the source-string into *tokens* (or *words*). Each token has a distinct type which corresponds to a grammar's element. Typical token-types are keywords, numbers, identifiers, brackets, dots, etc. So with the help of this token-stream it is much easier for the parser to spot certain patterns. E.g. a floating-point number consists of the token-sequence: *number*, *dot*, *number*. + +The lexer is implemented in the `lexer`-module. It uses the [logos](https://github.com/maciejhirsz/logos) crate to create a lexer that is able to identify all different terminal-symbols. Compared to other languages, Structured Text has a quite high number of keywords and other tokens, so Rusty's lexer identifies a quite large number of different tokens. + +### Discussion: Rusty-Lexer +The logos crate uses [procedural macros](https://doc.rust-lang.org/reference/procedural-macros.html) to generate the code required to lex the source-string. The number of tokens identified by the rusty-lexer is quite high, so as of january 2022 the rust sdk for vs-code (rust-analyzer) reports problem with the number of macro-generated tokens (*macro invocation exceeds token limit...*). + +The tokens identified by the lexer follow the formal definition provided by the IEC61131-3 (2013) standard. + +Following strategies increase the number of tokens and should be reconsidered: +- case insensitivity +- optional underscores in keywords (e.g. 
`END_IF` == `ENDIF`) +- unrolled tokens instead of grouping tokens (e.g. `KEYWORD_TRUE` & `KEYWORD_FALSE` instead of `KEYWORD_BOOL`) +- etc. + +## Parser +The parser takes the token stream and creates the corresponding AST that represents the source code in a structured, hierarchical way. The parser is implemented in the `parser` module whereas the model for the AST is implemented in the `ast` module. + +### AST - Abstract Syntax Tree + +The abstract syntax tree is a tree representation of the source code. Some parser implementations use a generic tree-data-structure consisting of `Nodes` which can have an arbitrary number of children. These nodes usually have dynamic properties like a type and an optional value and sometimes they even have dynamic properties stored in a map to make this representation even more flexible. + +While this approach needs very little source code we decided to favour a less flexible approach. The rusty-AST models every single ast-node as its own *struct* with all necessary fields including the possible child-nodes. While this approach needs much more code and much mor hand-written changes, its benefits lie in the clearness and simplicity of the data-structure. Every element of the AST is easily identified, debugged and understood. (e.g. while in a generic node based AST it is easily possible to have a binary-statement with no, one, or seven child-nodes, the rusty-AST enforces the structure of every node. So the rusty-Binary-Statement has exactly two children. It is impossible to construct it differently.). + +#### Example +So an assignment `a := 3;` will be parsed with the help of the following Structures: + +```rs +struct Reference { + name: string +} + +struct LiteralInteger { + value: i128 +} + +struct Assignment { + left: Box, + right: Box +} +``` + +### Recursive Descent Parser +There are a lot of different frameworks to generate parsers from formal grammars. While they generate highly optimized parsers we felt we wanted more control and more understanding of the parsing process and the resulting AST. The fact that at that point in time we were pretty new to rust itself, writing the parser by hand also gave us more practice and a stronger feeling of control and understanding. Using a parser-generator framework will definitely be an option for future improvements. + +As for now, the parser is a hand-written [recursive descent parser](https://en.wikipedia.org/wiki/Recursive_descent_parser) inside the `parser`-module. + +As the parser reads the token stream `Reference`, `KeywordEquals`, `Number`, `Semicolon` it instantiates the corresponding syntax tree: +```ignore + ┌─────────────────┐ + │ Assignment │ + └──────┬──┬───────┘ + left │ │ right + ┌───────────┘ └──────────┐ + ▼ ▼ + ┌──────────────────┐ ┌──────────────────┐ + │ Reference │ │ LiteralInteger │ + ├──────────────────┤ ├──────────────────┤ + │ name: 'a' │ │ value: '3' │ + └──────────────────┘ └──────────────────┘ +``` + diff --git a/book/src/arch/validation.md b/book/src/arch/validation.md new file mode 100644 index 0000000000..6d13b45523 --- /dev/null +++ b/book/src/arch/validation.md @@ -0,0 +1,19 @@ +# Validation +The validation module implements the semantic validation step of the compiler. The validator is a hand-written visitor that offers a callback when visiting the single AST-nodes to then perform the different validation tasks. 
The validation rules are implemented in dedicated validator-structs: + +| Validator | Responsibilities | +|--------------------|------------------| +| pou_validator | Semantic rules on the level of Programs, Functionblocks and Functions | +| variable_validator | Semantic rules on the level of variable declarations (e.g. empty var-blocks, empty structs, etc. ) | +| stmt_validator | Semantic rules on the level of statements (e.g. invalid type-casts ) | + +## Diagnostics +Problems (semantic or syntactic) are represented as *Diagnostics* [^1]. Diagnostics carry information on the exact location inside the source-string (start- & end-offset), a custom message and a unique error-number to identify the problem. There are 3 types of *Diagnostics*: + +| Diagnostic | Description | +|--------------------|------------------| +| SyntaxError | A syntax error is a diagnostic that is created by the parser if it discovers a token-stream that does not match the language's grammar. | +| GeneralError | General errors are problems that occured during the compilation process, that cannot be linked to a malformed input (e.g. file-I/O problems, internal LLVM errors, etc.) | +| Improvement | Problems that may not prevent successful compilation but are still considered a flaw in the source-code. (e.g. use proprietary *POINTER TO* instead of the norm-compliant *REF_TO*) | + +[^1]: :(i): The diagnostics are subject to change since they don't elegantly represent the different types of problems (e.g. semantic problems). \ No newline at end of file diff --git a/src/codegen/tests/code_gen_tests.rs b/src/codegen/tests/code_gen_tests.rs index dc33bae5c5..1f09957a65 100644 --- a/src/codegen/tests/code_gen_tests.rs +++ b/src/codegen/tests/code_gen_tests.rs @@ -176,17 +176,7 @@ END_VAR END_PROGRAM "#, ); - let expected = generate_program_boiler_plate( - "prg", - &[("i32", "x"), ("i32", "y")], - "void", - "", - "", - r#"ret void -"#, - ); - - assert_eq!(result, expected); + insta::assert_snapshot!(result); } #[test] @@ -3688,6 +3678,9 @@ fn function_block_instance_call() { let result = codegen( " FUNCTION_BLOCK foo + VAR_INPUT + x, y : INT; + END_VAR END_FUNCTION_BLOCK PROGRAM prg @@ -3698,41 +3691,7 @@ fn function_block_instance_call() { END_PROGRAM ", ); - - let expected = r#"; ModuleID = 'main' -source_filename = "main" - -%prg_interface = type { %foo_interface } -%foo_interface = type {} - -@prg_instance = global %prg_interface zeroinitializer - -define void @foo(%foo_interface* %0) { -entry: - ret void -} - -define void @prg(%prg_interface* %0) { -entry: - %fb_inst = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 0 - br label %input - -input: ; preds = %entry - br label %call - -call: ; preds = %input - call void @foo(%foo_interface* %fb_inst) - br label %output - -output: ; preds = %call - br label %continue - -continue: ; preds = %output - ret void -} -"#; - - assert_eq!(result, expected); + insta::assert_snapshot!(result); } #[test] @@ -3874,7 +3833,7 @@ fn structs_are_generated() { " TYPE MyStruct: STRUCT a: DINT; - b: DINT; + b: INT; END_STRUCT END_TYPE diff --git a/src/codegen/tests/snapshots/rusty__codegen__tests__code_gen_tests__function_block_instance_call.snap b/src/codegen/tests/snapshots/rusty__codegen__tests__code_gen_tests__function_block_instance_call.snap new file mode 100644 index 0000000000..96fc55e36e --- /dev/null +++ b/src/codegen/tests/snapshots/rusty__codegen__tests__code_gen_tests__function_block_instance_call.snap @@ -0,0 +1,39 @@ +--- +source: 
src/codegen/tests/code_gen_tests.rs +expression: result + +--- +; ModuleID = 'main' +source_filename = "main" + +%prg_interface = type { %foo_interface } +%foo_interface = type { i16, i16 } + +@prg_instance = global %prg_interface zeroinitializer + +define void @foo(%foo_interface* %0) { +entry: + %x = getelementptr inbounds %foo_interface, %foo_interface* %0, i32 0, i32 0 + %y = getelementptr inbounds %foo_interface, %foo_interface* %0, i32 0, i32 1 + ret void +} + +define void @prg(%prg_interface* %0) { +entry: + %fb_inst = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 0 + br label %input + +input: ; preds = %entry + br label %call + +call: ; preds = %input + call void @foo(%foo_interface* %fb_inst) + br label %output + +output: ; preds = %call + br label %continue + +continue: ; preds = %output + ret void +} + diff --git a/src/codegen/tests/snapshots/rusty__codegen__tests__code_gen_tests__program_with_variables_generates_void_function_and_struct.snap b/src/codegen/tests/snapshots/rusty__codegen__tests__code_gen_tests__program_with_variables_generates_void_function_and_struct.snap new file mode 100644 index 0000000000..cd7c5f6053 --- /dev/null +++ b/src/codegen/tests/snapshots/rusty__codegen__tests__code_gen_tests__program_with_variables_generates_void_function_and_struct.snap @@ -0,0 +1,19 @@ +--- +source: src/codegen/tests/code_gen_tests.rs +expression: result + +--- +; ModuleID = 'main' +source_filename = "main" + +%prg_interface = type { i32, i32 } + +@prg_instance = global %prg_interface zeroinitializer + +define void @prg(%prg_interface* %0) { +entry: + %x = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 0 + %y = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 1 + ret void +} + diff --git a/src/codegen/tests/snapshots/rusty__codegen__tests__code_gen_tests__structs_are_generated.snap b/src/codegen/tests/snapshots/rusty__codegen__tests__code_gen_tests__structs_are_generated.snap index 3fc019745b..4e9331d215 100644 --- a/src/codegen/tests/snapshots/rusty__codegen__tests__code_gen_tests__structs_are_generated.snap +++ b/src/codegen/tests/snapshots/rusty__codegen__tests__code_gen_tests__structs_are_generated.snap @@ -6,7 +6,7 @@ expression: result ; ModuleID = 'main' source_filename = "main" -%MyStruct = type { i32, i32 } +%MyStruct = type { i32, i16 } %__global_y = type { i8, i8 } @x = global %MyStruct zeroinitializer diff --git a/src/codegen/tests/snapshots/rusty__codegen__tests__string_tests__simple_string_test.snap b/src/codegen/tests/snapshots/rusty__codegen__tests__string_tests__simple_string_test.snap new file mode 100644 index 0000000000..8b1a844a79 --- /dev/null +++ b/src/codegen/tests/snapshots/rusty__codegen__tests__string_tests__simple_string_test.snap @@ -0,0 +1,11 @@ +--- +source: src/codegen/tests/string_tests.rs +expression: result + +--- +; ModuleID = 'main' +source_filename = "main" + +@str = global [21 x i8] zeroinitializer +@wstr = global [21 x i16] zeroinitializer + diff --git a/src/codegen/tests/string_tests.rs b/src/codegen/tests/string_tests.rs index 83067fe850..81d6c67fc3 100644 --- a/src/codegen/tests/string_tests.rs +++ b/src/codegen/tests/string_tests.rs @@ -23,6 +23,20 @@ END_PROGRAM insta::assert_snapshot!(result); } +#[test] +fn simple_string_test() { + let result = codegen( + r" +VAR_GLOBAL + str: STRING[20]; + wstr: WSTRING[20]; +END_VAR + ", + ); + + insta::assert_snapshot!(result); +} + #[test] fn program_with_casted_string_assignment() { let result = codegen( diff --git 
a/src/resolver/tests/resolve_expressions_tests.rs b/src/resolver/tests/resolve_expressions_tests.rs index b5f088f847..1e980e9c0d 100644 --- a/src/resolver/tests/resolve_expressions_tests.rs +++ b/src/resolver/tests/resolve_expressions_tests.rs @@ -96,7 +96,7 @@ fn binary_expressions_resolves_types_for_literals_directly() { { // a --> BYTE (DINT hint) assert_type_and_hint!(&annotations, &index, a, BYTE_TYPE, Some(DINT_TYPE)); - // 7 --> DINT (BYTE hint) + // 7 --> DINT (no hint) assert_type_and_hint!(&annotations, &index, seven, DINT_TYPE, None); } else { unreachable!() From 2e63f43d4b33f2e4510d96d3d8febdb4527878f8 Mon Sep 17 00:00:00 2001 From: Ghaith Hachem Date: Fri, 7 Jan 2022 08:25:49 +0100 Subject: [PATCH 2/2] Fixed some typos --- book/src/arch/architecture.md | 2 +- book/src/arch/codegen.md | 2 +- book/src/arch/parser.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/book/src/arch/architecture.md b/book/src/arch/architecture.md index e827deb099..9426be95c4 100644 --- a/book/src/arch/architecture.md +++ b/book/src/arch/architecture.md @@ -2,7 +2,7 @@ ## Overview -Rusty is a Compiler for Structured Text. It utilizes the llvm compiler infrastructurue and contributes a [Structured Text](https://en.wikipedia.org/wiki/Structured_text) Frontend that translates Structured Text into llvm's language independent intermediate representatin (IR). The Further optimization and native code generation is performed by the existing LLVM infrastructure, namely llvm's common optimizer and the platform specific backend (see [here](https://www.aosabook.org/en/llvm.html)). +Rusty is a Compiler for Structured Text. It utilizes the llvm compiler infrastructurue and contributes a [Structured Text](https://en.wikipedia.org/wiki/Structured_text) Frontend that translates Structured Text into llvm's language independent intermediate representation (IR). The Further optimization and native code generation is performed by the existing LLVM infrastructure, namely llvm's common optimizer and the platform specific backend (see [here](https://www.aosabook.org/en/llvm.html)). ```ignore diff --git a/book/src/arch/codegen.md b/book/src/arch/codegen.md index 86c9fcf25f..830da92643 100644 --- a/book/src/arch/codegen.md +++ b/book/src/arch/codegen.md @@ -1,7 +1,7 @@ # Code-Generation The codegen module contains all code that turns the parsed and verified code represented as an AST into [llvm-ir](https://llvm.org/docs/LangRef.html) code. To generate the *IR* we use a crate that wraps the native llvm [C-API](https://github.com/TheDan64/inkwell). -The code-generator is basically a transformation from the ST-AST into an IR-Tree representation. Therefore the AST is traversed ina visitor-like way and transformed simultaneously. The code generation is split into specialized sub-generators for different tasks: +The code-generator is basically a transformation from the ST-AST into an IR-Tree representation. Therefore the AST is traversed in a visitor-like way and transformed simultaneously. The code generation is split into specialized sub-generators for different tasks: | Generator | Responsibilities | |--------------------|------------------| diff --git a/book/src/arch/parser.md b/book/src/arch/parser.md index 4c489ab456..8d0a7b7f99 100644 --- a/book/src/arch/parser.md +++ b/book/src/arch/parser.md @@ -49,7 +49,7 @@ The parser takes the token stream and creates the corresponding AST that represe The abstract syntax tree is a tree representation of the source code. 
Some parser implementations use a generic tree-data-structure consisting of `Nodes` which can have an arbitrary number of children. These nodes usually have dynamic properties like a type and an optional value and sometimes they even have dynamic properties stored in a map to make this representation even more flexible. -While this approach needs very little source code we decided to favour a less flexible approach. The rusty-AST models every single ast-node as its own *struct* with all necessary fields including the possible child-nodes. While this approach needs much more code and much mor hand-written changes, its benefits lie in the clearness and simplicity of the data-structure. Every element of the AST is easily identified, debugged and understood. (e.g. while in a generic node based AST it is easily possible to have a binary-statement with no, one, or seven child-nodes, the rusty-AST enforces the structure of every node. So the rusty-Binary-Statement has exactly two children. It is impossible to construct it differently.). +While this approach needs very little source code we decided to favour a less flexible approach. The rusty-AST models every single ast-node as its own *struct* with all necessary fields including the possible child-nodes. While this approach needs much more code and hand-written changes, its benefits lie in the clearness and simplicity of the data-structure. Every element of the AST is easily identified, debugged and understood. (e.g. while in a generic node based AST it is easily possible to have a binary-statement with no, one, or seven child-nodes, the rusty-AST enforces the structure of every node. So the rusty-Binary-Statement has exactly two children. It is impossible to construct it differently.). #### Example So an assignment `a := 3;` will be parsed with the help of the following Structures: