From dd1ed035b40e726ac8d5583605dea2276b92d684 Mon Sep 17 00:00:00 2001 From: Eric Norris Date: Fri, 27 Jan 2017 00:19:15 -0500 Subject: [PATCH] Rewrite core module. This version includes the ability for a streaming mode where text is pumped into a function and plaintext comes out. It should still be compatible with Node v4+ as it avoids using default parameters or object destructuring. --- striptags.js | 363 +++++++++++++++++++++++---------------------------- 1 file changed, 160 insertions(+), 203 deletions(-) diff --git a/striptags.js b/striptags.js index fa34b0a..ccd9650 100644 --- a/striptags.js +++ b/striptags.js @@ -1,258 +1,215 @@ 'use strict'; -(function (root, factory) { - if (typeof define === 'function' && define.amd) { - // AMD. Register as an anonymous module. - define([], factory); - } else if (typeof module === 'object' && module.exports) { - // Node. Does not work with strict CommonJS, but - // only CommonJS-like environments that support module.exports, - // like Node. - module.exports = factory(); - } else { - // Browser globals (root is window) - root.striptags = factory(); - } -}(this, function () { - - var STATE_OUTPUT = 0, - STATE_HTML = 1, - STATE_PRE_COMMENT = 2, - STATE_COMMENT = 3, - WHITESPACE = /\s/, - ALLOWED_TAGS_REGEX = /<(\w*)>/g; - - function striptags(html, allowableTags, tagReplacement) { - var html = html || '', - state = STATE_OUTPUT, - depth = 0, - output = '', - tagBuffer = '', - inQuote = false, - i, length, c; - - if (typeof allowableTags === 'string') { - // Parse the string into an array of tags - allowableTags = parseAllowableTags(allowableTags); - } else if (!Array.isArray(allowableTags)) { - // If it is not an array, explicitly set to null - allowableTags = null; - } +(function (global) { - for (i = 0, length = html.length; i < length; i++) { - c = html[i]; + const STATE_PLAINTEXT = Symbol('plaintext'); + const STATE_HTML = Symbol('html'); + const STATE_COMMENT = Symbol('comment'); - switch (c) { - case '<': { - // ignore '<' if inside a quote - if (inQuote) { - break; - } + const ALLOWED_TAGS_REGEX = /<(\w*)>/g; + const NORMALIZE_TAG_REGEX = /<\/?([^\s\/>]+)/; - // '<' followed by a space is not a valid tag, continue - if (html[i + 1] == ' ') { - consumeCharacter(c); - break; - } + function striptags(html, allowable_tags, tag_replacement) { + html = html || ''; + allowable_tags = allowable_tags || []; + tag_replacement = tag_replacement || ''; - // change to STATE_HTML - if (state == STATE_OUTPUT) { - state = STATE_HTML; + let context = init_context(allowable_tags, tag_replacement); - consumeCharacter(c); - break; - } + return striptags_internal(html, context); + } - // ignore additional '<' characters when inside a tag - if (state == STATE_HTML) { - depth++; - break; - } + function init_striptags_stream(allowable_tags, tag_replacement) { + allowable_tags = allowable_tags || []; + tag_replacement = tag_replacement || ''; - consumeCharacter(c); - break; - } + let context = init_context(allowable_tags, tag_replacement); - case '>': { - // something like this is happening: '<<>>' - if (depth) { - depth--; - break; - } + return function striptags_stream(html) { + return striptags_internal(html || '', context); + }; + } - // ignore '>' if inside a quote - if (inQuote) { - break; - } + striptags.init_streaming_mode = init_striptags_stream; - // an HTML tag was closed - if (state == STATE_HTML) { - inQuote = state = 0; + function init_context(allowable_tags, tag_replacement) { + allowable_tags = parse_allowable_tags(allowable_tags); - if (allowableTags) { - tagBuffer += '>'; - flushTagBuffer(); - } + return { + allowable_tags, + tag_replacement, - break; - } + state : STATE_PLAINTEXT, + tag_buffer : '', + depth : 0, + in_quote_char : '' + }; + } - // '' - if (state == STATE_PRE_COMMENT) { - inQuote = state = 0; - tagBuffer = ''; - break; - } + function striptags_internal(html, context) { + let allowable_tags = context.allowable_tags; + let tag_replacement = context.tag_replacement; - // if last two characters were '--', then end comment - if (state == STATE_COMMENT && - html[i - 1] == '-' && - html[i - 2] == '-') { + let state = context.state; + let tag_buffer = context.tag_buffer; + let depth = context.depth; + let in_quote_char = context.in_quote_char; + let output = ''; - inQuote = state = 0; - tagBuffer = ''; + for (let idx = 0, length = html.length; idx < length; idx++) { + let char = html[idx]; + + if (state === STATE_PLAINTEXT) { + switch (char) { + case '<': + state = STATE_HTML; + tag_buffer += char; break; - } - consumeCharacter(c); - break; + default: + output += char; + break; } + } - // catch both single and double quotes - case '"': - case '\'': { - if (state == STATE_HTML) { - if (inQuote == c) { - // end quote found - inQuote = false; - } else if (!inQuote) { - // start quote only if not already in one - inQuote = c; + else if (state === STATE_HTML) { + switch (char) { + case '<': + // ignore '<' if inside a quote + if (in_quote_char) { + break; } - } - consumeCharacter(c); - break; - } + // we're seeing a nested '<' + depth++; + break; - case '!': { - if (state == STATE_HTML && - html[i - 1] == '<') { + case '>': + // ignore '>' if inside a quote + if (in_quote_char) { + break; + } - // looks like we might be starting a comment - state = STATE_PRE_COMMENT; - break; - } + // something like this is happening: '<<>>' + if (depth) { + depth--; - consumeCharacter(c); - break; - } + break; + } - case '-': { - // if the previous two characters were '!-', this is a comment - if (state == STATE_PRE_COMMENT && - html[i - 1] == '-' && - html[i - 2] == '!') { + // this is closing the tag in tag_buffer + in_quote_char = ''; + state = STATE_PLAINTEXT; + tag_buffer += '>'; - state = STATE_COMMENT; + if (allowable_tags.has(normalize_tag(tag_buffer))) { + output += tag_buffer; + } else { + output += tag_replacement; + } + + tag_buffer = ''; break; - } - consumeCharacter(c); - break; - } + case '"': + case '\'': + // catch both single and double quotes - case 'E': - case 'e': { - // check for DOCTYPE, because it looks like a comment and isn't - if (state == STATE_PRE_COMMENT && - html.substr(i - 6, 7).toLowerCase() == 'doctype') { + if (char === in_quote_char) { + in_quote_char = ''; + } else { + in_quote_char = in_quote_char || char; + } - state = STATE_HTML; + tag_buffer += char; break; - } - consumeCharacter(c); - break; - } + case '-': + if (tag_buffer === '': { - break normalizeTagBuffer; - } + else if (state === STATE_COMMENT) { + switch (char) { + case '>': + if (tag_buffer.slice(-2) == '--') { + // close the comment + state = STATE_PLAINTEXT; + } - case '/': { - nonWhitespaceSeen = true; + tag_buffer = ''; + break; + + default: + tag_buffer += char; break; - } - - default: { - if (!c.match(WHITESPACE)) { - nonWhitespaceSeen = true; - normalized += c; - } else if (nonWhitespaceSeen) { - break normalizeTagBuffer; - } - } } } + } + + // save the context for future iterations + context.state = state; + context.tag_buffer = tag_buffer; + context.depth = depth; + context.in_quote_char = in_quote_char; + + return output; + } + + function parse_allowable_tags(allowable_tags) { + let tags_array = []; + + if (typeof allowable_tags === 'string') { + let match; - if (allowableTags.indexOf(normalized) !== -1) { - output += tagBuffer; - } else if (tagReplacement) { - output += tagReplacement; + while ((match = ALLOWED_TAGS_REGEX.exec(allowable_tags)) !== null) { + tags_array.push(match[1]); } + } - tagBuffer = ''; + else if (typeof allowable_tags[Symbol.iterator] === 'function') { + tags_array = allowable_tags; } - return output; + return new Set(tags_array); } - /** - * Return an array containing tags that are allowed to pass through the - * algorithm. - * - * @param string allowableTags A string of tags to allow (e.g. ""). - * @return array|null An array of allowed tags or null if none. - */ - function parseAllowableTags(allowableTags) { - var tagsArray = [], - match; - - while ((match = ALLOWED_TAGS_REGEX.exec(allowableTags)) !== null) { - tagsArray.push(match[1]); - } + function normalize_tag(tag_buffer) { + let match = NORMALIZE_TAG_REGEX.exec(tag_buffer); - return tagsArray.length !== 0 ? tagsArray : null; + return match ? match[1].toLowerCase() : null; } - return striptags; -})); + if (typeof define === 'function' && define.amd) { + // AMD + define([], striptags); + } else if (typeof module === 'object' && module.exports) { + // Node + module.exports = striptags; + } else { + // Browser + global.striptags = striptags; + } +}(this));