diff --git a/index.js b/index.js index 38d7e04..76cf79a 100644 --- a/index.js +++ b/index.js @@ -1,19 +1,16 @@ require('apollojs'); - var entities = require('entities'); /** * Node Class as base class for TextNode and HTMLElement. */ function Node() { - } -$declare(Node, { -}); +$declare(Node, {}); $defenum(Node, { - ELEMENT_NODE: 1, - TEXT_NODE: 3 + ELEMENT_NODE: 1, + TEXT_NODE: 3 }); /** @@ -21,47 +18,19 @@ $defenum(Node, { * @param {string} value [description] */ function TextNode(value) { - this.rawText = value; + this.rawText = value; } $inherit(TextNode, Node, { - - /** - * Node Type declaration. - * @type {Number} - */ - nodeType: Node.TEXT_NODE, - - /** - * Get unescaped text value of current node and its children. - * @return {string} text content - */ - get text() { - return entities.decodeHTML5(this.rawText); - }, - - /** - * Detect if the node contains only white space. - * @return {bool} - */ - get isWhitespace() { - return /^(\s| )*$/.test(this.rawText); - } - + /** + * Node Type declaration. + * @type {Number} + */ + nodeType: Node.TEXT_NODE, + get text() { + return entities.decodeHTML5(this.rawText); + } }); -var kBlockElements = { - div: true, - p: true, - // ul: true, - // ol: true, - li: true, - // table: true, - // tr: true, - td: true, - section: true, - br: true -}; - /** * HTMLElement, which contains a set of children. * Note: this is a minimalist implementation, no complete tree @@ -72,432 +41,326 @@ var kBlockElements = { * @param {Object} rawAttrs attributes in string */ function HTMLElement(name, keyAttrs, rawAttrs) { - this.tagName = name; - this.rawAttrs = rawAttrs || ''; - // this.parentNode = null; - this.childNodes = []; - if (keyAttrs.id) - this.id = keyAttrs.id; - if (keyAttrs.class) - this.classNames = keyAttrs.class.split(/\s+/); - else - this.classNames = []; + this.tagName = name; + this.rawAttrs = rawAttrs || ''; + this.childNodes = []; + if (keyAttrs.id) { + this.id = keyAttrs.id; + } } $inherit(HTMLElement, Node, { - - /** - * Node Type declaration. - * @type {Number} - */ - nodeType: Node.ELEMENT_NODE, - - /** - * Get unescaped text value of current node and its children. - * @return {string} text content - */ - get text() { - return entities.decodeHTML5(this.rawText); - }, - - /** - * Get escpaed (as-it) text value of current node and its children. - * @return {string} text content - */ - get rawText() { - var res = ''; - for (var i = 0; i < this.childNodes.length; i++) - res += this.childNodes[i].rawText; - return res; - }, - - /** - * Get structured Text (with '\n' etc.) - * @return {string} structured text - */ - get structuredText() { - var currentBlock = []; - var blocks = [currentBlock]; - function dfs(node) { - if (node.nodeType === Node.ELEMENT_NODE) { - if (kBlockElements[node.tagName]) { - if (currentBlock.length > 0) - blocks.push(currentBlock = []); - node.childNodes.forEach(dfs); - if (currentBlock.length > 0) - blocks.push(currentBlock = []); - } else { - node.childNodes.forEach(dfs); + /** + * Node Type declaration. + * @type {Number} + */ + nodeType: Node.ELEMENT_NODE, + /** + * Get unescaped text value of current node and its children. + * @return {string} text content + */ + get text() { + return entities.decodeHTML5(this.rawText); + }, + + /** + * Get escpaed (as-it) text value of current node and its children. + * @return {string} text content + */ + get rawText() { + var res = ''; + for (var i = 0; i < this.childNodes.length; i++) { + res += this.childNodes[i].rawText; } - } else if (node.nodeType === Node.TEXT_NODE) { - if (node.isWhitespace) { - // Whitespace node, postponed output - currentBlock.prependWhitespace = true; - } else { - var text = node.text; - if (currentBlock.prependWhitespace) { - text = ' ' + text; - currentBlock.prependWhitespace = false; - } - currentBlock.push(text); + return res; + }, + + /** + * Append a child node to childNodes + * @param {Node} node node to append + * @return {Node} node appended + */ + appendChild: function (node) { + // node.parentNode = this; + this.childNodes.push(node); + return node; + }, + + get firstChild() { + return this.childNodes.front; + }, + + /** + * Get last child node + * @return {Node} last child node + */ + get lastChild() { + return this.childNodes.back; + }, + /** + * Get attributes + * @return {Object} parsed and unescaped attributes + */ + get attributes() { + if (this._attrs) { + return this._attrs; } - } - } - dfs(this); - return blocks - .map(function(block) { - // Normalize each line's whitespace - return block.join('').trim().replace(/\s{2,}/g, ' '); - }) - .join('\n').trimRight(); - }, - - /** - * Trim element from right (in block) after seeing pattern in a TextNode. - * @param {RegExp} pattern pattern to find - * @return {HTMLElement} reference to current node - */ - trimRight: function(pattern) { - function dfs(node) { - for (var i = 0; i < node.childNodes.length; i++) { - var childNode = node.childNodes[i]; - if (childNode.nodeType === Node.ELEMENT_NODE) { - dfs(childNode); - } else { - var index = childNode.rawText.search(pattern); - if (index > -1) { - childNode.rawText = childNode.rawText.substr(0, index); - // trim all following nodes. - node.childNodes.length = i+1; - } + this._attrs = {}; + var attrs = this.rawAttributes; + for (var key in attrs) { + this._attrs[key] = entities.decodeHTML5(attrs[key]); } - } - } - dfs(this); - return this; - }, - - /** - * Get DOM structure - * @return {string} strucutre - */ - get structure() { - var res = []; - var indention = 0; - function write(str) { - res.push(' '.repeat(indention) + str); - } - function dfs(node) { - var idStr = node.id ? ('#' + node.id) : ''; - var classStr = node.classNames.length ? ('.' + node.classNames.join('.')) : ''; - write(node.tagName + idStr + classStr); - indention++; - for (var i = 0; i < node.childNodes.length; i++) { - var childNode = node.childNodes[i]; - if (childNode.nodeType === Node.ELEMENT_NODE) { - dfs(childNode); - } else if (childNode.nodeType === Node.TEXT_NODE) { - if (!childNode.isWhitespace) - write('#text'); + return this._attrs; + }, + + /** + * Get escaped (as-it) attributes + * @return {Object} parsed attributes + */ + get rawAttributes() { + if (this._rawAttrs) { + return this._rawAttrs; } - } - indention--; - } - dfs(this); - return res.join('\n'); - }, - - /** - * Remove whitespaces in this sub tree. - * @return {HTMLElement} pointer to this - */ - removeWhitespace: function() { - var i = 0, o = 0; - for (; i < this.childNodes.length; i++) { - var node = this.childNodes[i]; - if (node.nodeType === Node.TEXT_NODE) { - if (node.isWhitespace) - continue; - node.rawText = node.rawText.trim(); - } else if (node.nodeType === Node.ELEMENT_NODE) { - node.removeWhitespace(); - } - this.childNodes[o++] = node; - } - this.childNodes.length = o; - return this; - }, - - /** - * Query CSS selector to find matching nodes. - * @param {string} selector Simplified CSS selector - * @param {Matcher} selector A Matcher instance - * @return {HTMLElement[]} matching elements - */ - querySelectorAll: function(selector) { - var matcher; - if (selector instanceof Matcher) { - matcher = selector; - matcher.reset(); - } else { - matcher = new Matcher(selector); - } - var res = []; - var stack = []; - for (var i = 0; i < this.childNodes.length; i++) { - stack.push([this.childNodes[i], 0, false]); - while (stack.length) { - var state = stack.back; - var el = state[0]; - if (state[1] === 0) { - // Seen for first time. - if (el.nodeType !== Node.ELEMENT_NODE) { - stack.pop(); - continue; - } - if (state[2] = matcher.advance(el)) { - if (matcher.matched) { - res.push(el); - // no need to go further. - matcher.rewind(); - stack.pop(); - continue; + var attrs = {}; + if (this.rawAttrs) { + var re = /\b([_a-z][a-z0-9\-]*)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig; + for (var match; match = re.exec(this.rawAttrs);) { + attrs[match[1]] = match[3] || match[4] || match[5]; } - } } - if (state[1] < el.childNodes.length) { - stack.push([el.childNodes[state[1]++], 0, false]); - } else { - if (state[2]) - matcher.rewind(); - stack.pop(); - } - } + this._rawAttrs = attrs; + return attrs; } - return res; - }, - - /** - * Query CSS Selector to find matching node. - * @param {string} selector Simplified CSS selector - * @param {Matcher} selector A Matcher instance - * @return {HTMLElement} matching node - */ - querySelector: function(selector) { - var matcher; - if (selector instanceof Matcher) { - matcher = selector; - matcher.reset(); - } else { - matcher = new Matcher(selector); +}); +$define(HTMLElement, {}); + +// parser statuses +var INITIAL = 'INITIAL'; +var TAG_OPENED = 'TAG_OPENED'; +var READ_TAG_NAME = 'READ_TAG_NAME'; +var READ_ATTRIBUTES = 'READ_ATTRIBUTES'; +var IS_SELF_CLOSING = 'IS_SELF_CLOSING'; +var TAG_CLOSE = 'TAG_CLOSE'; + +// indexes for match +var TAG_FULL = 0; +var TAG_CLOSING_CHAR = 1; +var TAG_NAME = 2; +var ATTRIBUTES = 3; +var SELF_CLOSING_CHAR = 4; + +/** + * Function, which works like RegExp (contains exec function and lastIndex param) + */ +var kMarkupPattern = (function () { + var lastIndex = 0; + + /** + * Make state object from params + * @param {string} status next parser status + * @param {Array} match array which contains parsed values + * @param {Number} openedBracketCounter counter of unclosed tag brackets + */ + function makeState(status, match, openedBracketCounter) { + return { + status: status, + match: match, + openedBracketCounter: openedBracketCounter + }; } - var stack = []; - for (var i = 0; i < this.childNodes.length; i++) { - stack.push([this.childNodes[i], 0, false]); - while (stack.length) { - var state = stack.back; - var el = state[0]; - if (state[1] === 0) { - // Seen for first time. - if (el.nodeType !== Node.ELEMENT_NODE) { - stack.pop(); - continue; - } - if (state[2] = matcher.advance(el)) { - if (matcher.matched) { - return el; - } - } + + // Status functions those determine the processing of the next character + + /** + * Initial status function + * @param {Array} match array which contains parsed values + * @param {string} sym processed character + * @param {Number} index index of processed character + */ + function initial(match, sym, index) { + if (sym === '<') { + match['index'] = index; + return makeState(TAG_OPENED, match, 1); } - if (state[1] < el.childNodes.length) { - stack.push([el.childNodes[state[1]++], 0, false]); + + return makeState(INITIAL, match, 0); + } + + /** + * Processing of the character immediately following the opening bracket + * @param {Array} match array which contains parsed values + * @param {string} sym processed character + */ + function tagOpened(match, sym) { + if (sym === '/') { + match[TAG_CLOSING_CHAR] = '/'; } else { - if (state[2]) - matcher.rewind(); - stack.pop(); + match[TAG_NAME] += sym; } - } + + return makeState(READ_TAG_NAME, match, 1); } - return null; - }, - - /** - * Append a child node to childNodes - * @param {Node} node node to append - * @return {Node} node appended - */ - appendChild: function(node) { - // node.parentNode = this; - this.childNodes.push(node); - return node; - }, - - /** - * Get first child node - * @return {Node} first child node - */ - get firstChild() { - return this.childNodes.front; - }, - - /** - * Get last child node - * @return {Node} last child node - */ - get lastChild() { - return this.childNodes.back; - }, - - /** - * Get attributes - * @return {Object} parsed and unescaped attributes - */ - get attributes() { - if (this._attrs) - return this._attrs; - this._attrs = {}; - var attrs = this.rawAttributes; - for (var key in attrs) { - this._attrs[key] = entities.decodeHTML5(attrs[key]); + + /** + * Processing of the characters in tag name + * @param {Array} match array which contains parsed values + * @param {string} sym processed character + */ + function readTagName(match, sym) { + switch (sym) { + case ' ': + return makeState(READ_ATTRIBUTES, match, 1); + case '/': + return makeState(IS_SELF_CLOSING, match, 1); + case '>': + return makeState(TAG_CLOSE, match, 1); + default: + match[TAG_NAME] += sym; + return makeState(READ_TAG_NAME, match, 1); + } } - return this._attrs; - }, - - /** - * Get escaped (as-it) attributes - * @return {Object} parsed attributes - */ - get rawAttributes() { - if (this._rawAttrs) - return this._rawAttrs; - var attrs = {}; - if (this.rawAttrs) { - var re = /\b([a-z][a-z0-9\-]*)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig; - for (var match; match = re.exec(this.rawAttrs); ) - attrs[match[1]] = match[3] || match[4] || match[5]; + + /** + * Processing of the characters in attributes + * @param {Array} match array which contains parsed values + * @param {string} sym processed character + * @param {Number} openedBracketCounter counter of unclosed tag brackets + */ + function readAttributes(match, sym, openedBracketCounter) { + switch (sym) { + case '/': + if (openedBracketCounter === 1) { + return makeState(IS_SELF_CLOSING, match, openedBracketCounter); + } + + match[ATTRIBUTES] += sym; + return makeState(READ_ATTRIBUTES, match, openedBracketCounter); + case '>': + if (--openedBracketCounter) { + match[ATTRIBUTES] += sym; + return makeState(READ_ATTRIBUTES, match, openedBracketCounter); + } + + return makeState(TAG_CLOSE, match, openedBracketCounter); + case '<': + ++openedBracketCounter; + // without break, it's not a mistake + default: + match[ATTRIBUTES] += sym; + return makeState(READ_ATTRIBUTES, match, openedBracketCounter); + } } - this._rawAttrs = attrs; - return attrs; - } -}); -$define(HTMLElement, { - __wrap: function(el) { - el.childNodes.forEach(function(node) { - if (node.rawText) { - $wrap(node, TextNode); - } else { - $wrap(node, HTMLElement); - } - }); - } -}); + /** + * Processing of the character immediately following the character '/' + * @param {Array} match array which contains parsed values + * @param {string} sym processed character + */ + function isSelfClosing(match, sym) { + if (sym === '>') { + match[SELF_CLOSING_CHAR] = '/'; + return makeState(TAG_CLOSE, match, 0); + } -/** - * Cache to store generated match functions - * @type {Object} - */ -var pMatchFunctionCache = {}; + match[ATTRIBUTES] += '/' + sym; + return makeState(READ_ATTRIBUTES, match, 1); + } -/** - * Matcher class to make CSS match - * @param {string} selector Selector - */ -function Matcher(selector) { - this.matchers = selector.split(' ').map(function(matcher) { - if (pMatchFunctionCache[matcher]) - return pMatchFunctionCache[matcher]; - var parts = matcher.split('.'); - var tagName = parts[0]; - var classes = parts.slice(1).sort(); - var source = ''; - if (tagName && tagName != '*') { - if (tagName[0] == '#') - source += 'if (el.id != ' + JSON.stringify(tagName.substr(1)) + ') return false;'; - else - source += 'if (el.tagName != ' + JSON.stringify(tagName) + ') return false;'; + /** + * Final processing of the string + * @param {Array} match array which contains parsed values + * @param {string} str processed string + * @param {Number} index index of processed character + */ + function tagClose(match, str, index) { + lastIndex = index; + match[TAG_FULL] = str.slice(match['index'], index); + + return makeState(INITIAL, match, 0); } - if (classes.length > 0) - source += 'for (var cls = ' + JSON.stringify(classes) + ', i = 0; i < cls.length; i++) if (el.classNames.indexOf(cls[i]) === -1) return false;'; - source += 'return true;'; - return pMatchFunctionCache[matcher] = new Function('el', source); - }); - this.nextMatch = 0; -} -$declare(Matcher, { - /** - * Trying to advance match pointer - * @param {HTMLElement} el element to make the match - * @return {bool} true when pointer advanced. - */ - advance: function(el) { - if (this.nextMatch < this.matchers.length && - this.matchers[this.nextMatch](el)) { - this.nextMatch++; - return true; + + return { + exec: function (str) { + // state.match :: [TAG_FULL, TAG_CLOSING_CHAR, TAG_NAME, ATTRIBUTES, SELF_CLOSING_CHAR] + var state = { + status: INITIAL, + match: ['', '', '', '', ''], + openedBracketCounter: 0 + }; + state.match['input'] = str; + + for (var i = lastIndex; i < str.length; ++i) { + switch (state.status) { + case INITIAL: + state = initial(state.match, str[i], i); + break; + case TAG_OPENED: + state = tagOpened(state.match, str[i]); + break; + case READ_TAG_NAME: + state = readTagName(state.match, str[i]); + break; + case READ_ATTRIBUTES: + state = readAttributes(state.match, str[i], state.openedBracketCounter); + break; + case IS_SELF_CLOSING: + state = isSelfClosing(state.match, str[i]); + break; + case TAG_CLOSE: + state = tagClose(state.match, str, i); + return state.match; + default: + break; + } + } + if (state.status === TAG_CLOSE) { + state = tagClose(state.match, str, str.length); + return state.match; + } + + lastIndex = 0; + return null; + }, + get lastIndex() { + return lastIndex; + }, + set lastIndex(newLastIndex) { + lastIndex = newLastIndex; + } } - return false; - }, - /** - * Rewind the match pointer - */ - rewind: function() { - this.nextMatch--; - }, - /** - * Trying to determine if match made. - * @return {bool} true when the match is made - */ - get matched() { - return this.nextMatch == this.matchers.length; - }, - /** - * Rest match pointer. - * @return {[type]} [description] - */ - reset: function() { - this.nextMatch = 0; - } -}); -$define(Matcher, { - /** - * flush cache to free memory - */ - flushCache: function() { - pMatchFunctionCache = {}; - } -}); +})(); -var kMarkupPattern = /)-->|<(\/?)([a-z][a-z0-9]*)\s*([^>]*?)(\/?)>/ig; var kAttributePattern = /\b(id|class)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig; var kSelfClosingElements = { - meta: true, - img: true, - link: true, - input: true, - area: true, - br: true, - hr: true + meta: true, + img: true, + link: true, + input: true, + area: true, + br: true, + hr: true, + wbr: true, + col: true }; var kElementsClosedByOpening = { - li: {li: true}, - p: {p: true, div: true}, - td: {td: true, th: true}, - th: {td: true, th: true} + li: {li: true}, + p: {p: true, div: true}, + td: {td: true, th: true}, + th: {td: true, th: true} }; var kElementsClosedByClosing = { - li: {ul: true, ol: true}, - a: {div: true}, - b: {div: true}, - i: {div: true}, - p: {div: true}, - td: {tr: true, table: true}, - th: {tr: true, table: true} + li: {ul: true, ol: true}, + a: {div: true}, + b: {div: true}, + i: {div: true}, + p: {div: true}, + td: {tr: true, table: true}, + th: {tr: true, table: true} }; var kBlockTextElements = { - script: true, - noscript: true, - style: true, - pre: true + script: true, + noscript: true, + style: true, + pre: true }; /** @@ -505,103 +368,96 @@ var kBlockTextElements = { */ module.exports = { - Matcher: Matcher, - Node: Node, - HTMLElement: HTMLElement, - TextNode: TextNode, - - /** - * Parse a chuck of HTML source. - * @param {string} data html - * @return {HTMLElement} root element - */ - parse: function(data, options) { - - var root = new HTMLElement(null, {}); - var currentParent = root; - var stack = [root]; - var lastTextPos = -1; - - options = options || {}; - - for (var match, text; match = kMarkupPattern.exec(data); ) { - if (lastTextPos > -1) { - if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) { - // if has content - text = data.substring(lastTextPos, kMarkupPattern.lastIndex - match[0].length); - currentParent.appendChild(new TextNode(text)); - } - } - lastTextPos = kMarkupPattern.lastIndex; - if (match[0][1] == '!') { - // this is a comment - continue; - } - if (options.lowerCaseTagName) - match[2] = match[2].toLowerCase(); - if (!match[1]) { - // not or ... - var closeMarkup = ''; - var index = data.indexOf(closeMarkup, kMarkupPattern.lastIndex); - if (options[match[2]]) { - if (index == -1) { - // there is no matching ending for the text element. - text = data.substr(kMarkupPattern.lastIndex); - } else { - text = data.substring(kMarkupPattern.lastIndex, index); + Node: Node, + HTMLElement: HTMLElement, + TextNode: TextNode, + + /** + * Parse a chuck of HTML source. + * @param {string} data html + * @return {HTMLElement} root element + */ + parse: function (data, options) { + var root = new HTMLElement(null, {}); + var currentParent = root; + var stack = [root]; + var lastTextPos = -1; + + options = options || {}; + + for (var match, text; match = kMarkupPattern.exec(data);) { + if (lastTextPos > -1 && (lastTextPos + match[TAG_FULL].length < kMarkupPattern.lastIndex)) { + // if has content + text = data.substring(lastTextPos, kMarkupPattern.lastIndex - match[0].length); + currentParent.appendChild(new TextNode(text)); } - if (text.length > 0) - currentParent.appendChild(new TextNode(text)); - } - if (index == -1) { - lastTextPos = kMarkupPattern.lastIndex = data.length + 1; - } else { - lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length; - match[1] = true; - } - } - } - if (match[1] || match[4] || - kSelfClosingElements[match[2]]) { - // or
etc. - while (true) { - if (currentParent.tagName == match[2]) { - stack.pop(); - currentParent = stack.back; - break; - } else { - // Trying to close current tag, and move on - if (kElementsClosedByClosing[currentParent.tagName]) { - if (kElementsClosedByClosing[currentParent.tagName][match[2]]) { - stack.pop(); - currentParent = stack.back; + lastTextPos = kMarkupPattern.lastIndex; + if (match[TAG_FULL][1] == '!') { + // this is a comment continue; - } } - // Use aggressive strategy to handle unmatching markups. - break; - } + if (!match[TAG_CLOSING_CHAR]) { + // not or ... + var closeMarkup = ''; + var index = data.indexOf(closeMarkup, kMarkupPattern.lastIndex); + if (options[match[TAG_NAME]]) { + if (index == -1) { + // there is no matching ending for the text element. + text = data.substr(kMarkupPattern.lastIndex); + } else { + text = data.substring(kMarkupPattern.lastIndex, index); + } + if (text.length > 0) { + currentParent.appendChild(new TextNode(text)); + } + } + if (index == -1) { + lastTextPos = kMarkupPattern.lastIndex = data.length + 1; + } else { + lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length; + match[TAG_CLOSING_CHAR] = true; + } + } + } + if (match[TAG_CLOSING_CHAR] || match[SELF_CLOSING_CHAR] || kSelfClosingElements[match[TAG_NAME]]) { + // or
etc. + while (true) { + if (currentParent.tagName == match[TAG_NAME]) { + stack.pop(); + currentParent = stack.back; + break; + } else { + // Trying to close current tag, and move on + if (kElementsClosedByClosing[currentParent.tagName]) { + if (kElementsClosedByClosing[currentParent.tagName][match[TAG_NAME]]) { + stack.pop(); + currentParent = stack.back; + continue; + } + } + // Use aggressive strategy to handle unmatching markups. + break; + } + } + } } - } - } - - return root; - - } + return root; + } }; diff --git a/package.json b/package.json index 5b4fb67..c95eb4b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "fast-html-parser", - "version": "1.0.1", + "version": "1.0.3-col", "description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.", "main": "index.js", "scripts": { diff --git a/test/html.js b/test/html.js index eaa22df..fa29174 100644 --- a/test/html.js +++ b/test/html.js @@ -6,60 +6,9 @@ var HTMLParser = require('../'); describe('HTML Parser', function() { - var Matcher = HTMLParser.Matcher; var HTMLElement = HTMLParser.HTMLElement; var TextNode = HTMLParser.TextNode; - describe('Matcher', function() { - - it('should match corrent elements', function() { - - var matcher = new Matcher('#id .a a.b *.a.b .a.b * a'); - var MatchesNothingButStarEl = new HTMLElement('_', {}); - var withIdEl = new HTMLElement('p', { id: 'id' }); - var withClassNameEl = new HTMLElement('a', { class: 'a b' }); - - // console.log(util.inspect([withIdEl, withClassNameEl], { - // showHidden: true, - // depth: null - // })); - - matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // #id - matcher.advance(withClassNameEl).should.not.be.ok; // #id - matcher.advance(withIdEl).should.be.ok; // #id - - matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a - matcher.advance(withIdEl).should.not.be.ok; // .a - matcher.advance(withClassNameEl).should.be.ok; // .a - - matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a.b - matcher.advance(withIdEl).should.not.be.ok; // a.b - matcher.advance(withClassNameEl).should.be.ok; // a.b - - matcher.advance(withIdEl).should.not.be.ok; // *.a.b - matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // *.a.b - matcher.advance(withClassNameEl).should.be.ok; // *.a.b - - matcher.advance(withIdEl).should.not.be.ok; // .a.b - matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a.b - matcher.advance(withClassNameEl).should.be.ok; // .a.b - - matcher.advance(withIdEl).should.be.ok; // * - matcher.rewind(); - matcher.advance(MatchesNothingButStarEl).should.be.ok; // * - matcher.rewind(); - matcher.advance(withClassNameEl).should.be.ok; // * - - matcher.advance(withIdEl).should.not.be.ok; // a - matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a - matcher.advance(withClassNameEl).should.be.ok; // a - - matcher.matched.should.be.ok; - - }); - - }); - var parseHTML = HTMLParser.parse; describe('parse()', function() { @@ -82,9 +31,7 @@ describe('HTML Parser', function() { it('should parse "

" and return root element', function() { - var root = parseHTML('

', { - lowerCaseTagName: true - }); + var root = parseHTML('

'); var div = new HTMLElement('div', {}, ''); var a = div.appendChild(new HTMLElement('a', {}, '')); @@ -143,53 +90,21 @@ describe('HTML Parser', function() { it('should parse "

.." very fast', function() { - for (var i = 0; i < 100; i++) - parseHTML('

'); + for (var i = 0; i < 100; i++) + parseHTML('

'); }); it('should parse "

.." fast', function() { for (var i = 0; i < 100; i++) - parseHTML('

', { - lowerCaseTagName: true - }); - - }); - - }); - - describe('TextNode', function() { - - describe('#isWhitespace', function() { - var node = new TextNode(''); - node.isWhitespace.should.be.ok; - node = new TextNode(' \t'); - node.isWhitespace.should.be.ok; - node = new TextNode(' \t  \t'); - node.isWhitespace.should.be.ok; + parseHTML('

'); }); }); describe('HTMLElement', function() { - describe('#removeWhitespace()', function() { - - it('should remove whitespaces while preserving nodes with content', function() { - - var root = parseHTML('

\r \n \t

123

'); - - var p = new HTMLElement('p', {}, ''); - p.appendChild(new HTMLElement('h5', {}, '')) - .appendChild(new TextNode('123')); - - root.firstChild.removeWhitespace().should.eql(p); - - }); - - }); - describe('#rawAttributes', function() { it('should return escaped attributes of the element', function() { @@ -222,34 +137,6 @@ describe('HTML Parser', function() { }); - describe('#querySelectorAll()', function() { - - it('should return correct elements in DOM tree', function() { - - var root = parseHTML('
'); - - root.querySelectorAll('#id').should.eql([root.firstChild]); - root.querySelectorAll('span.a').should.eql([root.firstChild.firstChild.firstChild]); - root.querySelectorAll('span.b').should.eql([root.firstChild.firstChild.firstChild]); - root.querySelectorAll('span.a.b').should.eql([root.firstChild.firstChild.firstChild]); - root.querySelectorAll('#id .b').should.eql([root.firstChild.firstChild.firstChild]); - root.querySelectorAll('#id span').should.eql(root.firstChild.firstChild.childNodes); - - }); - - }); - - describe('#structuredText', function() { - - it('should return correct structured text', function() { - - var root = parseHTML('o

a

b

c
'); - root.structuredText.should.eql('o\na\nb\nc'); - - }); - - }); - }); });