diff --git a/index.js b/index.js
index 38d7e04..76cf79a 100644
--- a/index.js
+++ b/index.js
@@ -1,19 +1,16 @@
require('apollojs');
-
var entities = require('entities');
/**
* Node Class as base class for TextNode and HTMLElement.
*/
function Node() {
-
}
-$declare(Node, {
-});
+$declare(Node, {});
$defenum(Node, {
- ELEMENT_NODE: 1,
- TEXT_NODE: 3
+ ELEMENT_NODE: 1,
+ TEXT_NODE: 3
});
/**
@@ -21,47 +18,19 @@ $defenum(Node, {
* @param {string} value [description]
*/
function TextNode(value) {
- this.rawText = value;
+ this.rawText = value;
}
$inherit(TextNode, Node, {
-
- /**
- * Node Type declaration.
- * @type {Number}
- */
- nodeType: Node.TEXT_NODE,
-
- /**
- * Get unescaped text value of current node and its children.
- * @return {string} text content
- */
- get text() {
- return entities.decodeHTML5(this.rawText);
- },
-
- /**
- * Detect if the node contains only white space.
- * @return {bool}
- */
- get isWhitespace() {
- return /^(\s|&nbsp;)*$/.test(this.rawText);
- }
-
+ /**
+ * Node Type declaration.
+ * @type {Number}
+ */
+ nodeType: Node.TEXT_NODE,
+ get text() {
+ return entities.decodeHTML5(this.rawText);
+ }
});
-var kBlockElements = {
- div: true,
- p: true,
- // ul: true,
- // ol: true,
- li: true,
- // table: true,
- // tr: true,
- td: true,
- section: true,
- br: true
-};
-
/**
* HTMLElement, which contains a set of children.
* Note: this is a minimalist implementation, no complete tree
@@ -72,432 +41,326 @@ var kBlockElements = {
* @param {Object} rawAttrs attributes in string
*/
function HTMLElement(name, keyAttrs, rawAttrs) {
- this.tagName = name;
- this.rawAttrs = rawAttrs || '';
- // this.parentNode = null;
- this.childNodes = [];
- if (keyAttrs.id)
- this.id = keyAttrs.id;
- if (keyAttrs.class)
- this.classNames = keyAttrs.class.split(/\s+/);
- else
- this.classNames = [];
+ this.tagName = name;
+ this.rawAttrs = rawAttrs || '';
+ this.childNodes = [];
+ if (keyAttrs.id) {
+ this.id = keyAttrs.id;
+ }
}
$inherit(HTMLElement, Node, {
-
- /**
- * Node Type declaration.
- * @type {Number}
- */
- nodeType: Node.ELEMENT_NODE,
-
- /**
- * Get unescaped text value of current node and its children.
- * @return {string} text content
- */
- get text() {
- return entities.decodeHTML5(this.rawText);
- },
-
- /**
- * Get escpaed (as-it) text value of current node and its children.
- * @return {string} text content
- */
- get rawText() {
- var res = '';
- for (var i = 0; i < this.childNodes.length; i++)
- res += this.childNodes[i].rawText;
- return res;
- },
-
- /**
- * Get structured Text (with '\n' etc.)
- * @return {string} structured text
- */
- get structuredText() {
- var currentBlock = [];
- var blocks = [currentBlock];
- function dfs(node) {
- if (node.nodeType === Node.ELEMENT_NODE) {
- if (kBlockElements[node.tagName]) {
- if (currentBlock.length > 0)
- blocks.push(currentBlock = []);
- node.childNodes.forEach(dfs);
- if (currentBlock.length > 0)
- blocks.push(currentBlock = []);
- } else {
- node.childNodes.forEach(dfs);
+ /**
+ * Node Type declaration.
+ * @type {Number}
+ */
+ nodeType: Node.ELEMENT_NODE,
+ /**
+ * Get unescaped text value of current node and its children.
+ * @return {string} text content
+ */
+ get text() {
+ return entities.decodeHTML5(this.rawText);
+ },
+
+ /**
+ * Get escaped (as-is) text value of current node and its children.
+ * @return {string} text content
+ */
+ get rawText() {
+ var res = '';
+ for (var i = 0; i < this.childNodes.length; i++) {
+ res += this.childNodes[i].rawText;
}
- } else if (node.nodeType === Node.TEXT_NODE) {
- if (node.isWhitespace) {
- // Whitespace node, postponed output
- currentBlock.prependWhitespace = true;
- } else {
- var text = node.text;
- if (currentBlock.prependWhitespace) {
- text = ' ' + text;
- currentBlock.prependWhitespace = false;
- }
- currentBlock.push(text);
+ return res;
+ },
+
+ /**
+ * Append a child node to childNodes
+ * @param {Node} node node to append
+ * @return {Node} node appended
+ */
+ appendChild: function (node) {
+ // node.parentNode = this;
+ this.childNodes.push(node);
+ return node;
+ },
+
+ get firstChild() {
+ return this.childNodes.front;
+ },
+
+ /**
+ * Get last child node
+ * @return {Node} last child node
+ */
+ get lastChild() {
+ return this.childNodes.back;
+ },
+ /**
+ * Get attributes
+ * @return {Object} parsed and unescaped attributes
+ */
+ get attributes() {
+ if (this._attrs) {
+ return this._attrs;
}
- }
- }
- dfs(this);
- return blocks
- .map(function(block) {
- // Normalize each line's whitespace
- return block.join('').trim().replace(/\s{2,}/g, ' ');
- })
- .join('\n').trimRight();
- },
-
- /**
- * Trim element from right (in block) after seeing pattern in a TextNode.
- * @param {RegExp} pattern pattern to find
- * @return {HTMLElement} reference to current node
- */
- trimRight: function(pattern) {
- function dfs(node) {
- for (var i = 0; i < node.childNodes.length; i++) {
- var childNode = node.childNodes[i];
- if (childNode.nodeType === Node.ELEMENT_NODE) {
- dfs(childNode);
- } else {
- var index = childNode.rawText.search(pattern);
- if (index > -1) {
- childNode.rawText = childNode.rawText.substr(0, index);
- // trim all following nodes.
- node.childNodes.length = i+1;
- }
+ this._attrs = {};
+ var attrs = this.rawAttributes;
+ for (var key in attrs) {
+ this._attrs[key] = entities.decodeHTML5(attrs[key]);
}
- }
- }
- dfs(this);
- return this;
- },
-
- /**
- * Get DOM structure
- * @return {string} strucutre
- */
- get structure() {
- var res = [];
- var indention = 0;
- function write(str) {
- res.push(' '.repeat(indention) + str);
- }
- function dfs(node) {
- var idStr = node.id ? ('#' + node.id) : '';
- var classStr = node.classNames.length ? ('.' + node.classNames.join('.')) : '';
- write(node.tagName + idStr + classStr);
- indention++;
- for (var i = 0; i < node.childNodes.length; i++) {
- var childNode = node.childNodes[i];
- if (childNode.nodeType === Node.ELEMENT_NODE) {
- dfs(childNode);
- } else if (childNode.nodeType === Node.TEXT_NODE) {
- if (!childNode.isWhitespace)
- write('#text');
+ return this._attrs;
+ },
+
+ /**
+ * Get escaped (as-it) attributes
+ * @return {Object} parsed attributes
+ */
+ get rawAttributes() {
+ if (this._rawAttrs) {
+ return this._rawAttrs;
}
- }
- indention--;
- }
- dfs(this);
- return res.join('\n');
- },
-
- /**
- * Remove whitespaces in this sub tree.
- * @return {HTMLElement} pointer to this
- */
- removeWhitespace: function() {
- var i = 0, o = 0;
- for (; i < this.childNodes.length; i++) {
- var node = this.childNodes[i];
- if (node.nodeType === Node.TEXT_NODE) {
- if (node.isWhitespace)
- continue;
- node.rawText = node.rawText.trim();
- } else if (node.nodeType === Node.ELEMENT_NODE) {
- node.removeWhitespace();
- }
- this.childNodes[o++] = node;
- }
- this.childNodes.length = o;
- return this;
- },
-
- /**
- * Query CSS selector to find matching nodes.
- * @param {string} selector Simplified CSS selector
- * @param {Matcher} selector A Matcher instance
- * @return {HTMLElement[]} matching elements
- */
- querySelectorAll: function(selector) {
- var matcher;
- if (selector instanceof Matcher) {
- matcher = selector;
- matcher.reset();
- } else {
- matcher = new Matcher(selector);
- }
- var res = [];
- var stack = [];
- for (var i = 0; i < this.childNodes.length; i++) {
- stack.push([this.childNodes[i], 0, false]);
- while (stack.length) {
- var state = stack.back;
- var el = state[0];
- if (state[1] === 0) {
- // Seen for first time.
- if (el.nodeType !== Node.ELEMENT_NODE) {
- stack.pop();
- continue;
- }
- if (state[2] = matcher.advance(el)) {
- if (matcher.matched) {
- res.push(el);
- // no need to go further.
- matcher.rewind();
- stack.pop();
- continue;
+ var attrs = {};
+ if (this.rawAttrs) {
+ var re = /\b([_a-z][a-z0-9\-]*)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig;
+ for (var match; match = re.exec(this.rawAttrs);) {
+ attrs[match[1]] = match[3] || match[4] || match[5];
}
- }
}
- if (state[1] < el.childNodes.length) {
- stack.push([el.childNodes[state[1]++], 0, false]);
- } else {
- if (state[2])
- matcher.rewind();
- stack.pop();
- }
- }
+ this._rawAttrs = attrs;
+ return attrs;
}
- return res;
- },
-
- /**
- * Query CSS Selector to find matching node.
- * @param {string} selector Simplified CSS selector
- * @param {Matcher} selector A Matcher instance
- * @return {HTMLElement} matching node
- */
- querySelector: function(selector) {
- var matcher;
- if (selector instanceof Matcher) {
- matcher = selector;
- matcher.reset();
- } else {
- matcher = new Matcher(selector);
+});
+$define(HTMLElement, {});
+
+// parser statuses
+var INITIAL = 'INITIAL';
+var TAG_OPENED = 'TAG_OPENED';
+var READ_TAG_NAME = 'READ_TAG_NAME';
+var READ_ATTRIBUTES = 'READ_ATTRIBUTES';
+var IS_SELF_CLOSING = 'IS_SELF_CLOSING';
+var TAG_CLOSE = 'TAG_CLOSE';
+
+// indexes for match
+var TAG_FULL = 0;
+var TAG_CLOSING_CHAR = 1;
+var TAG_NAME = 2;
+var ATTRIBUTES = 3;
+var SELF_CLOSING_CHAR = 4;
+
+/**
+ * Function, which works like RegExp (contains exec function and lastIndex param)
+ */
+var kMarkupPattern = (function () {
+ var lastIndex = 0;
+
+ /**
+ * Make state object from params
+ * @param {string} status next parser status
+ * @param {Array} match array which contains parsed values
+ * @param {Number} openedBracketCounter counter of unclosed tag brackets
+ */
+ function makeState(status, match, openedBracketCounter) {
+ return {
+ status: status,
+ match: match,
+ openedBracketCounter: openedBracketCounter
+ };
}
- var stack = [];
- for (var i = 0; i < this.childNodes.length; i++) {
- stack.push([this.childNodes[i], 0, false]);
- while (stack.length) {
- var state = stack.back;
- var el = state[0];
- if (state[1] === 0) {
- // Seen for first time.
- if (el.nodeType !== Node.ELEMENT_NODE) {
- stack.pop();
- continue;
- }
- if (state[2] = matcher.advance(el)) {
- if (matcher.matched) {
- return el;
- }
- }
+
+ // Status functions that determine how the next character is processed
+
+ /**
+ * Initial status function
+ * @param {Array} match array which contains parsed values
+ * @param {string} sym processed character
+ * @param {Number} index index of processed character
+ */
+ function initial(match, sym, index) {
+ if (sym === '<') {
+ match['index'] = index;
+ return makeState(TAG_OPENED, match, 1);
}
- if (state[1] < el.childNodes.length) {
- stack.push([el.childNodes[state[1]++], 0, false]);
+
+ return makeState(INITIAL, match, 0);
+ }
+
+ /**
+ * Processing of the character immediately following the opening bracket
+ * @param {Array} match array which contains parsed values
+ * @param {string} sym processed character
+ */
+ function tagOpened(match, sym) {
+ if (sym === '/') {
+ match[TAG_CLOSING_CHAR] = '/';
} else {
- if (state[2])
- matcher.rewind();
- stack.pop();
+ match[TAG_NAME] += sym;
}
- }
+
+ return makeState(READ_TAG_NAME, match, 1);
}
- return null;
- },
-
- /**
- * Append a child node to childNodes
- * @param {Node} node node to append
- * @return {Node} node appended
- */
- appendChild: function(node) {
- // node.parentNode = this;
- this.childNodes.push(node);
- return node;
- },
-
- /**
- * Get first child node
- * @return {Node} first child node
- */
- get firstChild() {
- return this.childNodes.front;
- },
-
- /**
- * Get last child node
- * @return {Node} last child node
- */
- get lastChild() {
- return this.childNodes.back;
- },
-
- /**
- * Get attributes
- * @return {Object} parsed and unescaped attributes
- */
- get attributes() {
- if (this._attrs)
- return this._attrs;
- this._attrs = {};
- var attrs = this.rawAttributes;
- for (var key in attrs) {
- this._attrs[key] = entities.decodeHTML5(attrs[key]);
+
+ /**
+ * Processing of the characters in tag name
+ * @param {Array} match array which contains parsed values
+ * @param {string} sym processed character
+ */
+ function readTagName(match, sym) {
+ switch (sym) {
+ case ' ':
+ return makeState(READ_ATTRIBUTES, match, 1);
+ case '/':
+ return makeState(IS_SELF_CLOSING, match, 1);
+ case '>':
+ return makeState(TAG_CLOSE, match, 1);
+ default:
+ match[TAG_NAME] += sym;
+ return makeState(READ_TAG_NAME, match, 1);
+ }
}
- return this._attrs;
- },
-
- /**
- * Get escaped (as-it) attributes
- * @return {Object} parsed attributes
- */
- get rawAttributes() {
- if (this._rawAttrs)
- return this._rawAttrs;
- var attrs = {};
- if (this.rawAttrs) {
- var re = /\b([a-z][a-z0-9\-]*)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig;
- for (var match; match = re.exec(this.rawAttrs); )
- attrs[match[1]] = match[3] || match[4] || match[5];
+
+ /**
+ * Processing of the characters in attributes
+ * @param {Array} match array which contains parsed values
+ * @param {string} sym processed character
+ * @param {Number} openedBracketCounter counter of unclosed tag brackets
+ */
+ function readAttributes(match, sym, openedBracketCounter) {
+ switch (sym) {
+ case '/':
+ if (openedBracketCounter === 1) {
+ return makeState(IS_SELF_CLOSING, match, openedBracketCounter);
+ }
+
+ match[ATTRIBUTES] += sym;
+ return makeState(READ_ATTRIBUTES, match, openedBracketCounter);
+ case '>':
+ if (--openedBracketCounter) {
+ match[ATTRIBUTES] += sym;
+ return makeState(READ_ATTRIBUTES, match, openedBracketCounter);
+ }
+
+ return makeState(TAG_CLOSE, match, openedBracketCounter);
+ case '<':
+ ++openedBracketCounter;
+ // without break, it's not a mistake
+ default:
+ match[ATTRIBUTES] += sym;
+ return makeState(READ_ATTRIBUTES, match, openedBracketCounter);
+ }
}
- this._rawAttrs = attrs;
- return attrs;
- }
-});
-$define(HTMLElement, {
- __wrap: function(el) {
- el.childNodes.forEach(function(node) {
- if (node.rawText) {
- $wrap(node, TextNode);
- } else {
- $wrap(node, HTMLElement);
- }
- });
- }
-});
+ /**
+ * Processing of the character immediately following the character '/'
+ * @param {Array} match array which contains parsed values
+ * @param {string} sym processed character
+ */
+ function isSelfClosing(match, sym) {
+ if (sym === '>') {
+ match[SELF_CLOSING_CHAR] = '/';
+ return makeState(TAG_CLOSE, match, 0);
+ }
-/**
- * Cache to store generated match functions
- * @type {Object}
- */
-var pMatchFunctionCache = {};
+ match[ATTRIBUTES] += '/' + sym;
+ return makeState(READ_ATTRIBUTES, match, 1);
+ }
-/**
- * Matcher class to make CSS match
- * @param {string} selector Selector
- */
-function Matcher(selector) {
- this.matchers = selector.split(' ').map(function(matcher) {
- if (pMatchFunctionCache[matcher])
- return pMatchFunctionCache[matcher];
- var parts = matcher.split('.');
- var tagName = parts[0];
- var classes = parts.slice(1).sort();
- var source = '';
- if (tagName && tagName != '*') {
- if (tagName[0] == '#')
- source += 'if (el.id != ' + JSON.stringify(tagName.substr(1)) + ') return false;';
- else
- source += 'if (el.tagName != ' + JSON.stringify(tagName) + ') return false;';
+ /**
+ * Final processing of the string
+ * @param {Array} match array which contains parsed values
+ * @param {string} str processed string
+ * @param {Number} index index of processed character
+ */
+ function tagClose(match, str, index) {
+ lastIndex = index;
+ match[TAG_FULL] = str.slice(match['index'], index);
+
+ return makeState(INITIAL, match, 0);
}
- if (classes.length > 0)
- source += 'for (var cls = ' + JSON.stringify(classes) + ', i = 0; i < cls.length; i++) if (el.classNames.indexOf(cls[i]) === -1) return false;';
- source += 'return true;';
- return pMatchFunctionCache[matcher] = new Function('el', source);
- });
- this.nextMatch = 0;
-}
-$declare(Matcher, {
- /**
- * Trying to advance match pointer
- * @param {HTMLElement} el element to make the match
- * @return {bool} true when pointer advanced.
- */
- advance: function(el) {
- if (this.nextMatch < this.matchers.length &&
- this.matchers[this.nextMatch](el)) {
- this.nextMatch++;
- return true;
+
+ return {
+ exec: function (str) {
+ // state.match :: [TAG_FULL, TAG_CLOSING_CHAR, TAG_NAME, ATTRIBUTES, SELF_CLOSING_CHAR]
+ var state = {
+ status: INITIAL,
+ match: ['', '', '', '', ''],
+ openedBracketCounter: 0
+ };
+ state.match['input'] = str;
+
+ for (var i = lastIndex; i < str.length; ++i) {
+ switch (state.status) {
+ case INITIAL:
+ state = initial(state.match, str[i], i);
+ break;
+ case TAG_OPENED:
+ state = tagOpened(state.match, str[i]);
+ break;
+ case READ_TAG_NAME:
+ state = readTagName(state.match, str[i]);
+ break;
+ case READ_ATTRIBUTES:
+ state = readAttributes(state.match, str[i], state.openedBracketCounter);
+ break;
+ case IS_SELF_CLOSING:
+ state = isSelfClosing(state.match, str[i]);
+ break;
+ case TAG_CLOSE:
+ state = tagClose(state.match, str, i);
+ return state.match;
+ default:
+ break;
+ }
+ }
+ if (state.status === TAG_CLOSE) {
+ state = tagClose(state.match, str, str.length);
+ return state.match;
+ }
+
+ lastIndex = 0;
+ return null;
+ },
+ get lastIndex() {
+ return lastIndex;
+ },
+ set lastIndex(newLastIndex) {
+ lastIndex = newLastIndex;
+ }
}
- return false;
- },
- /**
- * Rewind the match pointer
- */
- rewind: function() {
- this.nextMatch--;
- },
- /**
- * Trying to determine if match made.
- * @return {bool} true when the match is made
- */
- get matched() {
- return this.nextMatch == this.matchers.length;
- },
- /**
- * Rest match pointer.
- * @return {[type]} [description]
- */
- reset: function() {
- this.nextMatch = 0;
- }
-});
-$define(Matcher, {
- /**
- * flush cache to free memory
- */
- flushCache: function() {
- pMatchFunctionCache = {};
- }
-});
+})();
-var kMarkupPattern = /<!--[^]*?(?=-->)-->|<(\/?)([a-z][a-z0-9]*)\s*([^>]*?)(\/?)>/ig;
var kAttributePattern = /\b(id|class)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig;
var kSelfClosingElements = {
- meta: true,
- img: true,
- link: true,
- input: true,
- area: true,
- br: true,
- hr: true
+ meta: true,
+ img: true,
+ link: true,
+ input: true,
+ area: true,
+ br: true,
+ hr: true,
+ wbr: true,
+ col: true
};
var kElementsClosedByOpening = {
- li: {li: true},
- p: {p: true, div: true},
- td: {td: true, th: true},
- th: {td: true, th: true}
+ li: {li: true},
+ p: {p: true, div: true},
+ td: {td: true, th: true},
+ th: {td: true, th: true}
};
var kElementsClosedByClosing = {
- li: {ul: true, ol: true},
- a: {div: true},
- b: {div: true},
- i: {div: true},
- p: {div: true},
- td: {tr: true, table: true},
- th: {tr: true, table: true}
+ li: {ul: true, ol: true},
+ a: {div: true},
+ b: {div: true},
+ i: {div: true},
+ p: {div: true},
+ td: {tr: true, table: true},
+ th: {tr: true, table: true}
};
var kBlockTextElements = {
- script: true,
- noscript: true,
- style: true,
- pre: true
+ script: true,
+ noscript: true,
+ style: true,
+ pre: true
};
/**
@@ -505,103 +368,96 @@ var kBlockTextElements = {
*/
module.exports = {
- Matcher: Matcher,
- Node: Node,
- HTMLElement: HTMLElement,
- TextNode: TextNode,
-
- /**
- * Parse a chuck of HTML source.
- * @param {string} data html
- * @return {HTMLElement} root element
- */
- parse: function(data, options) {
-
- var root = new HTMLElement(null, {});
- var currentParent = root;
- var stack = [root];
- var lastTextPos = -1;
-
- options = options || {};
-
- for (var match, text; match = kMarkupPattern.exec(data); ) {
- if (lastTextPos > -1) {
- if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) {
- // if has content
- text = data.substring(lastTextPos, kMarkupPattern.lastIndex - match[0].length);
- currentParent.appendChild(new TextNode(text));
- }
- }
- lastTextPos = kMarkupPattern.lastIndex;
- if (match[0][1] == '!') {
- // this is a comment
- continue;
- }
- if (options.lowerCaseTagName)
- match[2] = match[2].toLowerCase();
- if (!match[1]) {
- // not </ tags
- var attrs = {};
- for (var attMatch; attMatch = kAttributePattern.exec(match[3]); )
- attrs[attMatch[1]] = attMatch[3] || attMatch[4] || attMatch[5];
- // console.log(attrs);
- if (!match[4] && kElementsClosedByOpening[currentParent.tagName]) {
- if (kElementsClosedByOpening[currentParent.tagName][match[2]]) {
- stack.pop();
- currentParent = stack.back;
- }
- }
- currentParent = currentParent.appendChild(
- new HTMLElement(match[2], attrs, match[3]));
- stack.push(currentParent);
- if (kBlockTextElements[match[2]]) {
- // a little test to find next </script> or </style> ...
- var closeMarkup = '</' + match[2] + '>';
- var index = data.indexOf(closeMarkup, kMarkupPattern.lastIndex);
- if (options[match[2]]) {
- if (index == -1) {
- // there is no matching ending for the text element.
- text = data.substr(kMarkupPattern.lastIndex);
- } else {
- text = data.substring(kMarkupPattern.lastIndex, index);
+ Node: Node,
+ HTMLElement: HTMLElement,
+ TextNode: TextNode,
+
+ /**
+ * Parse a chunk of HTML source.
+ * @param {string} data html
+ * @return {HTMLElement} root element
+ */
+ parse: function (data, options) {
+ var root = new HTMLElement(null, {});
+ var currentParent = root;
+ var stack = [root];
+ var lastTextPos = -1;
+
+ options = options || {};
+
+ for (var match, text; match = kMarkupPattern.exec(data);) {
+ if (lastTextPos > -1 && (lastTextPos + match[TAG_FULL].length < kMarkupPattern.lastIndex)) {
+ // if has content
+ text = data.substring(lastTextPos, kMarkupPattern.lastIndex - match[0].length);
+ currentParent.appendChild(new TextNode(text));
}
- if (text.length > 0)
- currentParent.appendChild(new TextNode(text));
- }
- if (index == -1) {
- lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
- } else {
- lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length;
- match[1] = true;
- }
- }
- }
- if (match[1] || match[4] ||
- kSelfClosingElements[match[2]]) {
- // </ or /> or <br> etc.
- while (true) {
- if (currentParent.tagName == match[2]) {
- stack.pop();
- currentParent = stack.back;
- break;
- } else {
- // Trying to close current tag, and move on
- if (kElementsClosedByClosing[currentParent.tagName]) {
- if (kElementsClosedByClosing[currentParent.tagName][match[2]]) {
- stack.pop();
- currentParent = stack.back;
+ lastTextPos = kMarkupPattern.lastIndex;
+ if (match[TAG_FULL][1] == '!') {
+ // this is a comment
continue;
- }
}
- // Use aggressive strategy to handle unmatching markups.
- break;
- }
+ if (!match[TAG_CLOSING_CHAR]) {
+ // not </ tags
+ var attrs = {};
+ for (var attMatch; attMatch = kAttributePattern.exec(match[ATTRIBUTES]);) {
+ attrs[attMatch[1]] = attMatch[3] || attMatch[4] || attMatch[5];
+ }
+ if (
+ !match[SELF_CLOSING_CHAR]
+ && kElementsClosedByOpening[currentParent.tagName]
+ && kElementsClosedByOpening[currentParent.tagName][match[TAG_NAME]]
+ ) {
+ stack.pop();
+ currentParent = stack.back;
+ }
+ currentParent = currentParent.appendChild(new HTMLElement(match[TAG_NAME], attrs, match[ATTRIBUTES]));
+ stack.push(currentParent);
+ if (kBlockTextElements[match[TAG_NAME]]) {
+ // a little test to find next </script> or </style> ...
+ var closeMarkup = '</' + match[TAG_NAME] + '>';
+ var index = data.indexOf(closeMarkup, kMarkupPattern.lastIndex);
+ if (options[match[TAG_NAME]]) {
+ if (index == -1) {
+ // there is no matching ending for the text element.
+ text = data.substr(kMarkupPattern.lastIndex);
+ } else {
+ text = data.substring(kMarkupPattern.lastIndex, index);
+ }
+ if (text.length > 0) {
+ currentParent.appendChild(new TextNode(text));
+ }
+ }
+ if (index == -1) {
+ lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
+ } else {
+ lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length;
+ match[TAG_CLOSING_CHAR] = true;
+ }
+ }
+ }
+ if (match[TAG_CLOSING_CHAR] || match[SELF_CLOSING_CHAR] || kSelfClosingElements[match[TAG_NAME]]) {
+ // </ or /> or <br> etc.
+ while (true) {
+ if (currentParent.tagName == match[TAG_NAME]) {
+ stack.pop();
+ currentParent = stack.back;
+ break;
+ } else {
+ // Trying to close current tag, and move on
+ if (kElementsClosedByClosing[currentParent.tagName]) {
+ if (kElementsClosedByClosing[currentParent.tagName][match[TAG_NAME]]) {
+ stack.pop();
+ currentParent = stack.back;
+ continue;
+ }
+ }
+ // Use aggressive strategy to handle unmatching markups.
+ break;
+ }
+ }
+ }
}
- }
- }
-
- return root;
-
- }
+ return root;
+ }
};
diff --git a/package.json b/package.json
index 5b4fb67..c95eb4b 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "fast-html-parser",
- "version": "1.0.1",
+ "version": "1.0.3-col",
"description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.",
"main": "index.js",
"scripts": {
diff --git a/test/html.js b/test/html.js
index eaa22df..fa29174 100644
--- a/test/html.js
+++ b/test/html.js
@@ -6,60 +6,9 @@ var HTMLParser = require('../');
describe('HTML Parser', function() {
- var Matcher = HTMLParser.Matcher;
var HTMLElement = HTMLParser.HTMLElement;
var TextNode = HTMLParser.TextNode;
- describe('Matcher', function() {
-
- it('should match corrent elements', function() {
-
- var matcher = new Matcher('#id .a a.b *.a.b .a.b * a');
- var MatchesNothingButStarEl = new HTMLElement('_', {});
- var withIdEl = new HTMLElement('p', { id: 'id' });
- var withClassNameEl = new HTMLElement('a', { class: 'a b' });
-
- // console.log(util.inspect([withIdEl, withClassNameEl], {
- // showHidden: true,
- // depth: null
- // }));
-
- matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // #id
- matcher.advance(withClassNameEl).should.not.be.ok; // #id
- matcher.advance(withIdEl).should.be.ok; // #id
-
- matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a
- matcher.advance(withIdEl).should.not.be.ok; // .a
- matcher.advance(withClassNameEl).should.be.ok; // .a
-
- matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a.b
- matcher.advance(withIdEl).should.not.be.ok; // a.b
- matcher.advance(withClassNameEl).should.be.ok; // a.b
-
- matcher.advance(withIdEl).should.not.be.ok; // *.a.b
- matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // *.a.b
- matcher.advance(withClassNameEl).should.be.ok; // *.a.b
-
- matcher.advance(withIdEl).should.not.be.ok; // .a.b
- matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a.b
- matcher.advance(withClassNameEl).should.be.ok; // .a.b
-
- matcher.advance(withIdEl).should.be.ok; // *
- matcher.rewind();
- matcher.advance(MatchesNothingButStarEl).should.be.ok; // *
- matcher.rewind();
- matcher.advance(withClassNameEl).should.be.ok; // *
-
- matcher.advance(withIdEl).should.not.be.ok; // a
- matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a
- matcher.advance(withClassNameEl).should.be.ok; // a
-
- matcher.matched.should.be.ok;
-
- });
-
- });
-
var parseHTML = HTMLParser.parse;
describe('parse()', function() {
@@ -82,9 +31,7 @@ describe('HTML Parser', function() {
it('should parse "
\r \n \t
a
b
c'); - root.structuredText.should.eql('o\na\nb\nc'); - - }); - - }); - }); });