Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1039 can not parse correctly html with nested ul and li tags #1304

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
710 changes: 710 additions & 0 deletions packages/happy-dom/src/config/HTMLElementConfig.ts

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/**
 * Content models for HTML elements.
 *
 * Each value describes which descendants an element may have. The XML parser compares
 * the "contentModel" of an element's config entry against these values to decide when
 * an open element has to be closed automatically and how its content should be read.
 */
enum HTMLElementConfigContentModelEnum {
// The content is read as plain text: after the start tag the parser switches to its plain text read state (e.g. used for script and style content).
rawText = 'rawText',
// The element may not be nested anywhere inside an element with the same tag name.
// When a new start tag is found and the same tag name is already open, the parser closes open elements up to and including that element.
noSelfDescendants = 'noSelfDescendants',
// The element may not be a direct child of an element with the same tag name.
// When a new start tag is found and the currently open element has the same tag name, the parser closes that element first.
// NOTE(review): "Firs" looks like a typo for "First", but the member is referenced by this exact name, so it can't be renamed here without updating all usages.
noFirsLevelSelfDescendants = 'noFirsLevelSelfDescendants',
// The element can't have any content: the parser closes it as soon as its start tag has ended.
noDescendants = 'noDescendants',
// Presumably no nesting restriction is applied; no special handling for this value is visible in the parser. TODO confirm.
anyDescendants = 'anyDescendants'
}

export default HTMLElementConfigContentModelEnum;
119 changes: 0 additions & 119 deletions packages/happy-dom/src/config/HTMLElementLocalNameToClass.ts

This file was deleted.

4 changes: 0 additions & 4 deletions packages/happy-dom/src/config/HTMLElementPlainText.ts

This file was deleted.

18 changes: 0 additions & 18 deletions packages/happy-dom/src/config/HTMLElementUnnestable.ts

This file was deleted.

16 changes: 0 additions & 16 deletions packages/happy-dom/src/config/HTMLElementVoid.ts

This file was deleted.

8 changes: 8 additions & 0 deletions packages/happy-dom/src/config/IHTMLElementConfigEntity.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import HTMLElementConfigContentModelEnum from './HTMLElementConfigContentModelEnum.js';

/**
 * A single entry in the HTML element config.
 *
 * Entries are looked up in the config by the local name of an element.
 */
export default interface IHTMLElementConfigEntity {
// Name of the element class. Used as a property key on the owner window to resolve the class when creating an element.
className: string;
// Local name of the element. Presumably lower case and equal to the key of the entry in the config; TODO confirm.
localName: string;
// Tag name of the element. Presumably the upper case form of the local name; TODO confirm.
tagName: string;
// Content model of the element. Read by the parser to decide how the element may be nested and how its content is read.
contentModel: HTMLElementConfigContentModelEnum;
}
6 changes: 4 additions & 2 deletions packages/happy-dom/src/nodes/document/Document.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import DocumentFragment from '../document-fragment/DocumentFragment.js';
import XMLParser from '../../xml-parser/XMLParser.js';
import Event from '../../event/Event.js';
import DOMImplementation from '../../dom-implementation/DOMImplementation.js';
import HTMLElementLocalNameToClass from '../../config/HTMLElementLocalNameToClass.js';
import INodeFilter from '../../tree-walker/INodeFilter.js';
import NamespaceURI from '../../config/NamespaceURI.js';
import DocumentType from '../document-type/DocumentType.js';
Expand Down Expand Up @@ -51,6 +50,7 @@ import ISVGElementTagNameMap from '../../config/ISVGElementTagNameMap.js';
import ISVGElement from '../svg-element/ISVGElement.js';
import IHTMLFormElement from '../html-form-element/IHTMLFormElement.js';
import IHTMLAnchorElement from '../html-anchor-element/IHTMLAnchorElement.js';
import HTMLElementConfig from '../../config/HTMLElementConfig.js';

const PROCESSING_INSTRUCTION_TARGET_REGEXP = /^[a-z][a-z0-9-]+$/;

Expand Down Expand Up @@ -1131,7 +1131,9 @@ export default class Document extends Node implements IDocument {
}

const localName = qualifiedName.toLowerCase();
const elementClass = this[PropertySymbol.ownerWindow][HTMLElementLocalNameToClass[localName]];
const elementClass = HTMLElementConfig[localName]
? this[PropertySymbol.ownerWindow][HTMLElementConfig[localName].className]
: null;

// Known HTML element
if (elementClass) {
Expand Down
77 changes: 38 additions & 39 deletions packages/happy-dom/src/xml-parser/XMLParser.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
import IDocument from '../nodes/document/IDocument.js';
import * as PropertySymbol from '../PropertySymbol.js';
import HTMLElementVoid from '../config/HTMLElementVoid.js';
import HTMLElementUnnestable from '../config/HTMLElementUnnestable.js';
import NamespaceURI from '../config/NamespaceURI.js';
import HTMLScriptElement from '../nodes/html-script-element/HTMLScriptElement.js';
import IElement from '../nodes/element/IElement.js';
import HTMLLinkElement from '../nodes/html-link-element/HTMLLinkElement.js';
import HTMLElementPlainText from '../config/HTMLElementPlainText.js';
import IDocumentType from '../nodes/document-type/IDocumentType.js';
import INode from '../nodes/node/INode.js';
import IDocumentFragment from '../nodes/document-fragment/IDocumentFragment.js';
import HTMLElementConfig from '../config/HTMLElementConfig.js';
import * as Entities from 'entities';
import HTMLElementConfigContentModelEnum from '../config/HTMLElementConfigContentModelEnum.js';

/**
* Markup RegExp.
Expand Down Expand Up @@ -58,6 +57,8 @@ const DOCUMENT_TYPE_ATTRIBUTE_REGEXP = /"([^"]+)"/gm;

/**
* XML parser.
*
* @see https://html.spec.whatwg.org/multipage/indices.html
*/
export default class XMLParser {
/**
Expand All @@ -77,12 +78,11 @@ export default class XMLParser {
): IElement | IDocumentFragment | IDocument {
const root = options && options.rootNode ? options.rootNode : document.createDocumentFragment();
const stack: INode[] = [root];
const stackTagNames: string[] = [];
const markupRegexp = new RegExp(MARKUP_REGEXP, 'gm');
const { evaluateScripts = false } = options || {};
const unnestableTagNames: string[] = [];
let currentNode: INode | null = root;
let match: RegExpExecArray;
let plainTextTagName: string | null = null;
let readState: MarkupReadStateEnum = MarkupReadStateEnum.startOrEndTag;
let startTagIndex = 0;
let lastIndex = 0;
Expand All @@ -108,19 +108,31 @@ export default class XMLParser {
// Start tag.
const tagName = match[1].toUpperCase();
const localName = tagName === 'SVG' ? 'svg' : match[1];
const config = HTMLElementConfig[localName];

// Some elements are not allowed to be nested (e.g. "<a><a></a></a>" is not allowed).
// Therefore we need to auto-close the tag, so that it becomes valid (e.g. "<a></a><a></a>").
const unnestableTagNameIndex = unnestableTagNames.indexOf(tagName);
if (unnestableTagNameIndex !== -1) {
unnestableTagNames.splice(unnestableTagNameIndex, 1);
if (
config?.contentModel ===
HTMLElementConfigContentModelEnum.noFirsLevelSelfDescendants &&
stackTagNames[stackTagNames.length - 1] === tagName
) {
stack.pop();
stackTagNames.pop();
currentNode = stack[stack.length - 1] || root;
} else if (
config?.contentModel === HTMLElementConfigContentModelEnum.noSelfDescendants &&
stackTagNames.includes(tagName)
) {
while (currentNode !== root) {
if ((<IElement>currentNode)[PropertySymbol.tagName].toUpperCase() === tagName) {
stack.pop();
stackTagNames.pop();
currentNode = stack[stack.length - 1] || root;
break;
}
stack.pop();
stackTagNames.pop();
currentNode = stack[stack.length - 1] || root;
}
}
Expand All @@ -136,25 +148,18 @@ export default class XMLParser {
currentNode.appendChild(newElement);
currentNode = newElement;
stack.push(currentNode);
stackTagNames.push(tagName);
readState = MarkupReadStateEnum.insideStartTag;
startTagIndex = markupRegexp.lastIndex;
} else if (match[2]) {
// End tag.

if (
match[2].toUpperCase() ===
(<IElement>currentNode)[PropertySymbol.tagName].toUpperCase()
(<IElement>currentNode)[PropertySymbol.tagName]?.toUpperCase()
) {
// Some elements are not allowed to be nested (e.g. "<a><a></a></a>" is not allowed.).
// Therefore we need to auto-close the tag, so that it become valid (e.g. "<a></a><a></a>").
const unnestableTagNameIndex = unnestableTagNames.indexOf(
(<IElement>currentNode)[PropertySymbol.tagName].toUpperCase()
);
if (unnestableTagNameIndex !== -1) {
unnestableTagNames.splice(unnestableTagNameIndex, 1);
}

stack.pop();
stackTagNames.pop();
currentNode = stack[stack.length - 1] || root;
}
} else if (
Expand Down Expand Up @@ -201,8 +206,6 @@ export default class XMLParser {
case MarkupReadStateEnum.insideStartTag:
// End of start tag
if (match[7] || match[8]) {
// End of start tag.

// Attribute name and value.

const attributeString = xml.substring(startTagIndex, match.index);
Expand Down Expand Up @@ -257,33 +260,27 @@ export default class XMLParser {
// We need to check if the attribute string is read completely.
// The attribute string can potentially contain "/>" or ">".
if (hasAttributeStringEnded) {
const config = HTMLElementConfig[(<IElement>currentNode)[PropertySymbol.localName]];

// Checks if the tag is a self closing tag (ends with "/>") or void element.
// When it is a self closing tag or void element it should be closed immediately.
// Self closing tags are not allowed in the HTML namespace, but the parser should still allow them for void elements.
// Self closing tags are supported in the SVG namespace.
if (
HTMLElementVoid[(<IElement>currentNode)[PropertySymbol.tagName]] ||
config?.contentModel === HTMLElementConfigContentModelEnum.noDescendants ||
// SVG tag is self closing (<svg/>).
(match[7] &&
(<IElement>currentNode)[PropertySymbol.namespaceURI] === NamespaceURI.svg)
) {
stack.pop();
stackTagNames.pop();
currentNode = stack[stack.length - 1] || root;
readState = MarkupReadStateEnum.startOrEndTag;
} else {
// Plain text elements such as <script> and <style> should only contain text.
plainTextTagName = HTMLElementPlainText[
(<IElement>currentNode)[PropertySymbol.tagName]
]
? (<IElement>currentNode)[PropertySymbol.tagName]
: null;

readState = !!plainTextTagName
? MarkupReadStateEnum.plainTextContent
: MarkupReadStateEnum.startOrEndTag;

if (HTMLElementUnnestable[(<IElement>currentNode)[PropertySymbol.tagName]]) {
unnestableTagNames.push((<IElement>currentNode)[PropertySymbol.tagName]);
}
readState =
config?.contentModel === HTMLElementConfigContentModelEnum.rawText
? MarkupReadStateEnum.plainTextContent
: MarkupReadStateEnum.startOrEndTag;
}

startTagIndex = markupRegexp.lastIndex;
Expand All @@ -292,15 +289,17 @@ export default class XMLParser {

break;
case MarkupReadStateEnum.plainTextContent:
if (match[2] && match[2].toUpperCase() === plainTextTagName) {
const tagName = currentNode[PropertySymbol.tagName];

if (tagName && match[2] && match[2].toUpperCase() === tagName) {
// End of plain text tag.

// Scripts are not allowed to be executed when they are parsed using innerHTML, outerHTML, replaceWith() etc.
// However, they are allowed to be executed when document.write() is used.
// See: https://developer.mozilla.org/en-US/docs/Web/API/HTMLScriptElement
if (plainTextTagName === 'SCRIPT') {
if (tagName === 'SCRIPT') {
(<HTMLScriptElement>currentNode)[PropertySymbol.evaluateScript] = evaluateScripts;
} else if (plainTextTagName === 'LINK') {
} else if (tagName === 'LINK') {
// An assumption that the same rule should be applied for the HTMLLinkElement is made here.
(<HTMLLinkElement>currentNode)[PropertySymbol.evaluateCSS] = evaluateScripts;
}
Expand All @@ -313,8 +312,8 @@ export default class XMLParser {
);

stack.pop();
stackTagNames.pop();
currentNode = stack[stack.length - 1] || root;
plainTextTagName = null;
readState = MarkupReadStateEnum.startOrEndTag;
}

Expand Down
Loading
Loading