Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace name and entity regular expressions with specific functions for ~15% performance improvement #216

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 57 additions & 21 deletions lib/sax.js
Original file line number Diff line number Diff line change
Expand Up @@ -276,12 +276,56 @@
// without a significant breaking change to either this parser, or the
// JavaScript language. Implementation of an emoji-capable xml parser
// is left as an exercise for the reader.
// Single-character class accepted as the first character of a name:
// ':', '_', ASCII letters, and the listed BMP ranges. The pattern is
// unanchored, so `.test()` succeeds when any character of the tested
// string belongs to the class.
var nameStart = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/

// Same class as `nameStart`, extended with '-', '.', digits, U+00B7,
// U+0300-U+036F and U+203F-U+2040 for characters after the first.
var nameBody = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/
/**
 * Reports whether a UTF-16 code unit value is allowed as the first
 * character of a name. The accepted set is ':', '_', the ASCII letters
 * and the Basic Multilingual Plane ranges of the XML `NameStartChar`
 * production.
 *
 * @param {number} cc - code unit value, e.g. from `String#charCodeAt`
 * @returns {boolean} true when `cc` lies in one of the accepted ranges;
 *   false otherwise (including for NaN)
 */
function isNameStartCharCode (cc) {
  // ASCII punctuation allowed at the start of a name
  if (cc === 58 || cc === 95) return true // ':' and '_'
  // ASCII letters
  if (cc >= 65 && cc <= 90) return true // A-Z
  if (cc >= 97 && cc <= 122) return true // a-z
  // Latin-1 letters; U+00D7 and U+00F7 fall in the gaps and are rejected
  if (cc >= 0x00C0 && cc <= 0x00D6) return true
  if (cc >= 0x00D8 && cc <= 0x00F6) return true
  if (cc >= 0x00F8 && cc <= 0x02FF) return true
  // U+037E falls in the gap between the next two ranges
  if (cc >= 0x0370 && cc <= 0x037D) return true
  if (cc >= 0x037F && cc <= 0x1FFF) return true
  if (cc >= 0x200C && cc <= 0x200D) return true
  if (cc >= 0x2070 && cc <= 0x218F) return true
  if (cc >= 0x2C00 && cc <= 0x2FEF) return true
  // upper bound 0xD7FF stops just below the surrogate range
  if (cc >= 0x3001 && cc <= 0xD7FF) return true
  if (cc >= 0xF900 && cc <= 0xFDCF) return true
  return cc >= 0xFDF0 && cc <= 0xFFFD
}

/**
 * Reports whether a UTF-16 code unit value is allowed after the first
 * character of a name: anything `isNameStartCharCode` accepts, plus
 * '-', '.', ASCII digits, U+00B7, U+0300-U+036F and U+203F-U+2040.
 *
 * @param {number} cc - code unit value, e.g. from `String#charCodeAt`
 * @returns {boolean} true when `cc` may appear inside a name
 */
function isNameBodyCharCode (cc) {
  // every name-start character is also a valid body character
  if (isNameStartCharCode(cc)) return true
  if (cc === 45 || cc === 46) return true // '-' and '.'
  if (cc >= 48 && cc <= 57) return true // 0-9
  if (cc === 0x00B7) return true
  if (cc >= 0x0300 && cc <= 0x036F) return true
  return cc >= 0x203F && cc <= 0x2040
}

/**
 * Reports whether the first UTF-16 code unit of `c` may begin a name.
 * Only index 0 is inspected; any further characters are ignored.
 *
 * @param {string} c - the character to classify
 * @returns {boolean} result of `isNameStartCharCode` for that code unit
 */
function isNameStart (c) {
  return isNameStartCharCode(c.charCodeAt(0))
}

/**
 * Reports whether the first UTF-16 code unit of `c` may appear after
 * the first character of a name. Only index 0 is inspected.
 *
 * @param {string} c - the character to classify
 * @returns {boolean} result of `isNameBodyCharCode` for that code unit
 */
function isNameBody (c) {
  return isNameBodyCharCode(c.charCodeAt(0))
}

// Character classes for entity names: the name-start and name-body
// sets with '#' added. Both patterns are unanchored single-character
// classes, so `.test()` succeeds when any character of the tested
// string belongs to the class.
var entityStart = /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/
var entityBody = /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/
/**
 * Reports whether the first UTF-16 code unit of `c` may begin an entity
 * name: '#' or any name-start character.
 * NOTE(review): '#' is presumably accepted so that numeric character
 * references can be collected - confirm against the entity parser.
 *
 * @param {string} c - the character to classify
 * @returns {boolean} true when `c` may start an entity name
 */
function isEntityStart (c) {
  var code = c.charCodeAt(0)
  if (code === 35) return true // '#'
  return isNameStartCharCode(code)
}

/**
 * Reports whether the first UTF-16 code unit of `c` may continue an
 * entity name: '#' or any name-body character.
 *
 * @param {string} c - the character to classify
 * @returns {boolean} true when `c` may appear inside an entity name
 */
function isEntityBody (c) {
  var code = c.charCodeAt(0)
  if (code === 35) return true // '#'
  return isNameBodyCharCode(code)
}

function isWhitespace (c) {
return c === ' ' || c === '\n' || c === '\r' || c === '\t'
Expand All @@ -295,14 +339,6 @@
return c === '>' || isWhitespace(c)
}

/**
 * Runs `regex.test` against `c` and hands back the outcome.
 *
 * @param {RegExp} regex - pattern to apply
 * @param {string} c - text (normally a single character) to test
 * @returns {boolean} whatever `regex.test(c)` returns
 */
function isMatch (regex, c) {
  var matched = regex.test(c)
  return matched
}

/**
 * Logical negation of `isMatch` for the same arguments.
 *
 * @param {RegExp} regex - pattern to apply
 * @param {string} c - text (normally a single character) to test
 * @returns {boolean} true when `isMatch(regex, c)` is falsy
 */
function notMatch (regex, c) {
  var matched = isMatch(regex, c)
  return matched ? false : true
}

var S = 0
sax.STATE = {
BEGIN: S++, // leading byte order mark or whitespace
Expand Down Expand Up @@ -1067,7 +1103,7 @@
parser.sgmlDecl = ''
} else if (isWhitespace(c)) {
// wait for it...
} else if (isMatch(nameStart, c)) {
} else if (isNameStart(c)) {
parser.state = S.OPEN_TAG
parser.tagName = c
} else if (c === '/') {
Expand Down Expand Up @@ -1270,7 +1306,7 @@
continue

case S.OPEN_TAG:
if (isMatch(nameBody, c)) {
if (isNameBody(c)) {
parser.tagName += c
} else {
newTag(parser)
Expand Down Expand Up @@ -1305,7 +1341,7 @@
openTag(parser)
} else if (c === '/') {
parser.state = S.OPEN_TAG_SLASH
} else if (isMatch(nameStart, c)) {
} else if (isNameStart(c)) {
parser.attribName = c
parser.attribValue = ''
parser.state = S.ATTRIB_NAME
Expand All @@ -1324,7 +1360,7 @@
openTag(parser)
} else if (isWhitespace(c)) {
parser.state = S.ATTRIB_NAME_SAW_WHITE
} else if (isMatch(nameBody, c)) {
} else if (isNameBody(c)) {
parser.attribName += c
} else {
strictFail(parser, 'Invalid attribute name')
Expand All @@ -1347,7 +1383,7 @@
parser.attribName = ''
if (c === '>') {
openTag(parser)
} else if (isMatch(nameStart, c)) {
} else if (isNameStart(c)) {
parser.attribName = c
parser.state = S.ATTRIB_NAME
} else {
Expand Down Expand Up @@ -1391,7 +1427,7 @@
openTag(parser)
} else if (c === '/') {
parser.state = S.OPEN_TAG_SLASH
} else if (isMatch(nameStart, c)) {
} else if (isNameStart(c)) {
strictFail(parser, 'No whitespace between attributes')
parser.attribName = c
parser.attribValue = ''
Expand Down Expand Up @@ -1422,7 +1458,7 @@
if (!parser.tagName) {
if (isWhitespace(c)) {
continue
} else if (notMatch(nameStart, c)) {
} else if (!isNameStart(c)) {
if (parser.script) {
parser.script += '</' + c
parser.state = S.SCRIPT
Expand All @@ -1434,7 +1470,7 @@
}
} else if (c === '>') {
closeTag(parser)
} else if (isMatch(nameBody, c)) {
} else if (isNameBody(c)) {
parser.tagName += c
} else if (parser.script) {
parser.script += '</' + parser.tagName
Expand Down Expand Up @@ -1485,7 +1521,7 @@
parser[buffer] += parseEntity(parser)
parser.entity = ''
parser.state = returnState
} else if (isMatch(parser.entity.length ? entityBody : entityStart, c)) {
} else if (parser.entity.length ? isEntityBody(c) : isEntityStart(c)) {
parser.entity += c
} else {
strictFail(parser, 'Invalid character in entity name')
Expand Down