diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 06d3f008a..9c7fc1c8e 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -98,6 +98,10 @@ function isEndOfTagSection(c: number): boolean {
     return c === CharCodes.Slash || c === CharCodes.Gt || isWhitespace(c);
 }
 
+function isNumber(c: number): boolean {
+    return c >= CharCodes.Zero && c <= CharCodes.Nine;
+}
+
 function isASCIIAlpha(c: number): boolean {
     return (
         (c >= CharCodes.LowerA && c <= CharCodes.LowerZ) ||
@@ -243,11 +247,7 @@ export default class Tokenizer {
             this._state = State.BeforeTagName;
             this.sectionStart = this._index;
         } else if (this.decodeEntities && c === CharCodes.Amp) {
-            if (this._index > this.sectionStart) {
-                this.cbs.ontext(this.getSection());
-            }
             this._state = State.BeforeEntity;
-            this.sectionStart = this._index;
         }
     }
 
@@ -302,11 +302,7 @@ export default class Tokenizer {
         if (this.currentSequence === Sequences.TitleEnd) {
             // We have to parse entities in <title> tags.
             if (this.decodeEntities && c === CharCodes.Amp) {
-                if (this._index > this.sectionStart) {
-                    this.cbs.ontext(this.getSection());
-                }
                 this._state = State.BeforeEntity;
-                this.sectionStart = this._index;
             }
         } else if (this.fastForwardTo(CharCodes.Lt)) {
             // Outside of <title> tags, we can fast-forward.
@@ -546,10 +542,8 @@ export default class Tokenizer {
             this.cbs.onattribend(String.fromCharCode(quote));
             this._state = State.BeforeAttributeName;
         } else if (this.decodeEntities && c === CharCodes.Amp) {
-            this.cbs.onattribdata(this.getSection());
             this.baseState = this._state;
             this._state = State.BeforeEntity;
-            this.sectionStart = this._index;
         }
     }
     private stateInAttributeValueDoubleQuotes(c: number) {
@@ -566,10 +560,8 @@ export default class Tokenizer {
             this._state = State.BeforeAttributeName;
             this.stateBeforeAttributeName(c);
         } else if (this.decodeEntities && c === CharCodes.Amp) {
-            this.cbs.onattribdata(this.getSection());
             this.baseState = this._state;
             this._state = State.BeforeEntity;
-            this.sectionStart = this._index;
         }
     }
     private stateBeforeDeclaration(c: number) {
@@ -636,28 +628,27 @@ export default class Tokenizer {
     private trieIndex = 0;
     private trieCurrent = 0;
     private trieResult: string | null = null;
-    private trieExcess = 0;
+    private entityExcess = 0;
 
     private stateBeforeEntity(c: number) {
+        // Start excess with 1 to include the '&'
+        this.entityExcess = 1;
+
         if (c === CharCodes.Num) {
             this._state = State.BeforeNumericEntity;
         } else if (c === CharCodes.Amp) {
-            // We have two `&` characters in a row. Emit the first one.
-            this.emitPartial(this.getSection());
-            this.sectionStart = this._index;
+            // We have two `&` characters in a row. Stay in the current state.
         } else {
-            this._state = State.InNamedEntity;
             this.trieIndex = 0;
             this.trieCurrent = this.entityTrie[0];
             this.trieResult = null;
-            // Start excess with 1 to include the '&'
-            this.trieExcess = 1;
-            this._index--;
+            this._state = State.InNamedEntity;
+            this.stateInNamedEntity(c);
         }
     }
 
     private stateInNamedEntity(c: number) {
-        this.trieExcess += 1;
+        this.entityExcess += 1;
 
         this.trieIndex = determineBranch(
             this.entityTrie,
@@ -681,6 +672,15 @@ export default class Tokenizer {
                 // No need to consider multi-byte values, as the legacy entity is always a single byte
                 this.trieIndex += 1;
             } else {
+                // Add 1 as we have already incremented the excess
+                const entityStart = this._index - this.entityExcess + 1;
+
+                if (entityStart > this.sectionStart) {
+                    this.emitPartial(
+                        this.buffer.substring(this.sectionStart, entityStart)
+                    );
+                }
+
                 // If this is a surrogate pair, combine the higher bits from the node with the next byte
                 this.trieResult =
                     this.trieCurrent & BinTrieFlags.MULTI_BYTE
@@ -691,7 +691,8 @@ export default class Tokenizer {
                         : String.fromCharCode(
                               this.entityTrie[++this.trieIndex]
                           );
-                this.trieExcess = 0;
+                this.entityExcess = 0;
+                this.sectionStart = this._index + 1;
             }
         }
     }
@@ -701,12 +702,12 @@ export default class Tokenizer {
             this.emitPartial(this.trieResult);
         }
 
-        this.sectionStart = this._index - this.trieExcess + 1;
         this._state = this.baseState;
     }
 
     private stateBeforeNumericEntity(c: number) {
         if ((c | 0x20) === CharCodes.LowerX) {
+            this.entityExcess++;
             this._state = State.InHexEntity;
         } else {
             this._state = State.InNumericEntity;
@@ -715,10 +716,19 @@ export default class Tokenizer {
     }
 
     private decodeNumericEntity(base: 10 | 16, strict: boolean) {
-        const sectionStart = this.sectionStart + 2 + (base >> 4);
-        if (sectionStart !== this._index) {
+        const entityStart = this._index - this.entityExcess - 1;
+        const numberStart = entityStart + 2 + (base >> 4);
+
+        if (numberStart !== this._index) {
+            // Emit leading data if any
+            if (entityStart > this.sectionStart) {
+                this.emitPartial(
+                    this.buffer.substring(this.sectionStart, entityStart)
+                );
+            }
+
             // Parse entity
-            const entity = this.buffer.substring(sectionStart, this._index);
+            const entity = this.buffer.substring(numberStart, this._index);
             const parsed = parseInt(entity, base);
             this.emitPartial(decodeCodePoint(parsed));
             this.sectionStart = this._index + Number(strict);
@@ -728,13 +738,15 @@ export default class Tokenizer {
     private stateInNumericEntity(c: number) {
         if (c === CharCodes.Semi) {
             this.decodeNumericEntity(10, true);
-        } else if (c < CharCodes.Zero || c > CharCodes.Nine) {
+        } else if (!isNumber(c)) {
             if (this.allowLegacyEntity()) {
                 this.decodeNumericEntity(10, false);
             } else {
                 this._state = this.baseState;
             }
             this._index--;
+        } else {
+            this.entityExcess++;
         }
     }
     private stateInHexEntity(c: number) {
@@ -743,7 +755,7 @@ export default class Tokenizer {
         } else if (
             (c < CharCodes.LowerA || c > CharCodes.LowerF) &&
             (c < CharCodes.UpperA || c > CharCodes.UpperF) &&
-            (c < CharCodes.Zero || c > CharCodes.Nine)
+            !isNumber(c)
         ) {
             if (this.allowLegacyEntity()) {
                 this.decodeNumericEntity(16, false);
@@ -751,6 +763,8 @@ export default class Tokenizer {
                 this._state = this.baseState;
             }
             this._index--;
+        } else {
+            this.entityExcess++;
         }
     }
 
@@ -868,6 +882,10 @@ export default class Tokenizer {
     }
 
     private finish() {
+        if (this._state === State.InNamedEntity) {
+            this.emitNamedEntity();
+        }
+
         // If there is remaining data, emit it in a reasonable way
         if (this.sectionStart < this._index) {
             this.handleTrailingData();
@@ -884,18 +902,16 @@ export default class Tokenizer {
             } else {
                 this.cbs.oncomment(data);
             }
-        } else if (this._state === State.InNamedEntity && !this.xmlMode) {
-            // Increase excess for EOF
-            this.trieExcess++;
-            this.emitNamedEntity();
-            if (this.sectionStart < this._index) {
-                this._state = this.baseState;
-                this.handleTrailingData();
-            }
-        } else if (this._state === State.InNumericEntity && !this.xmlMode) {
+        } else if (
+            this._state === State.InNumericEntity &&
+            this.allowLegacyEntity()
+        ) {
             this.decodeNumericEntity(10, false);
             // All trailing data will have been consumed
-        } else if (this._state === State.InHexEntity && !this.xmlMode) {
+        } else if (
+            this._state === State.InHexEntity &&
+            this.allowLegacyEntity()
+        ) {
             this.decodeNumericEntity(16, false);
             // All trailing data will have been consumed
         } else if (
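
For orientation, here is a minimal sketch of how the entity decoding touched by these hunks surfaces through htmlparser2's public Parser API. It is not part of the patch; the handler wiring, the sample input, and the exact chunking of callbacks are illustrative assumptions.

```typescript
import { Parser } from "htmlparser2";

// Collect decoded text. Note that ontext may fire more than once per
// text node, since the tokenizer emits plain data and decoded entities
// as separate sections.
let text = "";

const parser = new Parser(
    {
        ontext(chunk) {
            text += chunk;
        },
        onattribute(name, value) {
            // Entities inside attribute values are decoded as well,
            // e.g. href = "?a=1&b=2"
            console.log(`attribute ${name} = ${value}`);
        },
    },
    { decodeEntities: true }
);

parser.write('<a href="?a=1&amp;b=2">Fish &amp; Chips &#x2665;</a>');
parser.end();

console.log(text); // "Fish & Chips ♥"
```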