Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(tokenizer): Emit text before entities once entity is confirmed #1009

Merged
merged 1 commit into from
Nov 11, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 54 additions & 38 deletions src/Tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,10 @@ function isEndOfTagSection(c: number): boolean {
return c === CharCodes.Slash || c === CharCodes.Gt || isWhitespace(c);
}

/**
 * Tests whether a character code falls within the inclusive range
 * `CharCodes.Zero`..`CharCodes.Nine`.
 *
 * @param c Character code to test.
 * @returns `true` when `c` lies in that range, `false` otherwise.
 */
function isNumber(c: number): boolean {
    // Check the lower bound first; anything failing it cannot be in range.
    if (c >= CharCodes.Zero) {
        return c <= CharCodes.Nine;
    }
    return false;
}

function isASCIIAlpha(c: number): boolean {
return (
(c >= CharCodes.LowerA && c <= CharCodes.LowerZ) ||
Expand Down Expand Up @@ -243,11 +247,7 @@ export default class Tokenizer {
this._state = State.BeforeTagName;
this.sectionStart = this._index;
} else if (this.decodeEntities && c === CharCodes.Amp) {
if (this._index > this.sectionStart) {
this.cbs.ontext(this.getSection());
}
this._state = State.BeforeEntity;
this.sectionStart = this._index;
}
}

Expand Down Expand Up @@ -302,11 +302,7 @@ export default class Tokenizer {
if (this.currentSequence === Sequences.TitleEnd) {
// We have to parse entities in <title> tags.
if (this.decodeEntities && c === CharCodes.Amp) {
if (this._index > this.sectionStart) {
this.cbs.ontext(this.getSection());
}
this._state = State.BeforeEntity;
this.sectionStart = this._index;
}
} else if (this.fastForwardTo(CharCodes.Lt)) {
// Outside of <title> tags, we can fast-forward.
Expand Down Expand Up @@ -546,10 +542,8 @@ export default class Tokenizer {
this.cbs.onattribend(String.fromCharCode(quote));
this._state = State.BeforeAttributeName;
} else if (this.decodeEntities && c === CharCodes.Amp) {
this.cbs.onattribdata(this.getSection());
this.baseState = this._state;
this._state = State.BeforeEntity;
this.sectionStart = this._index;
}
}
private stateInAttributeValueDoubleQuotes(c: number) {
Expand All @@ -566,10 +560,8 @@ export default class Tokenizer {
this._state = State.BeforeAttributeName;
this.stateBeforeAttributeName(c);
} else if (this.decodeEntities && c === CharCodes.Amp) {
this.cbs.onattribdata(this.getSection());
this.baseState = this._state;
this._state = State.BeforeEntity;
this.sectionStart = this._index;
}
}
private stateBeforeDeclaration(c: number) {
Expand Down Expand Up @@ -636,28 +628,27 @@ export default class Tokenizer {
private trieIndex = 0;
private trieCurrent = 0;
private trieResult: string | null = null;
private trieExcess = 0;
private entityExcess = 0;

private stateBeforeEntity(c: number) {
// Start excess with 1 to include the '&'
this.entityExcess = 1;

if (c === CharCodes.Num) {
this._state = State.BeforeNumericEntity;
} else if (c === CharCodes.Amp) {
// We have two `&` characters in a row. Emit the first one.
this.emitPartial(this.getSection());
this.sectionStart = this._index;
// We have two `&` characters in a row. Stay in the current state.
} else {
this._state = State.InNamedEntity;
this.trieIndex = 0;
this.trieCurrent = this.entityTrie[0];
this.trieResult = null;
// Start excess with 1 to include the '&'
this.trieExcess = 1;
this._index--;
this._state = State.InNamedEntity;
this.stateInNamedEntity(c);
}
}

private stateInNamedEntity(c: number) {
this.trieExcess += 1;
this.entityExcess += 1;

this.trieIndex = determineBranch(
this.entityTrie,
Expand All @@ -681,6 +672,15 @@ export default class Tokenizer {
// No need to consider multi-byte values, as the legacy entity is always a single byte
this.trieIndex += 1;
} else {
// Add 1 as we have already incremented the excess
const entityStart = this._index - this.entityExcess + 1;

if (entityStart > this.sectionStart) {
this.emitPartial(
this.buffer.substring(this.sectionStart, entityStart)
);
}

// If this is a surrogate pair, combine the higher bits from the node with the next byte
this.trieResult =
this.trieCurrent & BinTrieFlags.MULTI_BYTE
Expand All @@ -691,7 +691,8 @@ export default class Tokenizer {
: String.fromCharCode(
this.entityTrie[++this.trieIndex]
);
this.trieExcess = 0;
this.entityExcess = 0;
this.sectionStart = this._index + 1;
}
}
}
Expand All @@ -701,12 +702,12 @@ export default class Tokenizer {
this.emitPartial(this.trieResult);
}

this.sectionStart = this._index - this.trieExcess + 1;
this._state = this.baseState;
}

private stateBeforeNumericEntity(c: number) {
if ((c | 0x20) === CharCodes.LowerX) {
this.entityExcess++;
this._state = State.InHexEntity;
} else {
this._state = State.InNumericEntity;
Expand All @@ -715,10 +716,19 @@ export default class Tokenizer {
}

private decodeNumericEntity(base: 10 | 16, strict: boolean) {
const sectionStart = this.sectionStart + 2 + (base >> 4);
if (sectionStart !== this._index) {
const entityStart = this._index - this.entityExcess - 1;
const numberStart = entityStart + 2 + (base >> 4);

if (numberStart !== this._index) {
// Emit leading data if any
if (entityStart > this.sectionStart) {
this.emitPartial(
this.buffer.substring(this.sectionStart, entityStart)
);
}

// Parse entity
const entity = this.buffer.substring(sectionStart, this._index);
const entity = this.buffer.substring(numberStart, this._index);
const parsed = parseInt(entity, base);
this.emitPartial(decodeCodePoint(parsed));
this.sectionStart = this._index + Number(strict);
Expand All @@ -728,13 +738,15 @@ export default class Tokenizer {
private stateInNumericEntity(c: number) {
if (c === CharCodes.Semi) {
this.decodeNumericEntity(10, true);
} else if (c < CharCodes.Zero || c > CharCodes.Nine) {
} else if (!isNumber(c)) {
if (this.allowLegacyEntity()) {
this.decodeNumericEntity(10, false);
} else {
this._state = this.baseState;
}
this._index--;
} else {
this.entityExcess++;
}
}
private stateInHexEntity(c: number) {
Expand All @@ -743,14 +755,16 @@ export default class Tokenizer {
} else if (
(c < CharCodes.LowerA || c > CharCodes.LowerF) &&
(c < CharCodes.UpperA || c > CharCodes.UpperF) &&
(c < CharCodes.Zero || c > CharCodes.Nine)
!isNumber(c)
) {
if (this.allowLegacyEntity()) {
this.decodeNumericEntity(16, false);
} else {
this._state = this.baseState;
}
this._index--;
} else {
this.entityExcess++;
}
}

Expand Down Expand Up @@ -868,6 +882,10 @@ export default class Tokenizer {
}

private finish() {
if (this._state === State.InNamedEntity) {
this.emitNamedEntity();
}

// If there is remaining data, emit it in a reasonable way
if (this.sectionStart < this._index) {
this.handleTrailingData();
Expand All @@ -884,18 +902,16 @@ export default class Tokenizer {
} else {
this.cbs.oncomment(data);
}
} else if (this._state === State.InNamedEntity && !this.xmlMode) {
// Increase excess for EOF
this.trieExcess++;
this.emitNamedEntity();
if (this.sectionStart < this._index) {
this._state = this.baseState;
this.handleTrailingData();
}
} else if (this._state === State.InNumericEntity && !this.xmlMode) {
} else if (
this._state === State.InNumericEntity &&
this.allowLegacyEntity()
) {
this.decodeNumericEntity(10, false);
// All trailing data will have been consumed
} else if (this._state === State.InHexEntity && !this.xmlMode) {
} else if (
this._state === State.InHexEntity &&
this.allowLegacyEntity()
) {
this.decodeNumericEntity(16, false);
// All trailing data will have been consumed
} else if (
Expand Down