Skip to content

Commit

Permalink
Build regexes only once
Browse files Browse the repository at this point in the history
Speed up Tokenizer bootstrap for every instance after the first
  • Loading branch information
mvorisek committed Jun 23, 2024
1 parent e2a9b0d commit 993d0e0
Showing 1 changed file with 25 additions and 22 deletions.
47 changes: 25 additions & 22 deletions src/Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -719,14 +719,13 @@ final class Tokenizer
];

// Regular expressions for tokenizing

private readonly string $nextTokenRegexNumber;
private readonly string $nextTokenRegexBoundaryCharacter;
private readonly string $nextTokenRegexReservedToplevel;
private readonly string $nextTokenRegexReservedNewline;
private readonly string $nextTokenRegexReserved;
private readonly string $nextTokenRegexFunction;
private readonly string $nextTokenRegexNonReserved;
private static string $nextTokenRegexNumber;
private static string $nextTokenRegexBoundaryCharacter;
private static string $nextTokenRegexReservedToplevel;
private static string $nextTokenRegexReservedNewline;
private static string $nextTokenRegexReserved;
private static string $nextTokenRegexFunction;
private static string $nextTokenRegexNonReserved;

/**
* Punctuation that can be used as a boundary between other tokens
Expand Down Expand Up @@ -762,20 +761,24 @@ final class Tokenizer
*/
public function __construct()
{
// Builds the regular expressions used for tokenizing and stores them in
// static properties. Once the first static property has been set, any
// later construction returns here without rebuilding anything.
// All seven static regex properties are assigned together at the end of
// this constructor, so checking one of them is used as the "already
// built" marker.
if (isset(self::$nextTokenRegexNumber)) {
return;
}

// Set up regular expressions
// Each word list is turned into a regex fragment by makeRegexFromList().
$regexBoundaries = $this->makeRegexFromList($this->boundaries);
$regexReserved = $this->makeRegexFromList($this->reserved);
// For the top-level and newline lists, every literal space in the generated
// fragment is replaced by \s+ (presumably so multi-word keywords match
// across any run of whitespace — confirm against the word lists).
$regexReservedToplevel = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedToplevel));
$regexReservedNewline = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedNewline));
$regexFunction = $this->makeRegexFromList($this->functions);

// NOTE(review): this view appears to show both sides of the diff with the
// +/- markers lost. The seven $this-> assignments below look like the
// removed (per-instance) lines; the seven self:: assignments after them
// look like their static replacements.
// NOTE(review): in the number pattern the lookahead alternative "\'` is a
// literal three-character sequence, whereas nextTokenRegexNonReserved uses
// the character class ["\'`]. This looks unintended — confirm upstream.
$this->nextTokenRegexNumber = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|"\'`|' . $regexBoundaries . ')/';
$this->nextTokenRegexBoundaryCharacter = '/\G' . $regexBoundaries . '/';
$this->nextTokenRegexReservedToplevel = '/\G' . $regexReservedToplevel . '(?=$|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexReservedNewline = '/\G' . $regexReservedNewline . '(?=$|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexReserved = '/\G' . $regexReserved . '(?=$|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexFunction = '/\G' . $regexFunction . '(?=\s*\()/';
$this->nextTokenRegexNonReserved = '/\G.*?(?=$|\s|["\'`]|' . $regexBoundaries . ')/';
// Every pattern starts with \G, so it only matches at the offset that is
// passed to preg_match(). The reserved-word patterns additionally require
// end of input, whitespace or a boundary fragment right after the match;
// the function pattern requires optional whitespace followed by "(".
self::$nextTokenRegexNumber = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|"\'`|' . $regexBoundaries . ')/';
self::$nextTokenRegexBoundaryCharacter = '/\G' . $regexBoundaries . '/';
self::$nextTokenRegexReservedToplevel = '/\G' . $regexReservedToplevel . '(?=$|\s|' . $regexBoundaries . ')/';
self::$nextTokenRegexReservedNewline = '/\G' . $regexReservedNewline . '(?=$|\s|' . $regexBoundaries . ')/';
self::$nextTokenRegexReserved = '/\G' . $regexReserved . '(?=$|\s|' . $regexBoundaries . ')/';
self::$nextTokenRegexFunction = '/\G' . $regexFunction . '(?=\s*\()/';
self::$nextTokenRegexNonReserved = '/\G.*?(?=$|\s|["\'`]|' . $regexBoundaries . ')/';
}

/** @param list<string> $values */
Expand Down Expand Up @@ -946,7 +949,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
// Number (decimal, binary, or hex)
if (
preg_match(
$this->nextTokenRegexNumber,
self::$nextTokenRegexNumber,
$string,
$matches,
0,
Expand All @@ -957,7 +960,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
}

// Boundary Character (punctuation and symbols)
if (preg_match($this->nextTokenRegexBoundaryCharacter, $string, $matches, 0, $offset)) {
if (preg_match(self::$nextTokenRegexBoundaryCharacter, $string, $matches, 0, $offset)) {
return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[0]);
}

Expand All @@ -967,7 +970,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
// Top Level Reserved Word
if (
preg_match(
$this->nextTokenRegexReservedToplevel,
self::$nextTokenRegexReservedToplevel,
$upper,
$matches,
0,
Expand All @@ -983,7 +986,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
// Newline Reserved Word
if (
preg_match(
$this->nextTokenRegexReservedNewline,
self::$nextTokenRegexReservedNewline,
$upper,
$matches,
0,
Expand All @@ -999,7 +1002,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
// Other Reserved Word
if (
preg_match(
$this->nextTokenRegexReserved,
self::$nextTokenRegexReserved,
$upper,
$matches,
0,
Expand All @@ -1015,15 +1018,15 @@ private function createNextToken(string $string, string $upper, int $offset, Tok

// A function name must be followed by '('.
// This way "count(" is treated as a function, but "count" on its own is not.
if (preg_match($this->nextTokenRegexFunction, $upper, $matches, 0, $offset)) {
if (preg_match(self::$nextTokenRegexFunction, $upper, $matches, 0, $offset)) {
return new Token(
Token::TOKEN_TYPE_RESERVED,
substr($string, $offset, strlen($matches[0])),
);
}

// Non reserved word
preg_match($this->nextTokenRegexNonReserved, $string, $matches, 0, $offset);
preg_match(self::$nextTokenRegexNonReserved, $string, $matches, 0, $offset);

return new Token(Token::TOKEN_TYPE_WORD, $matches[0]);
}
Expand Down

0 comments on commit 993d0e0

Please sign in to comment.