Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Build regexes only once #133

Closed
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 25 additions & 22 deletions src/Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -719,14 +719,13 @@ final class Tokenizer
];

// Regular expressions for tokenizing

// NOTE(review): this view shows both sides of the diff without +/- markers.
// The readonly instance declarations directly below appear to be the removed
// side of the change — confirm against the pull request.
private readonly string $nextTokenRegexNumber;
private readonly string $nextTokenRegexBoundaryCharacter;
private readonly string $nextTokenRegexReservedToplevel;
private readonly string $nextTokenRegexReservedNewline;
private readonly string $nextTokenRegexReserved;
private readonly string $nextTokenRegexFunction;
private readonly string $nextTokenRegexNonReserved;
// Static declarations of the same seven patterns. The constructor assigns
// these through self:: and checks isset(self::$nextTokenRegexNumber) before
// building them, so they are shared by every instance of the class.
private static string $nextTokenRegexNumber;
private static string $nextTokenRegexBoundaryCharacter;
private static string $nextTokenRegexReservedToplevel;
private static string $nextTokenRegexReservedNewline;
private static string $nextTokenRegexReserved;
private static string $nextTokenRegexFunction;
private static string $nextTokenRegexNonReserved;

/**
* Punctuation that can be used as a boundary between other tokens
Expand Down Expand Up @@ -762,20 +761,24 @@ final class Tokenizer
*/
public function __construct()
{
// The patterns live in static properties, so once a previous constructor call
// has assigned them there is nothing left to build and we return immediately.
if (isset(self::$nextTokenRegexNumber)) {
return;
}

// Set up regular expressions
// Each word list is turned into a regex fragment by makeRegexFromList() (not
// shown in this view). For the top-level and newline lists every literal space
// in that fragment is replaced with \s+ so multi-word keywords match across
// any run of whitespace.
$regexBoundaries = $this->makeRegexFromList($this->boundaries);
$regexReserved = $this->makeRegexFromList($this->reserved);
$regexReservedToplevel = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedToplevel));
$regexReservedNewline = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedNewline));
$regexFunction = $this->makeRegexFromList($this->functions);

// NOTE(review): both sides of the diff are shown here without +/- markers.
// The $this-> assignments appear to be the removed lines and the self::
// assignments further down their replacements — confirm against the pull request.
//
// NOTE(review): in the number pattern the lookahead lists the bare sequence
// "'` as a single alternative, whereas the non-reserved pattern uses the
// character class ["'`]. It looks like the square brackets were dropped, which
// would stop a number directly followed by one quote character from matching
// here — confirm the intent before changing it, since that alters tokenisation.
$this->nextTokenRegexNumber = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|"\'`|' . $regexBoundaries . ')/';
$this->nextTokenRegexBoundaryCharacter = '/\G' . $regexBoundaries . '/';
$this->nextTokenRegexReservedToplevel = '/\G' . $regexReservedToplevel . '(?=$|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexReservedNewline = '/\G' . $regexReservedNewline . '(?=$|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexReserved = '/\G' . $regexReserved . '(?=$|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexFunction = '/\G' . $regexFunction . '(?=\s*\()/';
$this->nextTokenRegexNonReserved = '/\G.*?(?=$|\s|["\'`]|' . $regexBoundaries . ')/';
// Static counterparts of the seven patterns above. Every pattern is anchored
// with \G; the reserved-word patterns require end of input, whitespace or a
// boundary fragment to follow, and the function pattern requires optional
// whitespace and then an opening parenthesis.
self::$nextTokenRegexNumber = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|"\'`|' . $regexBoundaries . ')/';
self::$nextTokenRegexBoundaryCharacter = '/\G' . $regexBoundaries . '/';
self::$nextTokenRegexReservedToplevel = '/\G' . $regexReservedToplevel . '(?=$|\s|' . $regexBoundaries . ')/';
self::$nextTokenRegexReservedNewline = '/\G' . $regexReservedNewline . '(?=$|\s|' . $regexBoundaries . ')/';
self::$nextTokenRegexReserved = '/\G' . $regexReserved . '(?=$|\s|' . $regexBoundaries . ')/';
self::$nextTokenRegexFunction = '/\G' . $regexFunction . '(?=\s*\()/';
self::$nextTokenRegexNonReserved = '/\G.*?(?=$|\s|["\'`]|' . $regexBoundaries . ')/';
}

/**
Expand Down Expand Up @@ -953,7 +956,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
// Number (decimal, binary, or hex)
if (
preg_match(
$this->nextTokenRegexNumber,
self::$nextTokenRegexNumber,
$string,
$matches,
0,
Expand All @@ -964,7 +967,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
}

// Boundary Character (punctuation and symbols)
if (preg_match($this->nextTokenRegexBoundaryCharacter, $string, $matches, 0, $offset)) {
if (preg_match(self::$nextTokenRegexBoundaryCharacter, $string, $matches, 0, $offset)) {
return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[0]);
}

Expand All @@ -974,7 +977,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
// Top Level Reserved Word
if (
preg_match(
$this->nextTokenRegexReservedToplevel,
self::$nextTokenRegexReservedToplevel,
$upper,
$matches,
0,
Expand All @@ -990,7 +993,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
// Newline Reserved Word
if (
preg_match(
$this->nextTokenRegexReservedNewline,
self::$nextTokenRegexReservedNewline,
$upper,
$matches,
0,
Expand All @@ -1006,7 +1009,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
// Other Reserved Word
if (
preg_match(
$this->nextTokenRegexReserved,
self::$nextTokenRegexReserved,
$upper,
$matches,
0,
Expand All @@ -1022,15 +1025,15 @@ private function createNextToken(string $string, string $upper, int $offset, Tok

// A function must be succeeded by '('
// this makes it so "count(" is considered a function, but "count" alone is not function
if (preg_match($this->nextTokenRegexFunction, $upper, $matches, 0, $offset)) {
if (preg_match(self::$nextTokenRegexFunction, $upper, $matches, 0, $offset)) {
return new Token(
Token::TOKEN_TYPE_RESERVED,
substr($string, $offset, strlen($matches[0])),
);
}

// Non reserved word
preg_match($this->nextTokenRegexNonReserved, $string, $matches, 0, $offset);
preg_match(self::$nextTokenRegexNonReserved, $string, $matches, 0, $offset);

return new Token(Token::TOKEN_TYPE_WORD, $matches[0]);
}
Expand Down