Skip to content

Commit

Permalink
Insert T_BAD_CHARACTER tokens for missing characters
Browse files Browse the repository at this point in the history
The token stream should cover all characters in the original code,
so insert a dummy token for each missing illegal character. We should
really be doing this in token_get_all() as well.
  • Loading branch information
nikic committed Jun 30, 2019
1 parent a4b43ed commit b9b45dd
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 39 deletions.
28 changes: 19 additions & 9 deletions lib/PhpParser/Lexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@

class Lexer
{
/* Token ID used for illegal characters that are part of the token stream. These are dropped by token_get_all(),
* but we restore them here to make sure that the tokens cover the full original text, and to prevent
* file positions from going out of sync. */
const T_BAD_CHARACTER = -1;

protected $code;
protected $tokens;
protected $pos;
Expand Down Expand Up @@ -40,7 +45,7 @@ public function __construct(array $options = []) {
// map of tokens to drop while lexing (the map is only used for isset lookup,
// that's why the value is simply set to 1; the value is never actually used.)
$this->dropTokens = array_fill_keys(
[\T_WHITESPACE, \T_OPEN_TAG, \T_COMMENT, \T_DOC_COMMENT], 1
[\T_WHITESPACE, \T_OPEN_TAG, \T_COMMENT, \T_DOC_COMMENT, self::T_BAD_CHARACTER], 1
);

$defaultAttributes = ['comments', 'startLine', 'endLine'];
Expand Down Expand Up @@ -92,13 +97,9 @@ public function startLexing(string $code, ErrorHandler $errorHandler = null) {
}

private function handleInvalidCharacterRange($start, $end, $line, ErrorHandler $errorHandler) {
$tokens = [];
for ($i = $start; $i < $end; $i++) {
$chr = $this->code[$i];
if ($chr === 'b' || $chr === 'B') {
// HHVM does not treat b" tokens correctly, so ignore these
continue;
}

if ($chr === "\0") {
// PHP cuts error message after null byte, so need special case
$errorMsg = 'Unexpected null byte';
Expand All @@ -108,13 +109,15 @@ private function handleInvalidCharacterRange($start, $end, $line, ErrorHandler $
);
}

$tokens[] = [self::T_BAD_CHARACTER, $chr, $line];
$errorHandler->handleError(new Error($errorMsg, [
'startLine' => $line,
'endLine' => $line,
'startFilePos' => $i,
'endFilePos' => $i,
]));
}
return $tokens;
}

/**
Expand Down Expand Up @@ -155,16 +158,22 @@ protected function handleErrors(ErrorHandler $errorHandler) {

$filePos = 0;
$line = 1;
foreach ($this->tokens as $token) {
$numTokens = \count($this->tokens);
for ($i = 0; $i < $numTokens; $i++) {
$token = $this->tokens[$i];
$tokenValue = \is_string($token) ? $token : $token[1];
$tokenLen = \strlen($tokenValue);

if (substr($this->code, $filePos, $tokenLen) !== $tokenValue) {
// Something is missing, must be an invalid character
$nextFilePos = strpos($this->code, $tokenValue, $filePos);
$this->handleInvalidCharacterRange(
$badCharTokens = $this->handleInvalidCharacterRange(
$filePos, $nextFilePos, $line, $errorHandler);
$filePos = (int) $nextFilePos;

array_splice($this->tokens, $i, 0, $badCharTokens);
$numTokens += \count($badCharTokens);
$i += \count($badCharTokens);
}

$filePos += $tokenLen;
Expand All @@ -187,8 +196,9 @@ protected function handleErrors(ErrorHandler $errorHandler) {
$this->tokens[] = [$isDocComment ? \T_DOC_COMMENT : \T_COMMENT, $comment, $line];
} else {
// Invalid characters at the end of the input
$this->handleInvalidCharacterRange(
$badCharTokens = $this->handleInvalidCharacterRange(
$filePos, \strlen($this->code), $line, $errorHandler);
$this->tokens = array_merge($this->tokens, $badCharTokens);
}
return;
}
Expand Down
63 changes: 33 additions & 30 deletions test/code/parser/errorHandling/lexerErrors.test
Original file line number Diff line number Diff line change
Expand Up @@ -32,24 +32,25 @@ $a = 42;
@@{ "\1" }@@
$b = 24;
-----
!!positions
Unexpected character "" (ASCII 1) from 4:1 to 4:1
array(
0: Stmt_Expression(
expr: Expr_Assign(
var: Expr_Variable(
0: Stmt_Expression[3:1 - 3:8](
expr: Expr_Assign[3:1 - 3:7](
var: Expr_Variable[3:1 - 3:2](
name: a
)
expr: Scalar_LNumber(
expr: Scalar_LNumber[3:6 - 3:7](
value: 42
)
)
)
1: Stmt_Expression(
expr: Expr_Assign(
var: Expr_Variable(
1: Stmt_Expression[5:1 - 5:8](
expr: Expr_Assign[5:1 - 5:7](
var: Expr_Variable[5:1 - 5:2](
name: b
)
expr: Scalar_LNumber(
expr: Scalar_LNumber[5:6 - 5:7](
value: 24
)
)
Expand All @@ -62,24 +63,25 @@ $a = 42;
@@{ "\0" }@@
$b = 24;
-----
!!positions
Unexpected null byte from 4:1 to 4:1
array(
0: Stmt_Expression(
expr: Expr_Assign(
var: Expr_Variable(
0: Stmt_Expression[3:1 - 3:8](
expr: Expr_Assign[3:1 - 3:7](
var: Expr_Variable[3:1 - 3:2](
name: a
)
expr: Scalar_LNumber(
expr: Scalar_LNumber[3:6 - 3:7](
value: 42
)
)
)
1: Stmt_Expression(
expr: Expr_Assign(
var: Expr_Variable(
1: Stmt_Expression[5:1 - 5:8](
expr: Expr_Assign[5:1 - 5:7](
var: Expr_Variable[5:1 - 5:2](
name: b
)
expr: Scalar_LNumber(
expr: Scalar_LNumber[5:6 - 5:7](
value: 24
)
)
Expand All @@ -94,35 +96,36 @@ $b = 2;
@@{ "\2" }@@
$c = 3;
-----
Unexpected character "@@{ "\1" }@@" (ASCII 1) from 4:1 to 4:1
Unexpected character "@@{ "\2" }@@" (ASCII 2) from 6:1 to 6:1
!!positions
Unexpected character "" (ASCII 1) from 4:1 to 4:1
Unexpected character "" (ASCII 2) from 6:1 to 6:1
array(
0: Stmt_Expression(
expr: Expr_Assign(
var: Expr_Variable(
0: Stmt_Expression[3:1 - 3:7](
expr: Expr_Assign[3:1 - 3:6](
var: Expr_Variable[3:1 - 3:2](
name: a
)
expr: Scalar_LNumber(
expr: Scalar_LNumber[3:6 - 3:6](
value: 1
)
)
)
1: Stmt_Expression(
expr: Expr_Assign(
var: Expr_Variable(
1: Stmt_Expression[5:1 - 5:7](
expr: Expr_Assign[5:1 - 5:6](
var: Expr_Variable[5:1 - 5:2](
name: b
)
expr: Scalar_LNumber(
expr: Scalar_LNumber[5:6 - 5:6](
value: 2
)
)
)
2: Stmt_Expression(
expr: Expr_Assign(
var: Expr_Variable(
2: Stmt_Expression[7:1 - 7:7](
expr: Expr_Assign[7:1 - 7:6](
var: Expr_Variable[7:1 - 7:2](
name: c
)
expr: Scalar_LNumber(
expr: Scalar_LNumber[7:6 - 7:6](
value: 3
)
)
Expand Down

0 comments on commit b9b45dd

Please sign in to comment.