Skip to content

Commit

Permalink
Handle numbers formatted with underscores in tokenizer (DonJayamanne#…
Browse files Browse the repository at this point in the history
…1819)

* Undo changes

* Test fixes

* Increase timeout

* Remove double event listening

* Remove test

* Revert "Remove test"

This reverts commit e240c3f.

* Revert "Remove double event listening"

This reverts commit af573be.

* #1096 The if statement is automatically formatted incorrectly

* Merge fix

* Add more tests

* More tests

* Typo

* Test

* Also better handle multiline arguments

* Add a couple missing periods

[skip ci]

* Undo changes

* Test fixes

* Increase timeout

* Remove double event listening

* Remove test

* Revert "Remove test"

This reverts commit e240c3f.

* Revert "Remove double event listening"

This reverts commit af573be.

* Merge fix

* DonJayamanne#1257 On type formatting errors for args and kwargs

* Handle f-strings

* Stop importing from test code

* DonJayamanne#1308 Single line statements leading to an indentation on the next line

* #726 editing python after inline if statement invalid indent

* Undo change

* Move constant

* Harden LS startup error checks

* #1364 Intellisense doesn't work after specific const string

* Telemetry for the analysis engine

* PR feedback

* Fix typo

* Test baseline update

* Jedi 0.12

* Priority to goto_definition

* News

* Replace unzip

* Linux flavors + test

* Grammar check

* Grammar test

* Test baselines

* Add news

* Pin dependency

[skip ci]

* Specify markdown as preferable format

* Improve function argument detection

* Specify markdown

* Pythia setting

* Baseline updates

* Baseline update

* Improve startup

* Handle missing interpreter better

* Handle interpreter change

* Delete old file

* Fix LS startup time reporting

* Remove Async suffix from IFileSystem

* Remove Pythia

* Remove pre-packaged MSIL

* Exe name on Unix

* Plain linux

* Fix casing

* Fix message

* Update PTVS engine activation steps

* Type formatter eats space in from .

* Fix casing

* Remove flag

* Don't wait for LS

* Small test fixes

* Update hover baselines

* Rename the engine

* Formatting 1

* Add support for 'rf' strings

* Add two spaces before comment per PEP

* Fix @ operator spacing

* Handle module and unary ops

* Type hints

* Fix typo

* Trailing comma

* Require space after if

* underscore numbers

* Update list of keywords

* PR feedback

* News

* Use a bit more Markdown in the news entry
  • Loading branch information
Mikhail Arkhipov authored and Aman Agarwal committed Aug 30, 2018
1 parent 114afdd commit cb03032
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 33 deletions.
1 change: 1 addition & 0 deletions news/2 Fixes/1779.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
`editor.formatOnType` no longer breaks numbers formatted with underscores.
12 changes: 8 additions & 4 deletions src/client/language/characters.ts
Original file line number Diff line number Diff line change
Expand Up @@ -83,18 +83,22 @@ export function isLineBreak(ch: number): boolean {
return ch === Char.CarriageReturn || ch === Char.LineFeed;
}

/**
 * Reports whether a character code is an ASCII digit (`0`-`9`) or an underscore.
 *
 * @param ch Character code to classify.
 * @returns `true` when `ch` is a digit or `_`; `false` otherwise.
 */
export function isNumber(ch: number): boolean {
    if (ch === Char.Underscore) {
        return true;
    }
    return Char._0 <= ch && ch <= Char._9;
}

/**
 * Reports whether a character code may appear inside a decimal literal:
 * an ASCII digit (`0`-`9`) or the `_` digit separator.
 *
 * @param ch Character code to classify.
 * @returns `true` when `ch` is a digit or `_`; `false` otherwise.
 */
export function isDecimal(ch: number): boolean {
    // The underscore must be accepted here so literals such as 1_000 are
    // consumed as a single number instead of being split at the separator.
    return (ch >= Char._0 && ch <= Char._9) || ch === Char.Underscore;
}

/**
 * Reports whether a character code may appear inside a hexadecimal literal:
 * anything `isDecimal` accepts, a letter `a`-`f` / `A`-`F`, or the `_` digit separator.
 *
 * @param ch Character code to classify.
 * @returns `true` when `ch` is a hex digit or `_`; `false` otherwise.
 */
export function isHex(ch: number): boolean {
    // The underscore is checked explicitly so the result does not depend on
    // whether isDecimal itself treats `_` as part of a number.
    return isDecimal(ch)
        || (ch >= Char.a && ch <= Char.f)
        || (ch >= Char.A && ch <= Char.F)
        || ch === Char.Underscore;
}

/**
 * Reports whether a character code may appear inside an octal literal:
 * an ASCII digit `0`-`7` or the `_` digit separator.
 *
 * @param ch Character code to classify.
 * @returns `true` when `ch` is an octal digit or `_`; `false` otherwise.
 */
export function isOctal(ch: number): boolean {
    // Accepting the underscore keeps grouped literals such as 0o7_7 in one token.
    return (ch >= Char._0 && ch <= Char._7) || ch === Char.Underscore;
}

/**
 * Reports whether a character code may appear inside a binary literal:
 * `0`, `1`, or the `_` digit separator.
 *
 * @param ch Character code to classify.
 * @returns `true` when `ch` is a binary digit or `_`; `false` otherwise.
 */
export function isBinary(ch: number): boolean {
    // Accepting the underscore keeps grouped literals such as 0b0011_1111 in one token.
    return ch === Char._0 || ch === Char._1 || ch === Char.Underscore;
}
66 changes: 46 additions & 20 deletions src/client/language/tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

// tslint:disable-next-line:import-name
import Char from 'typescript-char';
import { isBinary, isDecimal, isHex, isIdentifierChar, isIdentifierStartChar, isOctal } from './characters';
import { isBinary, isDecimal, isHex, isIdentifierChar, isIdentifierStartChar, isOctal, isWhiteSpace } from './characters';
import { CharacterStream } from './characterStream';
import { TextRangeCollection } from './textRangeCollection';
import { ICharacterStream, ITextRangeCollection, IToken, ITokenizer, TextRange, TokenizerMode, TokenType } from './types';
Expand All @@ -29,13 +29,8 @@ class Token extends TextRange implements IToken {
export class Tokenizer implements ITokenizer {
private cs: ICharacterStream = new CharacterStream('');
private tokens: IToken[] = [];
private floatRegex = /[-+]?(?:(?:\d*\.\d+)|(?:\d+\.?))(?:[Ee][+-]?\d+)?/;
private mode = TokenizerMode.Full;

constructor() {
//this.floatRegex.compile();
}

public tokenize(text: string): ITextRangeCollection<IToken>;
public tokenize(text: string, start: number, length: number, mode: TokenizerMode): ITextRangeCollection<IToken>;

Expand Down Expand Up @@ -224,43 +219,74 @@ export class Tokenizer implements ITokenizer {

if (this.cs.currentChar === Char._0) {
let radix = 0;
// Try hex
if (this.cs.nextChar === Char.x || this.cs.nextChar === Char.X) {
// Try hex => hexinteger: "0" ("x" | "X") (["_"] hexdigit)+
if ((this.cs.nextChar === Char.x || this.cs.nextChar === Char.X) && isHex(this.cs.lookAhead(2))) {
this.cs.advance(2);
while (isHex(this.cs.currentChar)) {
this.cs.moveNext();
}
radix = 16;
}
// Try binary
if (this.cs.nextChar === Char.b || this.cs.nextChar === Char.B) {
// Try binary => bininteger: "0" ("b" | "B") (["_"] bindigit)+
if ((this.cs.nextChar === Char.b || this.cs.nextChar === Char.B) && isBinary(this.cs.lookAhead(2))) {
this.cs.advance(2);
while (isBinary(this.cs.currentChar)) {
this.cs.moveNext();
}
radix = 2;
}
// Try octal
if (this.cs.nextChar === Char.o || this.cs.nextChar === Char.O) {
// Try octal => octinteger: "0" ("o" | "O") (["_"] octdigit)+
if ((this.cs.nextChar === Char.o || this.cs.nextChar === Char.O) && isOctal(this.cs.lookAhead(2))) {
this.cs.advance(2);
while (isOctal(this.cs.currentChar)) {
this.cs.moveNext();
}
radix = 8;
}
if (radix > 0) {
const text = this.cs.getText().substr(start + leadingSign, this.cs.position - start - leadingSign);
if (!isNaN(parseInt(text, radix))) {
this.tokens.push(new Token(TokenType.Number, start, text.length + leadingSign));
return true;
}
}
}

let decimal = false;
// Try decimal int =>
// decinteger: nonzerodigit (["_"] digit)* | "0" (["_"] "0")*
// nonzerodigit: "1"..."9"
// digit: "0"..."9"
if (this.cs.currentChar >= Char._1 && this.cs.currentChar <= Char._9) {
while (isDecimal(this.cs.currentChar)) {
this.cs.moveNext();
}
decimal = this.cs.currentChar !== Char.Period && this.cs.currentChar !== Char.e && this.cs.currentChar !== Char.E;
}

if (this.cs.currentChar === Char._0) { // "0" (["_"] "0")*
while (this.cs.currentChar === Char._0 || this.cs.currentChar === Char.Underscore) {
this.cs.moveNext();
}
decimal = this.cs.currentChar !== Char.Period && this.cs.currentChar !== Char.e && this.cs.currentChar !== Char.E;
}

if (decimal) {
const text = this.cs.getText().substr(start + leadingSign, this.cs.position - start - leadingSign);
if (radix > 0 && parseInt(text.substr(2), radix)) {
if (!isNaN(parseInt(text, 10))) {
this.tokens.push(new Token(TokenType.Number, start, text.length + leadingSign));
return true;
}
}

if (isDecimal(this.cs.currentChar) || this.cs.currentChar === Char.Period) {
const candidate = this.cs.getText().substr(this.cs.position);
const re = this.floatRegex.exec(candidate);
if (re && re.length > 0 && re[0] && candidate.startsWith(re[0])) {
this.tokens.push(new Token(TokenType.Number, start, re[0].length + leadingSign));
this.cs.position = start + re[0].length + leadingSign;
// Floating point
if ((this.cs.currentChar >= Char._0 && this.cs.currentChar <= Char._9) || this.cs.currentChar === Char.Period) {
while (!isWhiteSpace(this.cs.currentChar)) {
this.cs.moveNext();
}
const text = this.cs.getText().substr(start, this.cs.position - start);
if (!isNaN(parseFloat(text))) {
this.tokens.push(new Token(TokenType.Number, start, this.cs.position - start));
return true;
}
}
Expand Down Expand Up @@ -380,7 +406,7 @@ export class Tokenizer implements ITokenizer {
case 'rf':
case 'ur':
case 'br':
return 2;
return 2;
default:
break;
}
Expand Down
38 changes: 29 additions & 9 deletions src/test/language/tokenizer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ suite('Language.Tokenizer', () => {
test('Hex number', () => {
const t = new Tokenizer();
const tokens = t.tokenize('1 0X2 0x3 0x');
assert.equal(tokens.count, 4);
assert.equal(tokens.count, 5);

assert.equal(tokens.getItemAt(0).type, TokenType.Number);
assert.equal(tokens.getItemAt(0).length, 1);
Expand All @@ -204,13 +204,16 @@ suite('Language.Tokenizer', () => {
assert.equal(tokens.getItemAt(2).type, TokenType.Number);
assert.equal(tokens.getItemAt(2).length, 3);

assert.equal(tokens.getItemAt(3).type, TokenType.Unknown);
assert.equal(tokens.getItemAt(3).length, 2);
assert.equal(tokens.getItemAt(3).type, TokenType.Number);
assert.equal(tokens.getItemAt(3).length, 1);

assert.equal(tokens.getItemAt(4).type, TokenType.Identifier);
assert.equal(tokens.getItemAt(4).length, 1);
});
test('Binary number', () => {
const t = new Tokenizer();
const tokens = t.tokenize('1 0B1 0b010 0b3 0b');
assert.equal(tokens.count, 6);
assert.equal(tokens.count, 7);

assert.equal(tokens.getItemAt(0).type, TokenType.Number);
assert.equal(tokens.getItemAt(0).length, 1);
Expand All @@ -227,13 +230,16 @@ suite('Language.Tokenizer', () => {
assert.equal(tokens.getItemAt(4).type, TokenType.Identifier);
assert.equal(tokens.getItemAt(4).length, 2);

assert.equal(tokens.getItemAt(5).type, TokenType.Unknown);
assert.equal(tokens.getItemAt(5).length, 2);
assert.equal(tokens.getItemAt(5).type, TokenType.Number);
assert.equal(tokens.getItemAt(5).length, 1);

assert.equal(tokens.getItemAt(6).type, TokenType.Identifier);
assert.equal(tokens.getItemAt(6).length, 1);
});
test('Octal number', () => {
const t = new Tokenizer();
const tokens = t.tokenize('1 0o4 0o077 -0o200 0o9 0oO');
assert.equal(tokens.count, 7);
assert.equal(tokens.count, 8);

assert.equal(tokens.getItemAt(0).type, TokenType.Number);
assert.equal(tokens.getItemAt(0).length, 1);
Expand All @@ -253,8 +259,11 @@ suite('Language.Tokenizer', () => {
assert.equal(tokens.getItemAt(5).type, TokenType.Identifier);
assert.equal(tokens.getItemAt(5).length, 2);

assert.equal(tokens.getItemAt(6).type, TokenType.Unknown);
assert.equal(tokens.getItemAt(6).length, 3);
assert.equal(tokens.getItemAt(6).type, TokenType.Number);
assert.equal(tokens.getItemAt(6).length, 1);

assert.equal(tokens.getItemAt(7).type, TokenType.Identifier);
assert.equal(tokens.getItemAt(7).length, 2);
});
test('Decimal number', () => {
const t = new Tokenizer();
Expand Down Expand Up @@ -301,6 +310,17 @@ suite('Language.Tokenizer', () => {
assert.equal(tokens.getItemAt(5).type, TokenType.Number);
assert.equal(tokens.getItemAt(5).length, 5);
});
test('Underscore numbers', () => {
    // Six space-separated literals that use `_` as a digit separator
    // (signed int, zero, float with exponent, hex, float, binary).
    const source = '+1_0_0_0 0_0 .5_00_3e-4 0xCAFE_F00D 10_000_000.0 0b_0011_1111_0100_1110';
    // Expected token length for each literal, in order of appearance.
    const expectedLengths = [8, 3, 10, 11, 12, 22];

    const tokens = new Tokenizer().tokenize(source);
    assert.equal(tokens.count, 6);

    // Every literal must come back as exactly one Number token spanning the whole literal.
    let index = 0;
    while (index < tokens.count) {
        const token = tokens.getItemAt(index);
        assert.equal(token.type, TokenType.Number);
        assert.equal(token.length, expectedLengths[index]);
        index += 1;
    }
});
test('Simple expression, leading minus', () => {
const t = new Tokenizer();
const tokens = t.tokenize('x == -y');
Expand Down

0 comments on commit cb03032

Please sign in to comment.