Handle numbers formatted with underscores in tokenizer (DonJayamanne#…

…1819) * Undo changes * Test fixes * Increase timeout * Remove double event listening * Remove test * Revert "Remove test" This reverts commit e240c3f. * Revert "Remove double event listening" This reverts commit af573be. * #1096 The if statement is automatically formatted incorrectly * Merge fix * Add more tests * More tests * Typo * Test * Also better handle multiline arguments * Add a couple missing periods [skip ci] * Undo changes * Test fixes * Increase timeout * Remove double event listening * Remove test * Revert "Remove test" This reverts commit e240c3f. * Revert "Remove double event listening" This reverts commit af573be. * Merge fix * DonJayamanne#1257 On type formatting errors for args and kwargs * Handle f-strings * Stop importing from test code * DonJayamanne#1308 Single line statements leading to an indentation on the next line * #726 editing python after inline if statement invalid indent * Undo change * Move constant * Harden LS startup error checks * #1364 Intellisense doesn't work after specific const string * Telemetry for the analysis engine * PR feedback * Fix typo * Test baseline update * Jedi 0.12 * Priority to goto_definition * News * Replace unzip * Linux flavors + test * Grammar check * Grammar test * Test baselines * Add news * Pin dependency [skip ci] * Specify markdown as preferable format * Improve function argument detection * Specify markdown * Pythia setting * Baseline updates * Baseline update * Improve startup * Handle missing interpreter better * Handle interpreter change * Delete old file * Fix LS startup time reporting * Remove Async suffix from IFileSystem * Remove Pythia * Remove pre-packaged MSIL * Exe name on Unix * Plain linux * Fix casing * Fix message * Update PTVS engine activation steps * Type formatter eats space in from . 
* Fix casing * Remove flag * Don't wait for LS * Small test fixes * Update hover baselines * Rename the engine * Formatting 1 * Add support for 'rf' strings * Add two spaces before comment per PEP * Fix @ operator spacing * Handle module and unary ops * Type hints * Fix typo * Trailing comma * Require space after if * underscore numbers * Update list of keywords * PR feedback * News * Use a bit more Markdown in the news entry
mostafaeweda · Aug 30, 2018 · cb03032 · cb03032
1 parent 114afdd
commit cb03032
Show file tree

Hide file tree

Showing 4 changed files with 84 additions and 33 deletions.
diff --git a/news/2 Fixes/1779.md b/news/2 Fixes/1779.md
@@ -0,0 +1 @@
+`editor.formatOnType` no longer breaks numbers formatted with underscores.
diff --git a/src/client/language/characters.ts b/src/client/language/characters.ts
@@ -83,18 +83,22 @@ export function isLineBreak(ch: number): boolean {
  return ch === Char.CarriageReturn || ch === Char.LineFeed;
 }
 
+export function isNumber(ch: number): boolean {
+ return ch >= Char._0 && ch <= Char._9 || ch === Char.Underscore;
+}
+
 export function isDecimal(ch: number): boolean {
- return ch >= Char._0 && ch <= Char._9;
+ return ch >= Char._0 && ch <= Char._9 || ch === Char.Underscore;
 }
 
 export function isHex(ch: number): boolean {
- return isDecimal(ch) || (ch >= Char.a && ch <= Char.f) || (ch >= Char.A && ch <= Char.F);
+ return isDecimal(ch) || (ch >= Char.a && ch <= Char.f) || (ch >= Char.A && ch <= Char.F) || ch === Char.Underscore;
 }
 
 export function isOctal(ch: number): boolean {
- return ch >= Char._0 && ch <= Char._7;
+ return ch >= Char._0 && ch <= Char._7 || ch === Char.Underscore;
 }
 
 export function isBinary(ch: number): boolean {
- return ch === Char._0 || ch === Char._1;
+ return ch === Char._0 || ch === Char._1 || ch === Char.Underscore;
 }
diff --git a/src/client/language/tokenizer.ts b/src/client/language/tokenizer.ts
@@ -4,7 +4,7 @@
 
 // tslint:disable-next-line:import-name
 import Char from 'typescript-char';
-import { isBinary, isDecimal, isHex, isIdentifierChar, isIdentifierStartChar, isOctal } from './characters';
+import { isBinary, isDecimal, isHex, isIdentifierChar, isIdentifierStartChar, isOctal, isWhiteSpace } from './characters';
 import { CharacterStream } from './characterStream';
 import { TextRangeCollection } from './textRangeCollection';
 import { ICharacterStream, ITextRangeCollection, IToken, ITokenizer, TextRange, TokenizerMode, TokenType } from './types';
@@ -29,13 +29,8 @@ class Token extends TextRange implements IToken {
 export class Tokenizer implements ITokenizer {
  private cs: ICharacterStream = new CharacterStream('');
  private tokens: IToken[] = [];
- private floatRegex = /[-+]?(?:(?:\d*\.\d+)|(?:\d+\.?))(?:[Ee][+-]?\d+)?/;
  private mode = TokenizerMode.Full;
 
- constructor() {
- //this.floatRegex.compile();
- }
-
  public tokenize(text: string): ITextRangeCollection<IToken>;
  public tokenize(text: string, start: number, length: number, mode: TokenizerMode): ITextRangeCollection<IToken>;
 
@@ -224,43 +219,74 @@ export class Tokenizer implements ITokenizer {
 
  if (this.cs.currentChar === Char._0) {
  let radix = 0;
- // Try hex
- if (this.cs.nextChar === Char.x || this.cs.nextChar === Char.X) {
+ // Try hex => hexinteger: "0" ("x" | "X") (["_"] hexdigit)+
+ if ((this.cs.nextChar === Char.x || this.cs.nextChar === Char.X) && isHex(this.cs.lookAhead(2))) {
  this.cs.advance(2);
  while (isHex(this.cs.currentChar)) {
  this.cs.moveNext();
  }
  radix = 16;
  }
- // Try binary
- if (this.cs.nextChar === Char.b || this.cs.nextChar === Char.B) {
+ // Try binary => bininteger: "0" ("b" | "B") (["_"] bindigit)+
+ if ((this.cs.nextChar === Char.b || this.cs.nextChar === Char.B) && isBinary(this.cs.lookAhead(2))) {
  this.cs.advance(2);
  while (isBinary(this.cs.currentChar)) {
  this.cs.moveNext();
  }
  radix = 2;
  }
- // Try octal
- if (this.cs.nextChar === Char.o || this.cs.nextChar === Char.O) {
+ // Try octal => octinteger: "0" ("o" | "O") (["_"] octdigit)+
+ if ((this.cs.nextChar === Char.o || this.cs.nextChar === Char.O) && isOctal(this.cs.lookAhead(2))) {
  this.cs.advance(2);
  while (isOctal(this.cs.currentChar)) {
  this.cs.moveNext();
  }
  radix = 8;
  }
+ if (radix > 0) {
+ const text = this.cs.getText().substr(start + leadingSign, this.cs.position - start - leadingSign);
+ if (!isNaN(parseInt(text, radix))) {
+ this.tokens.push(new Token(TokenType.Number, start, text.length + leadingSign));
+ return true;
+ }
+ }
+ }
+
+ let decimal = false;
+ // Try decimal int =>
+ // decinteger: nonzerodigit (["_"] digit)* | "0" (["_"] "0")*
+ // nonzerodigit: "1"..."9"
+ // digit: "0"..."9"
+ if (this.cs.currentChar >= Char._1 && this.cs.currentChar <= Char._9) {
+ while (isDecimal(this.cs.currentChar)) {
+ this.cs.moveNext();
+ }
+ decimal = this.cs.currentChar !== Char.Period && this.cs.currentChar !== Char.e && this.cs.currentChar !== Char.E;
+ }
+
+ if (this.cs.currentChar === Char._0) { // "0" (["_"] "0")*
+ while (this.cs.currentChar === Char._0 || this.cs.currentChar === Char.Underscore) {
+ this.cs.moveNext();
+ }
+ decimal = this.cs.currentChar !== Char.Period && this.cs.currentChar !== Char.e && this.cs.currentChar !== Char.E;
+ }
+
+ if (decimal) {
  const text = this.cs.getText().substr(start + leadingSign, this.cs.position - start - leadingSign);
- if (radix > 0 && parseInt(text.substr(2), radix)) {
+ if (!isNaN(parseInt(text, 10))) {
  this.tokens.push(new Token(TokenType.Number, start, text.length + leadingSign));
  return true;
  }
  }
 
- if (isDecimal(this.cs.currentChar) || this.cs.currentChar === Char.Period) {
- const candidate = this.cs.getText().substr(this.cs.position);
- const re = this.floatRegex.exec(candidate);
- if (re && re.length > 0 && re[0] && candidate.startsWith(re[0])) {
- this.tokens.push(new Token(TokenType.Number, start, re[0].length + leadingSign));
- this.cs.position = start + re[0].length + leadingSign;
+ // Floating point
+ if ((this.cs.currentChar >= Char._0 && this.cs.currentChar <= Char._9) || this.cs.currentChar === Char.Period) {
+ while (!isWhiteSpace(this.cs.currentChar)) {
+ this.cs.moveNext();
+ }
+ const text = this.cs.getText().substr(start, this.cs.position - start);
+ if (!isNaN(parseFloat(text))) {
+ this.tokens.push(new Token(TokenType.Number, start, this.cs.position - start));
  return true;
  }
  }
@@ -380,7 +406,7 @@ export class Tokenizer implements ITokenizer {
  case 'rf':
  case 'ur':
  case 'br':
-  return 2;
+ return 2;
  default:
  break;
  }

diff --git a/src/test/language/tokenizer.test.ts b/src/test/language/tokenizer.test.ts
@@ -193,7 +193,7 @@ suite('Language.Tokenizer', () => {
  test('Hex number', () => {
  const t = new Tokenizer();
  const tokens = t.tokenize('1 0X2 0x3 0x');
- assert.equal(tokens.count, 4);
+ assert.equal(tokens.count, 5);
 
  assert.equal(tokens.getItemAt(0).type, TokenType.Number);
  assert.equal(tokens.getItemAt(0).length, 1);
@@ -204,13 +204,16 @@ suite('Language.Tokenizer', () => {
  assert.equal(tokens.getItemAt(2).type, TokenType.Number);
  assert.equal(tokens.getItemAt(2).length, 3);
 
- assert.equal(tokens.getItemAt(3).type, TokenType.Unknown);
- assert.equal(tokens.getItemAt(3).length, 2);
+ assert.equal(tokens.getItemAt(3).type, TokenType.Number);
+ assert.equal(tokens.getItemAt(3).length, 1);
+
+ assert.equal(tokens.getItemAt(4).type, TokenType.Identifier);
+ assert.equal(tokens.getItemAt(4).length, 1);
  });
  test('Binary number', () => {
  const t = new Tokenizer();
  const tokens = t.tokenize('1 0B1 0b010 0b3 0b');
- assert.equal(tokens.count, 6);
+ assert.equal(tokens.count, 7);
 
  assert.equal(tokens.getItemAt(0).type, TokenType.Number);
  assert.equal(tokens.getItemAt(0).length, 1);
@@ -227,13 +230,16 @@ suite('Language.Tokenizer', () => {
  assert.equal(tokens.getItemAt(4).type, TokenType.Identifier);
  assert.equal(tokens.getItemAt(4).length, 2);
 
- assert.equal(tokens.getItemAt(5).type, TokenType.Unknown);
- assert.equal(tokens.getItemAt(5).length, 2);
+ assert.equal(tokens.getItemAt(5).type, TokenType.Number);
+ assert.equal(tokens.getItemAt(5).length, 1);
+
+ assert.equal(tokens.getItemAt(6).type, TokenType.Identifier);
+ assert.equal(tokens.getItemAt(6).length, 1);
  });
  test('Octal number', () => {
  const t = new Tokenizer();
  const tokens = t.tokenize('1 0o4 0o077 -0o200 0o9 0oO');
- assert.equal(tokens.count, 7);
+ assert.equal(tokens.count, 8);
 
  assert.equal(tokens.getItemAt(0).type, TokenType.Number);
  assert.equal(tokens.getItemAt(0).length, 1);
@@ -253,8 +259,11 @@ suite('Language.Tokenizer', () => {
  assert.equal(tokens.getItemAt(5).type, TokenType.Identifier);
  assert.equal(tokens.getItemAt(5).length, 2);
 
- assert.equal(tokens.getItemAt(6).type, TokenType.Unknown);
- assert.equal(tokens.getItemAt(6).length, 3);
+ assert.equal(tokens.getItemAt(6).type, TokenType.Number);
+ assert.equal(tokens.getItemAt(6).length, 1);
+
+ assert.equal(tokens.getItemAt(7).type, TokenType.Identifier);
+ assert.equal(tokens.getItemAt(7).length, 2);
  });
  test('Decimal number', () => {
  const t = new Tokenizer();
@@ -301,6 +310,17 @@ suite('Language.Tokenizer', () => {
  assert.equal(tokens.getItemAt(5).type, TokenType.Number);
  assert.equal(tokens.getItemAt(5).length, 5);
  });
+ test('Underscore numbers', () => {
+ const t = new Tokenizer();
+ const tokens = t.tokenize('+1_0_0_0 0_0 .5_00_3e-4 0xCAFE_F00D 10_000_000.0 0b_0011_1111_0100_1110');
+ const lengths = [8, 3, 10, 11, 12, 22];
+ assert.equal(tokens.count, 6);
+
+ for (let i = 0; i < tokens.count; i += 1) {
+ assert.equal(tokens.getItemAt(i).type, TokenType.Number);
+ assert.equal(tokens.getItemAt(i).length, lengths[i]);
+ }
+ });
  test('Simple expression, leading minus', () => {
  const t = new Tokenizer();
  const tokens = t.tokenize('x == -y');