carbon-language · zygoloid · Feb 19, 2021 · Feb 12, 2021 · Feb 16, 2021 · Feb 16, 2021
diff --git a/lexer/tokenized_buffer.cpp b/lexer/tokenized_buffer.cpp
@@ -6,11 +6,13 @@
 
 #include <algorithm>
 #include <cmath>
+#include <iterator>
 #include <string>
 
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/FormatVariadic.h"
@@ -20,7 +22,17 @@ namespace Carbon {
 
 static auto TakeLeadingIntegerLiteral(llvm::StringRef source_text)
     -> llvm::StringRef {
-  return source_text.take_while([](char c) { return llvm::isDigit(c); });
+  // An integer literal must start with a decimal digit; otherwise there is
+  // nothing to take and the result is empty.
+  const bool starts_with_digit =
+      !source_text.empty() && llvm::isDigit(source_text.front());
+  if (!starts_with_digit)
+    return llvm::StringRef();
+  // Greedily take everything that could belong to the literal so invalid ones
+  // get better diagnostics. TODO(zygoloid): Update lexical rules to specify
+  // that a literal cannot be immediately followed by another literal or word.
+  return source_text.take_while(
+      [](char c) { return c == '_' || llvm::isAlnum(c); });
 }
 
 struct UnmatchedClosing {
@@ -45,6 +57,79 @@ struct MismatchedClosing {
   }
 };
 
+// Diagnostic for a numeric literal whose digit sequence is empty.
+struct EmptyDigitSequence {
+  struct Substitutions {};  // No parameters.
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
+  static constexpr llvm::StringLiteral Message =
+      "Empty digit sequence in numeric literal.";
+
+  // The message is fixed, so the argument is unused.
+  static auto Format(const Substitutions&) -> std::string {
+    return Message.str();
+  }
+};
+
+// Diagnostic for a character that is not a digit of the literal's radix.
+struct InvalidDigit {
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
+
+  struct Substitutions {
+    char digit;      // The offending character.
+    unsigned radix;  // Radix of the digit sequence being checked.
+  };
+  static auto Format(const Substitutions &subst) -> std::string {
+    const char *radix_name = "decimal";  // Used for any radix but 2 and 16.
+    if (subst.radix == 2) radix_name = "binary";
+    if (subst.radix == 16) radix_name = "hexadecimal";
+    const char digit_text[] = {subst.digit, '\0'};
+    return (llvm::Twine("Invalid digit '") + digit_text + "' in " +
+            radix_name + " numeric literal.").str();
+  }
+};
+
+// Diagnostic for a leading, trailing, or doubled `_` in a digit sequence.
+struct InvalidDigitSeparator {
+  struct Substitutions {};  // No parameters.
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
+  static constexpr llvm::StringLiteral Message =
+      "Misplaced digit separator in numeric literal.";
+
+  // The message is fixed, so the argument is unused.
+  static auto Format(const Substitutions&) -> std::string {
+    return Message.str();
+  }
+};
+
+// Diagnostic for separators that do not form regular decimal or hex groups.
+struct IrregularDigitSeparators {
+  static constexpr llvm::StringLiteral ShortName =
+      "syntax-irregular-digit-separators";
+  struct Substitutions {
+    unsigned radix;
+  };
+  static auto Format(const Substitutions &subst) -> std::string {
+    assert((subst.radix == 10 || subst.radix == 16) && "unexpected radix");
+    const char *radix_name = subst.radix == 10 ? "decimal" : "hexadecimal";
+    const char *group_size = subst.radix == 10 ? "3" : "4";
+    return (llvm::Twine("Digit separators in ") + radix_name +
+            " should appear every " + group_size +
+            " characters from the right.").str();
+  }
+};
+
+// Diagnostic for a `0` followed by a character other than `x` or `b`.
+struct UnknownBaseSpecifier {
+  struct Substitutions {};  // No parameters.
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
+  static constexpr llvm::StringLiteral Message =
+      "Unknown base specifier in numeric literal.";
+
+  static auto Format(const Substitutions&) -> std::string {
+    return Message.str();
+  }
+};
+
 struct UnrecognizedCharacters {
   static constexpr llvm::StringLiteral ShortName =
       "syntax-unrecognized-characters";
@@ -153,15 +238,91 @@ class TokenizedBuffer::Lexer {
     return false;
   }
 
+  struct CheckDigitSequenceResult {  // Returned by `CheckDigitSequence`.
+    bool ok;  // False if the digits cannot be turned into a value.
+    bool has_digit_separators = false;  // When ok: whether any `_` is present.
+  };
+
+  // Validates the digits of a numeric literal in `radix` (2, 10 or 16) and
+  // diagnoses each problem. `ok` is false when no value can be parsed; bad
+  // separator placement only sets `buffer.has_errors`, to aid recovery.
+  auto CheckDigitSequence(llvm::StringRef text, unsigned radix)
+      -> CheckDigitSequenceResult {
+    assert((radix == 2 || radix == 10 || radix == 16) && "unknown radix");
+
+    if (text.empty()) {
+      emitter.EmitError<EmptyDigitSequence>(
+          [&](EmptyDigitSequence::Substitutions &) {});
+      return {.ok = false};
+    }
+
+    bool seen_digit = false;
+    unsigned digit_separators = 0;
+    char max_decimal = (radix == 2) ? '1' : '9';
+    for (auto it = text.begin(), end = text.end(); it != end; ++it) {
+      char c = *it;
+      // Only 'A'-'F' are hexadecimal digits; later letters must be rejected.
+      if ((c >= '0' && c <= max_decimal) ||
+          (radix == 16 && c >= 'A' && c <= 'F')) {
+        seen_digit = true;
+        continue;
+      }
+
+      if (c == '_') {
+        // A digit separator cannot appear at the start of a digit sequence,
+        // next to another digit separator, or at the end.
+        if (it == text.begin() || it[-1] == '_' || it + 1 == text.end()) {
+          emitter.EmitError<InvalidDigitSeparator>(
+              [&](InvalidDigitSeparator::Substitutions &) {});
+          buffer.has_errors = true;
+        }
+        ++digit_separators;
+        continue;
+      }
+
+      emitter.EmitError<InvalidDigit>(
+          [&](InvalidDigit::Substitutions &subst) {
+            subst.digit = c;
+            subst.radix = radix;
+          });
+      return {.ok = false};
+    }
+
+    // Separators alone (the "_" of "0x_") carry no value to parse; the
+    // leading separator has already been diagnosed by the loop above.
+    if (!seen_digit)
+      return {.ok = false};
+    if (!digit_separators)
+      return {.ok = true};
+
+    // For decimal and hexadecimal digit sequences, digit separators must form
+    // groups of 3 or 4 digits (4 or 5 characters), respectively.
+    if (radix != 2) {
+      // Walk back group by group; every stop must be a separator, none left over.
+      unsigned stride = (radix == 10 ? 4 : 5);
+      bool regular = true;
+      for (auto pos = text.end(); regular && pos - text.begin() >= stride;) {
+        pos -= stride;
+        regular = (*pos == '_');
+        digit_separators -= regular ? 1 : 0;
+      }
+      if (!regular || digit_separators) {
+        emitter.EmitError<IrregularDigitSeparators>(
+            [&](IrregularDigitSeparators::Substitutions &subst) {
+              subst.radix = radix;
+            });
+        buffer.has_errors = true;
+      }
+    }
+
+    return {.ok = true, .has_digit_separators = true};
+  }
+
   auto LexIntegerLiteral(llvm::StringRef& source_text) -> bool {
     llvm::StringRef int_text = TakeLeadingIntegerLiteral(source_text);
     if (int_text.empty()) {
       return false;
     }
-    llvm::APInt int_value;
-    if (int_text.getAsInteger(/*Radix=*/0, int_value)) {
-      return false;
-    }
 
     int int_column = current_column;
     current_column += int_text.size();
@@ -171,6 +332,55 @@ class TokenizedBuffer::Lexer {
       current_line_info->indent = int_column;
       set_indent = true;
     }
+
+    auto add_error_token = [&] {
+      buffer.AddToken({
+          .kind = TokenKind::Error(),
+          .token_line = current_line,
+          .column = int_column,
+          .error_length = static_cast<int32_t>(int_text.size()),
+      });
+      buffer.has_errors = true;
+    };
+
+    unsigned radix = 10;
+    llvm::StringRef digits = int_text;
+    if (int_text.size() >= 2 && int_text[0] == '0') {
+      if (int_text[1] == 'x') {
+        radix = 16;
+        digits = digits.drop_front(2);
+      } else if (int_text[1] == 'b') {
+        radix = 2;
+        digits = digits.drop_front(2);
+      } else {
+        emitter.EmitError<UnknownBaseSpecifier>(
+            [&](UnknownBaseSpecifier::Substitutions &subst) {});
+        add_error_token();
+        return true;
+      }
+    }
+
+    llvm::APInt int_value;
+
+    if (auto result = CheckDigitSequence(digits, radix); !result.ok) {
+      add_error_token();
+      return true;
+    } else if (result.has_digit_separators) {
+      // TODO(zygoloid): Avoid the memory allocation here.
+      std::string cleaned;
+      cleaned.reserve(digits.size());
+      std::remove_copy_if(digits.begin(), digits.end(),
+                          std::back_inserter(cleaned),
+                          [](char c) { return c == '_'; });
+      if (llvm::StringRef(cleaned).getAsInteger(radix, int_value)) {
+        llvm_unreachable("should never fail");
+      }
+    } else {
+      if (digits.getAsInteger(radix, int_value)) {
+        llvm_unreachable("should never fail");
+      }
+    }
+
     auto token = buffer.AddToken({.kind = TokenKind::IntegerLiteral(),
                                   .token_line = current_line,
                                   .column = int_column});
@@ -417,8 +627,8 @@ auto TokenizedBuffer::GetTokenText(Token token) const -> llvm::StringRef {
     return source->Text().slice(token_start, token_stop);
   }
 
-  // Refer back to the source text to preserve oddities like radix or leading
-  // 0's the author had.
+  // Refer back to the source text to preserve oddities like radix or digit
+  // separators the author included.
   if (token_info.kind == TokenKind::IntegerLiteral()) {
     auto& line_info = GetLineInfo(token_info.token_line);
     int64_t token_start = line_info.start + token_info.column;

diff --git a/lexer/tokenized_buffer_test.cpp b/lexer/tokenized_buffer_test.cpp
@@ -77,7 +77,7 @@ TEST_F(LexerTest, TracksLinesAndColumns) {
 }
 
 TEST_F(LexerTest, HandlesIntegerLiteral) {
-  auto buffer = Lex("12-578\n  1  2");
+  auto buffer = Lex("12-578\n  1  2\n0x12_3ABC\n0b10_10_11\n1_234_567");
   EXPECT_FALSE(buffer.HasErrors());
   ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::IntegerLiteral(),
@@ -104,6 +104,21 @@ TEST_F(LexerTest, HandlesIntegerLiteral) {
                            .column = 6,
                            .indent_column = 3,
                            .text = "2"},
+                          {.kind = TokenKind::IntegerLiteral(),
+                           .line = 3,
+                           .column = 1,
+                           .indent_column = 1,
+                           .text = "0x12_3ABC"},
+                          {.kind = TokenKind::IntegerLiteral(),
+                           .line = 4,
+                           .column = 1,
+                           .indent_column = 1,
+                           .text = "0b10_10_11"},
+                          {.kind = TokenKind::IntegerLiteral(),
+                           .line = 5,
+                           .column = 1,
+                           .indent_column = 1,
+                           .text = "1_234_567"},
                       }));
   auto token_12 = buffer.Tokens().begin();
   EXPECT_EQ(buffer.GetIntegerLiteral(*token_12), 12);
@@ -113,6 +128,81 @@ TEST_F(LexerTest, HandlesIntegerLiteral) {
   EXPECT_EQ(buffer.GetIntegerLiteral(*token_1), 1);
   auto token_2 = buffer.Tokens().begin() + 4;
   EXPECT_EQ(buffer.GetIntegerLiteral(*token_2), 2);
+  auto token_0x12_3abc = buffer.Tokens().begin() + 5;
+  EXPECT_EQ(buffer.GetIntegerLiteral(*token_0x12_3abc), 0x12'3abc);
+  auto token_0b10_10_11 = buffer.Tokens().begin() + 6;
+  EXPECT_EQ(buffer.GetIntegerLiteral(*token_0b10_10_11), 0b10'10'11);
+  auto token_1_234_567 = buffer.Tokens().begin() + 7;
+  EXPECT_EQ(buffer.GetIntegerLiteral(*token_1_234_567), 1'234'567);
+}
+
+TEST_F(LexerTest, ValidatesBaseSpecifier) {
+  // Plain decimal literals, plus the `0x` and `0b` prefixes followed by
+  // digits that are valid for that radix.
+  llvm::StringLiteral well_formed[] = {
+    "0", "1", "123456789000000000000000000000000000000000000", //
+    "0x0123456789ABCDEF", "0x0000000000000000000000000000000", //
+    "0b10110100101001010", "0b0000000"
+  };
+  for (llvm::StringLiteral source : well_formed) {
+    auto lexed = Lex(source);
+    EXPECT_FALSE(lexed.HasErrors()) << source;
+    ExpectedToken expected = {.kind = TokenKind::IntegerLiteral(),
+                              .line = 1, .column = 1, .indent_column = 1,
+                              .text = source};
+    ASSERT_THAT(lexed, HasTokens(llvm::ArrayRef<ExpectedToken>{expected}));
+  }
+
+  // Leading zeros, unknown or upper-case prefixes, prefixes with no digits,
+  // and characters outside the radix each lex as one error token.
+  llvm::StringLiteral malformed[] = {
+    "00", "0X123", "0o123", "0B1", "007", "123L", "123456789A", "0x", "0b",
+    "0x123abc", "0b011101201001", "0b10A"
+  };
+  for (llvm::StringLiteral source : malformed) {
+    auto lexed = Lex(source);
+    EXPECT_TRUE(lexed.HasErrors()) << source;
+    ExpectedToken expected = {.kind = TokenKind::Error(),
+                              .line = 1, .column = 1, .indent_column = 1,
+                              .text = source};
+    ASSERT_THAT(lexed, HasTokens(llvm::ArrayRef<ExpectedToken>{expected}));
+  }
+}
+
+TEST_F(LexerTest, ValidatesIntegerDigitSeparators) {
+  // Decimal groups of three and hexadecimal groups of four, counted from the
+  // right; binary literals may place separators between any two digits.
+  llvm::StringLiteral well_placed[] = {
+      "1_234",         "123_456",     "1_234_567",     //
+      "0x1_0000",      "0x1000_0000", "0x1_0000_0000", //
+      "0b1_0_1_0_1_0", "0b111_0000",
+  };
+  for (llvm::StringLiteral source : well_placed) {
+    auto lexed = Lex(source);
+    EXPECT_FALSE(lexed.HasErrors()) << source;
+    ExpectedToken expected = {.kind = TokenKind::IntegerLiteral(),
+                              .line = 1, .column = 1, .indent_column = 1,
+                              .text = source};
+    ASSERT_THAT(lexed, HasTokens(llvm::ArrayRef<ExpectedToken>{expected}));
+  }
+
+  // Separators that are leading, trailing, doubled, or that split decimal or
+  // hexadecimal digits into irregular groups.
+  llvm::StringLiteral misplaced[] = {
+      "12_34",    "123_4_6_789", "12_3456_789", "12__345",     "1_",         //
+      "0x_1234",  "0x123_",      "0x12_3",      "0x_234_5678", "0x1234_567", //
+      "0b_10101", "0b1__01",     "0b1011_",     "0b1_01_01_",
+  };
+  for (llvm::StringLiteral source : misplaced) {
+    auto lexed = Lex(source);
+    EXPECT_TRUE(lexed.HasErrors()) << source;
+    // We expect to produce a token even for a literal containing invalid digit
+    // separators, for better error recovery.
+    ExpectedToken expected = {.kind = TokenKind::IntegerLiteral(),
+                              .line = 1, .column = 1, .indent_column = 1,
+                              .text = source};
+    ASSERT_THAT(lexed, HasTokens(llvm::ArrayRef<ExpectedToken>{expected}));
+  }
+}
 
 TEST_F(LexerTest, HandlesGarbageCharacters) {