From 9d14df468e61dfd27fce6c4b4e9b7c2119c03120 Mon Sep 17 00:00:00 2001
From: Mariya Podchishchaeva <mariya.podchishchaeva@intel.com>
Date: Tue, 16 Jul 2024 14:33:29 +0200
Subject: [PATCH] [clang] Inject tokens containing #embed back into token
 stream (#97274)

Summary:
Instead of playing "whack-a-mole" with each place where #embed should
be expanded as a comma-separated list, just inject each byte back into
the token stream as its own token, with the tokens separated by commas.

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:


Differential Revision: https://phabricator.intern.facebook.com/D59822402
---
 clang/include/clang/Basic/TokenKinds.def    |  3 ++
 clang/include/clang/Basic/TokenKinds.h      |  2 +-
 clang/include/clang/Lex/Preprocessor.h      |  7 +--
 clang/include/clang/Parse/Parser.h          |  3 +-
 clang/lib/Parse/ParseExpr.cpp               | 55 ++++++++++-----------
 clang/lib/Parse/ParseTemplate.cpp           | 41 +++++----------
 clang/lib/Sema/SemaExpr.cpp                 |  4 +-
 clang/test/Preprocessor/embed_codegen.cpp   |  3 +-
 clang/test/Preprocessor/embed_constexpr.cpp |  3 +-
 clang/test/Preprocessor/embed_weird.cpp     | 21 ++++----
 10 files changed, 63 insertions(+), 79 deletions(-)

diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def
index e60544259090983..7f4912b9bcd961c 100644
--- a/clang/include/clang/Basic/TokenKinds.def
+++ b/clang/include/clang/Basic/TokenKinds.def
@@ -165,6 +165,9 @@ TOK(raw_identifier)      // Used only in raw lexing mode.
 // C99 6.4.4.2: Floating Constants
 TOK(numeric_constant)    // 0x123
 
+// Directly holds numerical value. Used to process C23 #embed.
+TOK(binary_data)
+
 // C99 6.4.4: Character Constants
 TOK(char_constant)       // 'a'
 TOK(wide_char_constant)  // L'b'
diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h
index e5183a27d2bc5f6..1b133dde895876b 100644
--- a/clang/include/clang/Basic/TokenKinds.h
+++ b/clang/include/clang/Basic/TokenKinds.h
@@ -98,7 +98,7 @@ inline bool isLiteral(TokenKind K) {
   return K == tok::numeric_constant || K == tok::char_constant ||
          K == tok::wide_char_constant || K == tok::utf8_char_constant ||
          K == tok::utf16_char_constant || K == tok::utf32_char_constant ||
-         isStringLiteral(K) || K == tok::header_name;
+         isStringLiteral(K) || K == tok::header_name || K == tok::binary_data;
 }
 
 /// Return true if this is any of tok::annot_* kinds.
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index be3334b9807463c..fc7d0053f2323b6 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -2123,17 +2123,18 @@ class Preprocessor {
   char
   getSpellingOfSingleCharacterNumericConstant(const Token &Tok,
                                               bool *Invalid = nullptr) const {
-    assert(Tok.is(tok::numeric_constant) &&
+    assert((Tok.is(tok::numeric_constant) || Tok.is(tok::binary_data)) &&
            Tok.getLength() == 1 && "Called on unsupported token");
     assert(!Tok.needsCleaning() && "Token can't need cleaning with length 1");
 
     // If the token is carrying a literal data pointer, just use it.
     if (const char *D = Tok.getLiteralData())
-      return *D;
+      return (Tok.getKind() == tok::binary_data) ? *D : *D - '0';
 
+    assert(Tok.is(tok::numeric_constant) && "binary data with no data");
     // Otherwise, fall back on getCharacterData, which is slower, but always
     // works.
-    return *SourceMgr.getCharacterData(Tok.getLocation(), Invalid);
+    return *SourceMgr.getCharacterData(Tok.getLocation(), Invalid) - '0';
   }
 
   /// Retrieve the name of the immediate macro expansion.
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 126fea0aef2a7f8..93e60be512aae05 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -2127,7 +2127,7 @@ class Parser : public CodeCompletionHandler {
   };
   ExprResult ParseInitializerWithPotentialDesignator(DesignatorCompletionInfo);
   ExprResult createEmbedExpr();
-  void ExpandEmbedDirective(SmallVectorImpl<Expr *> &Exprs);
+  void injectEmbedTokens();
 
   //===--------------------------------------------------------------------===//
   // clang Expressions
@@ -3834,7 +3834,6 @@ class Parser : public CodeCompletionHandler {
   AnnotateTemplateIdTokenAsType(CXXScopeSpec &SS,
                                 ImplicitTypenameContext AllowImplicitTypename,
                                 bool IsClassName = false);
-  void ExpandEmbedIntoTemplateArgList(TemplateArgList &TemplateArgs);
   bool ParseTemplateArgumentList(TemplateArgList &TemplateArgs,
                                  TemplateTy Template, SourceLocation OpenLoc);
   ParsedTemplateArgument ParseTemplateTemplateArgument();
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index 259bfa1d77cdb2d..3d7c58e5b3c3cdd 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -1099,6 +1099,7 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind,
 
     // primary-expression
   case tok::numeric_constant:
+  case tok::binary_data:
     // constant: integer-constant
     // constant: floating-constant
 
@@ -1148,18 +1149,9 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind,
   }
 
   case tok::annot_embed: {
-    // We've met #embed in a context where a single value is expected. Take last
-    // element from #embed data as if it were a comma expression.
-    EmbedAnnotationData *Data =
-        reinterpret_cast<EmbedAnnotationData *>(Tok.getAnnotationValue());
-    SourceLocation StartLoc = ConsumeAnnotationToken();
-    ASTContext &Context = Actions.getASTContext();
-    Res = IntegerLiteral::Create(Context,
-                                 llvm::APInt(CHAR_BIT, Data->BinaryData.back()),
-                                 Context.UnsignedCharTy, StartLoc);
-    if (Data->BinaryData.size() > 1)
-      Diag(StartLoc, diag::warn_unused_comma_left_operand);
-    break;
+    injectEmbedTokens();
+    return ParseCastExpression(ParseKind, isAddressOfOperand, isTypeCast,
+                               isVectorLiteral, NotPrimaryExpression);
   }
 
   case tok::kw___super:
@@ -3584,15 +3576,29 @@ ExprResult Parser::ParseFoldExpression(ExprResult LHS,
                                   T.getCloseLocation());
 }
 
-void Parser::ExpandEmbedDirective(SmallVectorImpl<Expr *> &Exprs) {
+void Parser::injectEmbedTokens() {
   EmbedAnnotationData *Data =
       reinterpret_cast<EmbedAnnotationData *>(Tok.getAnnotationValue());
-  SourceLocation StartLoc = ConsumeAnnotationToken();
-  ASTContext &Context = Actions.getASTContext();
-  for (auto Byte : Data->BinaryData) {
-    Exprs.push_back(IntegerLiteral::Create(Context, llvm::APInt(CHAR_BIT, Byte),
-                                           Context.UnsignedCharTy, StartLoc));
-  }
+  MutableArrayRef<Token> Toks(PP.getPreprocessorAllocator().Allocate<Token>(
+                                  Data->BinaryData.size() * 2 - 1),
+                              Data->BinaryData.size() * 2 - 1);
+  unsigned I = 0;
+  for (auto &Byte : Data->BinaryData) {
+    Toks[I].startToken();
+    Toks[I].setKind(tok::binary_data);
+    Toks[I].setLocation(Tok.getLocation());
+    Toks[I].setLength(1);
+    Toks[I].setLiteralData(&Byte);
+    if (I != ((Data->BinaryData.size() - 1) * 2)) {
+      Toks[I + 1].startToken();
+      Toks[I + 1].setKind(tok::comma);
+      Toks[I + 1].setLocation(Tok.getLocation());
+    }
+    I += 2;
+  }
+  PP.EnterTokenStream(std::move(Toks), /*DisableMacroExpansion=*/true,
+                      /*IsReinject=*/false);
+  ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true);
 }
 
 /// ParseExpressionList - Used for C/C++ (argument-)expression-list.
@@ -3630,17 +3636,8 @@ bool Parser::ParseExpressionList(SmallVectorImpl<Expr *> &Exprs,
     if (getLangOpts().CPlusPlus11 && Tok.is(tok::l_brace)) {
       Diag(Tok, diag::warn_cxx98_compat_generalized_initializer_lists);
       Expr = ParseBraceInitializer();
-    } else if (Tok.is(tok::annot_embed)) {
-      ExpandEmbedDirective(Exprs);
-      if (Tok.isNot(tok::comma))
-        break;
-      Token Comma = Tok;
-      ConsumeToken();
-      checkPotentialAngleBracketDelimiter(Comma);
-      continue;
-    } else {
+    } else
       Expr = ParseAssignmentExpression();
-    }
 
     if (EarlyTypoCorrection)
       Expr = Actions.CorrectDelayedTyposInExpr(Expr);
diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp
index 7e30afa2c64a4ff..a5130f56600e54b 100644
--- a/clang/lib/Parse/ParseTemplate.cpp
+++ b/clang/lib/Parse/ParseTemplate.cpp
@@ -1523,19 +1523,6 @@ ParsedTemplateArgument Parser::ParseTemplateArgument() {
                                 ExprArg.get(), Loc);
 }
 
-void Parser::ExpandEmbedIntoTemplateArgList(TemplateArgList &TemplateArgs) {
-  EmbedAnnotationData *Data =
-      reinterpret_cast<EmbedAnnotationData *>(Tok.getAnnotationValue());
-  SourceLocation StartLoc = ConsumeAnnotationToken();
-  ASTContext &Context = Actions.getASTContext();
-  for (auto Byte : Data->BinaryData) {
-    Expr *E = IntegerLiteral::Create(Context, llvm::APInt(CHAR_BIT, Byte),
-                                     Context.UnsignedCharTy, StartLoc);
-    TemplateArgs.push_back(
-        ParsedTemplateArgument(ParsedTemplateArgument::NonType, E, StartLoc));
-  }
-}
-
 /// ParseTemplateArgumentList - Parse a C++ template-argument-list
 /// (C++ [temp.names]). Returns true if there was an error.
 ///
@@ -1560,24 +1547,20 @@ bool Parser::ParseTemplateArgumentList(TemplateArgList &TemplateArgs,
 
   do {
     PreferredType.enterFunctionArgument(Tok.getLocation(), RunSignatureHelp);
-    if (Tok.is(tok::annot_embed)) {
-      ExpandEmbedIntoTemplateArgList(TemplateArgs);
-    } else {
-      ParsedTemplateArgument Arg = ParseTemplateArgument();
-      SourceLocation EllipsisLoc;
-      if (TryConsumeToken(tok::ellipsis, EllipsisLoc))
-        Arg = Actions.ActOnPackExpansion(Arg, EllipsisLoc);
-
-      if (Arg.isInvalid()) {
-        if (PP.isCodeCompletionReached() && !CalledSignatureHelp)
-          RunSignatureHelp();
-        return true;
-      }
-
-      // Save this template argument.
-      TemplateArgs.push_back(Arg);
+    ParsedTemplateArgument Arg = ParseTemplateArgument();
+    SourceLocation EllipsisLoc;
+    if (TryConsumeToken(tok::ellipsis, EllipsisLoc))
+      Arg = Actions.ActOnPackExpansion(Arg, EllipsisLoc);
+
+    if (Arg.isInvalid()) {
+      if (PP.isCodeCompletionReached() && !CalledSignatureHelp)
+        RunSignatureHelp();
+      return true;
     }
 
+    // Save this template argument.
+    TemplateArgs.push_back(Arg);
+
     // If the next token is a comma, consume it and keep reading
     // arguments.
   } while (TryConsumeToken(tok::comma));
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index b0f953ea0a13a24..0698c3fbe98d298 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -3643,9 +3643,9 @@ bool Sema::CheckLoopHintExpr(Expr *E, SourceLocation Loc, bool AllowZero) {
 ExprResult Sema::ActOnNumericConstant(const Token &Tok, Scope *UDLScope) {
   // Fast path for a single digit (which is quite common).  A single digit
   // cannot have a trigraph, escaped newline, radix prefix, or suffix.
-  if (Tok.getLength() == 1) {
+  if (Tok.getLength() == 1 || Tok.getKind() == tok::binary_data) {
     const char Val = PP.getSpellingOfSingleCharacterNumericConstant(Tok);
-    return ActOnIntegerConstant(Tok.getLocation(), Val-'0');
+    return ActOnIntegerConstant(Tok.getLocation(), Val);
   }
 
   SmallString<128> SpellingBuffer;
diff --git a/clang/test/Preprocessor/embed_codegen.cpp b/clang/test/Preprocessor/embed_codegen.cpp
index 64110afc162d726..201bf300bc66942 100644
--- a/clang/test/Preprocessor/embed_codegen.cpp
+++ b/clang/test/Preprocessor/embed_codegen.cpp
@@ -43,8 +43,9 @@ a
 };
 
 // CHECK: store i32 107, ptr %b, align 4
-int b =
+int b = (
 #embed<jk.txt>
+    )
 ;
 
 
diff --git a/clang/test/Preprocessor/embed_constexpr.cpp b/clang/test/Preprocessor/embed_constexpr.cpp
index 1cadff76b4890ad..a7857641a2e8df7 100644
--- a/clang/test/Preprocessor/embed_constexpr.cpp
+++ b/clang/test/Preprocessor/embed_constexpr.cpp
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%S/Inputs -verify -Wno-c23-extensions
 // RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%S/Inputs -verify -fexperimental-new-constant-interpreter -Wno-c23-extensions
+// expected-no-diagnostics
 
 constexpr int value(int a, int b) {
   return a + b;
@@ -46,7 +47,7 @@ int array[
 static_assert(sizeof(array) / sizeof(int) == 'j');
 
 constexpr int comma_expr = (
-#embed <jk.txt> // expected-warning {{left operand of comma operator has no effect}}
+#embed <jk.txt>
 );
 static_assert(comma_expr == 'k');
 
diff --git a/clang/test/Preprocessor/embed_weird.cpp b/clang/test/Preprocessor/embed_weird.cpp
index 31b622c848d6a98..cc73a88e5a657be 100644
--- a/clang/test/Preprocessor/embed_weird.cpp
+++ b/clang/test/Preprocessor/embed_weird.cpp
@@ -27,7 +27,7 @@ _Static_assert(
 _Static_assert(sizeof(
 #embed <single_byte.txt>
 ) ==
-sizeof(unsigned char)
+sizeof(int)
 , ""
 );
 _Static_assert(sizeof
@@ -35,9 +35,9 @@ _Static_assert(sizeof
 , ""
 );
 _Static_assert(sizeof(
-#embed <jk.txt> // expected-warning {{left operand of comma operator has no effect}}
+#embed <jk.txt>
 ) ==
-sizeof(unsigned char)
+sizeof(int)
 , ""
 );
 
@@ -73,10 +73,10 @@ void do_stuff() {
 // Ensure that we don't accidentally allow you to initialize an unsigned char *
 // from embedded data; the data is modeled as a string literal internally, but
 // is not actually a string literal.
-const unsigned char *ptr =
+const unsigned char *ptr = (
 #embed <jk.txt> // expected-warning {{left operand of comma operator has no effect}}
-; // c-error@-2 {{incompatible integer to pointer conversion initializing 'const unsigned char *' with an expression of type 'unsigned char'}} \
-     cxx-error@-2 {{cannot initialize a variable of type 'const unsigned char *' with an rvalue of type 'unsigned char'}}
+    ); // c-error@-2 {{incompatible integer to pointer conversion initializing 'const unsigned char *' with an expression of type 'int'}} \
+     cxx-error@-2 {{cannot initialize a variable of type 'const unsigned char *' with an rvalue of type 'int'}}
 
 // However, there are some cases where this is fine and should work.
 const unsigned char *null_ptr_1 =
@@ -101,11 +101,10 @@ constexpr unsigned char ch =
 ;
 static_assert(ch == 0);
 
-void foobar(float x, char y, char z); // cxx-note {{candidate function not viable: requires 3 arguments, but 1 was provided}}
-                                      // c-note@-1 {{declared here}}
-void g1() { foobar((float) // cxx-error {{no matching function for call to 'foobar'}}
-#embed "numbers.txt" limit(3) // expected-warning {{left operand of comma operator has no effect}}
-); // c-error {{too few arguments to function call, expected 3, have 1}}
+void foobar(float x, char y, char z);
+void g1() { foobar((float)
+#embed "numbers.txt" limit(3)
+);
 }
 
 #if __cplusplus