[clang] Inject tokens containing #embed back into token stream #97274

Fznamznon · 2024-07-01T10:06:06Z

Instead of playing "whack a mole" with places where #embed should be expanded as comma-separated list, just inject each byte as a token back into the stream, separated by commas.

llvmbot · 2024-07-01T10:06:36Z

@llvm/pr-subscribers-clang

Author: Mariya Podchishchaeva (Fznamznon)

Changes

Instead of playing "whack a mole" with places where #embed should be expanded as comma-separated list, just inject each byte as a token back into the stream, separated by commas.

Full diff: https://github.com/llvm/llvm-project/pull/97274.diff

10 Files Affected:

(modified) clang/include/clang/Basic/TokenKinds.def (+3)
(modified) clang/include/clang/Basic/TokenKinds.h (+1-1)
(modified) clang/include/clang/Lex/Preprocessor.h (+3-2)
(modified) clang/include/clang/Parse/Parser.h (+1-2)
(modified) clang/lib/Parse/ParseExpr.cpp (+25-28)
(modified) clang/lib/Parse/ParseTemplate.cpp (+12-29)
(modified) clang/lib/Sema/SemaExpr.cpp (+4-2)
(modified) clang/test/Preprocessor/embed_codegen.cpp (+2-1)
(modified) clang/test/Preprocessor/embed_constexpr.cpp (+2-1)
(modified) clang/test/Preprocessor/embed_weird.cpp (+10-11)

diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def
index 37d570ca5e75b..1bc9c59576f33 100644
--- a/clang/include/clang/Basic/TokenKinds.def
+++ b/clang/include/clang/Basic/TokenKinds.def
@@ -165,6 +165,9 @@ TOK(raw_identifier)      // Used only in raw lexing mode.
 // C99 6.4.4.2: Floating Constants
 TOK(numeric_constant)    // 0x123
 
+// Directly holds numerical value. Used to process C23 #embed.
+TOK(binary_data)
+
 // C99 6.4.4: Character Constants
 TOK(char_constant)       // 'a'
 TOK(wide_char_constant)  // L'b'
diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h
index e5183a27d2bc5..1b133dde89587 100644
--- a/clang/include/clang/Basic/TokenKinds.h
+++ b/clang/include/clang/Basic/TokenKinds.h
@@ -98,7 +98,7 @@ inline bool isLiteral(TokenKind K) {
   return K == tok::numeric_constant || K == tok::char_constant ||
          K == tok::wide_char_constant || K == tok::utf8_char_constant ||
          K == tok::utf16_char_constant || K == tok::utf32_char_constant ||
-         isStringLiteral(K) || K == tok::header_name;
+         isStringLiteral(K) || K == tok::header_name || K == tok::binary_data;
 }
 
 /// Return true if this is any of tok::annot_* kinds.
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index be3334b980746..8e30756da2a01 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -2123,8 +2123,9 @@ class Preprocessor {
   char
   getSpellingOfSingleCharacterNumericConstant(const Token &Tok,
                                               bool *Invalid = nullptr) const {
-    assert(Tok.is(tok::numeric_constant) &&
-           Tok.getLength() == 1 && "Called on unsupported token");
+    assert((Tok.is(tok::numeric_constant) || Tok.is(tok::binary_data)) &&
+                                                Tok.getLength() == 1 &&
+                                                "Called on unsupported token");
     assert(!Tok.needsCleaning() && "Token can't need cleaning with length 1");
 
     // If the token is carrying a literal data pointer, just use it.
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 6880fa4bb0b03..7bc2280764c5b 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -2123,7 +2123,7 @@ class Parser : public CodeCompletionHandler {
   };
   ExprResult ParseInitializerWithPotentialDesignator(DesignatorCompletionInfo);
   ExprResult createEmbedExpr();
-  void ExpandEmbedDirective(SmallVectorImpl<Expr *> &Exprs);
+  void injectEmbedTokens();
 
   //===--------------------------------------------------------------------===//
   // clang Expressions
@@ -3830,7 +3830,6 @@ class Parser : public CodeCompletionHandler {
   AnnotateTemplateIdTokenAsType(CXXScopeSpec &SS,
                                 ImplicitTypenameContext AllowImplicitTypename,
                                 bool IsClassName = false);
-  void ExpandEmbedIntoTemplateArgList(TemplateArgList &TemplateArgs);
   bool ParseTemplateArgumentList(TemplateArgList &TemplateArgs,
                                  TemplateTy Template, SourceLocation OpenLoc);
   ParsedTemplateArgument ParseTemplateTemplateArgument();
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index 9fc3cd73f73a0..a3b800a35b55e 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -1018,6 +1018,7 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind,
 
     // primary-expression
   case tok::numeric_constant:
+  case tok::binary_data:
     // constant: integer-constant
     // constant: floating-constant
 
@@ -1067,18 +1068,9 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind,
   }
 
   case tok::annot_embed: {
-    // We've met #embed in a context where a single value is expected. Take last
-    // element from #embed data as if it were a comma expression.
-    EmbedAnnotationData *Data =
-        reinterpret_cast<EmbedAnnotationData *>(Tok.getAnnotationValue());
-    SourceLocation StartLoc = ConsumeAnnotationToken();
-    ASTContext &Context = Actions.getASTContext();
-    Res = IntegerLiteral::Create(Context,
-                                 llvm::APInt(CHAR_BIT, Data->BinaryData.back()),
-                                 Context.UnsignedCharTy, StartLoc);
-    if (Data->BinaryData.size() > 1)
-      Diag(StartLoc, diag::warn_unused_comma_left_operand);
-    break;
+    injectEmbedTokens();
+    return ParseCastExpression(ParseKind, isAddressOfOperand, isTypeCast,
+                               isVectorLiteral, NotPrimaryExpression);
   }
 
   case tok::kw___super:
@@ -3578,15 +3570,29 @@ ExprResult Parser::ParseFoldExpression(ExprResult LHS,
                                   T.getCloseLocation());
 }
 
-void Parser::ExpandEmbedDirective(SmallVectorImpl<Expr *> &Exprs) {
+void Parser::injectEmbedTokens() {
   EmbedAnnotationData *Data =
       reinterpret_cast<EmbedAnnotationData *>(Tok.getAnnotationValue());
-  SourceLocation StartLoc = ConsumeAnnotationToken();
-  ASTContext &Context = Actions.getASTContext();
-  for (auto Byte : Data->BinaryData) {
-    Exprs.push_back(IntegerLiteral::Create(Context, llvm::APInt(CHAR_BIT, Byte),
-                                           Context.UnsignedCharTy, StartLoc));
+  MutableArrayRef<Token> Toks(
+      PP.getPreprocessorAllocator().Allocate<Token>(Data->BinaryData.size() * 2 - 1),
+      Data->BinaryData.size() * 2 - 1);
+  unsigned I = 0;
+  for (auto &Byte : Data->BinaryData) {
+    Toks[I].startToken();
+    Toks[I].setKind(tok::binary_data);
+    Toks[I].setLocation(Tok.getLocation());
+    Toks[I].setLength(1);
+    Toks[I].setLiteralData(&Byte);
+    if (I != ((Data->BinaryData.size() - 1) * 2)) {
+      Toks[I + 1].startToken();
+      Toks[I + 1].setKind(tok::comma);
+      Toks[I + 1].setLocation(Tok.getLocation());
+    }
+    I += 2;
   }
+  PP.EnterTokenStream(std::move(Toks), /*DisableMacroExpansion=*/true,
+                      /*IsReinject=*/false);
+  ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true);
 }
 
 /// ParseExpressionList - Used for C/C++ (argument-)expression-list.
@@ -3624,17 +3630,8 @@ bool Parser::ParseExpressionList(SmallVectorImpl<Expr *> &Exprs,
     if (getLangOpts().CPlusPlus11 && Tok.is(tok::l_brace)) {
       Diag(Tok, diag::warn_cxx98_compat_generalized_initializer_lists);
       Expr = ParseBraceInitializer();
-    } else if (Tok.is(tok::annot_embed)) {
-      ExpandEmbedDirective(Exprs);
-      if (Tok.isNot(tok::comma))
-        break;
-      Token Comma = Tok;
-      ConsumeToken();
-      checkPotentialAngleBracketDelimiter(Comma);
-      continue;
-    } else {
+    } else
       Expr = ParseAssignmentExpression();
-    }
 
     if (EarlyTypoCorrection)
       Expr = Actions.CorrectDelayedTyposInExpr(Expr);
diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp
index 7e30afa2c64a4..a5130f56600e5 100644
--- a/clang/lib/Parse/ParseTemplate.cpp
+++ b/clang/lib/Parse/ParseTemplate.cpp
@@ -1523,19 +1523,6 @@ ParsedTemplateArgument Parser::ParseTemplateArgument() {
                                 ExprArg.get(), Loc);
 }
 
-void Parser::ExpandEmbedIntoTemplateArgList(TemplateArgList &TemplateArgs) {
-  EmbedAnnotationData *Data =
-      reinterpret_cast<EmbedAnnotationData *>(Tok.getAnnotationValue());
-  SourceLocation StartLoc = ConsumeAnnotationToken();
-  ASTContext &Context = Actions.getASTContext();
-  for (auto Byte : Data->BinaryData) {
-    Expr *E = IntegerLiteral::Create(Context, llvm::APInt(CHAR_BIT, Byte),
-                                     Context.UnsignedCharTy, StartLoc);
-    TemplateArgs.push_back(
-        ParsedTemplateArgument(ParsedTemplateArgument::NonType, E, StartLoc));
-  }
-}
-
 /// ParseTemplateArgumentList - Parse a C++ template-argument-list
 /// (C++ [temp.names]). Returns true if there was an error.
 ///
@@ -1560,24 +1547,20 @@ bool Parser::ParseTemplateArgumentList(TemplateArgList &TemplateArgs,
 
   do {
     PreferredType.enterFunctionArgument(Tok.getLocation(), RunSignatureHelp);
-    if (Tok.is(tok::annot_embed)) {
-      ExpandEmbedIntoTemplateArgList(TemplateArgs);
-    } else {
-      ParsedTemplateArgument Arg = ParseTemplateArgument();
-      SourceLocation EllipsisLoc;
-      if (TryConsumeToken(tok::ellipsis, EllipsisLoc))
-        Arg = Actions.ActOnPackExpansion(Arg, EllipsisLoc);
-
-      if (Arg.isInvalid()) {
-        if (PP.isCodeCompletionReached() && !CalledSignatureHelp)
-          RunSignatureHelp();
-        return true;
-      }
-
-      // Save this template argument.
-      TemplateArgs.push_back(Arg);
+    ParsedTemplateArgument Arg = ParseTemplateArgument();
+    SourceLocation EllipsisLoc;
+    if (TryConsumeToken(tok::ellipsis, EllipsisLoc))
+      Arg = Actions.ActOnPackExpansion(Arg, EllipsisLoc);
+
+    if (Arg.isInvalid()) {
+      if (PP.isCodeCompletionReached() && !CalledSignatureHelp)
+        RunSignatureHelp();
+      return true;
     }
 
+    // Save this template argument.
+    TemplateArgs.push_back(Arg);
+
     // If the next token is a comma, consume it and keep reading
     // arguments.
   } while (TryConsumeToken(tok::comma));
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index db44cfe1288b6..c5657b2389cd2 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -3722,9 +3722,11 @@ bool Sema::CheckLoopHintExpr(Expr *E, SourceLocation Loc, bool AllowZero) {
 ExprResult Sema::ActOnNumericConstant(const Token &Tok, Scope *UDLScope) {
   // Fast path for a single digit (which is quite common).  A single digit
   // cannot have a trigraph, escaped newline, radix prefix, or suffix.
-  if (Tok.getLength() == 1) {
+  if (Tok.getLength() == 1 || Tok.getKind() == tok::binary_data) {
     const char Val = PP.getSpellingOfSingleCharacterNumericConstant(Tok);
-    return ActOnIntegerConstant(Tok.getLocation(), Val-'0');
+    return ActOnIntegerConstant(
+        Tok.getLocation(),
+        (Tok.getKind() == tok::binary_data) ? Val : Val - '0');
   }
 
   SmallString<128> SpellingBuffer;
diff --git a/clang/test/Preprocessor/embed_codegen.cpp b/clang/test/Preprocessor/embed_codegen.cpp
index 64110afc162d7..201bf300bc669 100644
--- a/clang/test/Preprocessor/embed_codegen.cpp
+++ b/clang/test/Preprocessor/embed_codegen.cpp
@@ -43,8 +43,9 @@ a
 };
 
 // CHECK: store i32 107, ptr %b, align 4
-int b =
+int b = (
 #embed<jk.txt>
+    )
 ;
 
 
diff --git a/clang/test/Preprocessor/embed_constexpr.cpp b/clang/test/Preprocessor/embed_constexpr.cpp
index 1cadff76b4890..a7857641a2e8d 100644
--- a/clang/test/Preprocessor/embed_constexpr.cpp
+++ b/clang/test/Preprocessor/embed_constexpr.cpp
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%S/Inputs -verify -Wno-c23-extensions
 // RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%S/Inputs -verify -fexperimental-new-constant-interpreter -Wno-c23-extensions
+// expected-no-diagnostics
 
 constexpr int value(int a, int b) {
   return a + b;
@@ -46,7 +47,7 @@ int array[
 static_assert(sizeof(array) / sizeof(int) == 'j');
 
 constexpr int comma_expr = (
-#embed <jk.txt> // expected-warning {{left operand of comma operator has no effect}}
+#embed <jk.txt>
 );
 static_assert(comma_expr == 'k');
 
diff --git a/clang/test/Preprocessor/embed_weird.cpp b/clang/test/Preprocessor/embed_weird.cpp
index 31b622c848d6a..cc73a88e5a657 100644
--- a/clang/test/Preprocessor/embed_weird.cpp
+++ b/clang/test/Preprocessor/embed_weird.cpp
@@ -27,7 +27,7 @@ _Static_assert(
 _Static_assert(sizeof(
 #embed <single_byte.txt>
 ) ==
-sizeof(unsigned char)
+sizeof(int)
 , ""
 );
 _Static_assert(sizeof
@@ -35,9 +35,9 @@ _Static_assert(sizeof
 , ""
 );
 _Static_assert(sizeof(
-#embed <jk.txt> // expected-warning {{left operand of comma operator has no effect}}
+#embed <jk.txt>
 ) ==
-sizeof(unsigned char)
+sizeof(int)
 , ""
 );
 
@@ -73,10 +73,10 @@ void do_stuff() {
 // Ensure that we don't accidentally allow you to initialize an unsigned char *
 // from embedded data; the data is modeled as a string literal internally, but
 // is not actually a string literal.
-const unsigned char *ptr =
+const unsigned char *ptr = (
 #embed <jk.txt> // expected-warning {{left operand of comma operator has no effect}}
-; // c-error@-2 {{incompatible integer to pointer conversion initializing 'const unsigned char *' with an expression of type 'unsigned char'}} \
-     cxx-error@-2 {{cannot initialize a variable of type 'const unsigned char *' with an rvalue of type 'unsigned char'}}
+    ); // c-error@-2 {{incompatible integer to pointer conversion initializing 'const unsigned char *' with an expression of type 'int'}} \
+     cxx-error@-2 {{cannot initialize a variable of type 'const unsigned char *' with an rvalue of type 'int'}}
 
 // However, there are some cases where this is fine and should work.
 const unsigned char *null_ptr_1 =
@@ -101,11 +101,10 @@ constexpr unsigned char ch =
 ;
 static_assert(ch == 0);
 
-void foobar(float x, char y, char z); // cxx-note {{candidate function not viable: requires 3 arguments, but 1 was provided}}
-                                      // c-note@-1 {{declared here}}
-void g1() { foobar((float) // cxx-error {{no matching function for call to 'foobar'}}
-#embed "numbers.txt" limit(3) // expected-warning {{left operand of comma operator has no effect}}
-); // c-error {{too few arguments to function call, expected 3, have 1}}
+void foobar(float x, char y, char z);
+void g1() { foobar((float)
+#embed "numbers.txt" limit(3)
+);
 }
 
 #if __cplusplus

github-actions · 2024-07-01T10:10:02Z

✅ With the latest revision this PR passed the C/C++ code formatter.

AaronBallman · 2024-07-01T14:06:32Z

CC @jakubjelinek for awareness

AaronBallman

In general, I like this! LGTM, though please give others a chance to review before landing.

clang/test/Preprocessor/embed_codegen.cpp

Fznamznon · 2024-07-08T07:42:07Z

It's been a week. Are we good to go? @cor3ntin ?

cor3ntin · 2024-07-08T13:13:31Z

clang/lib/Parse/ParseExpr.cpp

@@ -1018,6 +1018,7 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind,

    // primary-expression
  case tok::numeric_constant:


Can you adapt the couple other places where we parse a numeric constant?

Could you please elaborate which other places you meant? I briefly examined all mentions of tok::numeric_constant and ActOnNumericConstant in lib/Parse and modifying any of these doesn't seem to be reasonable for #embed support.

ping @cor3ntin

These cases

[[clang::availability( #embed "smth.txt" )]] void f(); struct S { virtual void f() = #embed "zero.bin" ; };

However, it's a bit ambiguous whether these should be supported at all in the proposed C++ wording for embed. I sent a mail to the C++ committee

It's probably fine to ignore that for now, the patch is a great improvement as it is

jakubjelinek · 2024-07-09T13:58:31Z

I'd like to point out that most of the testcases I wrote for the GCC implementation still fail with the latest clang on godbolt.

Fznamznon · 2024-07-09T14:41:09Z

I'd like to point out that most of the testcases I wrote for the GCC implementation still fail with the latest clang on godbolt.

This patch should help with at least some of them.

jakubjelinek · 2024-07-09T14:53:24Z

Maybe, but I've add quite a few new testcases too...

AaronBallman · 2024-07-09T19:04:34Z

I'd like to point out that most of the testcases I wrote for the GCC implementation still fail with the latest clang on godbolt.

This patch should help with at least some of them.

It'd be good to verify that it does help with them or see if there are other issues we're not yet tracking (but please do not copy the tests from the GCC implementation into Clang because of licensing).

AaronBallman · 2024-07-09T19:05:42Z

I'd like to point out that most of the testcases I wrote for the GCC implementation still fail with the latest clang on godbolt.

Thank you for letting us know! The plan is to continue to polish this feature through the Clang 19 release. The branch date for us is in two weeks, so the more we can get done before the branch, the better.

cor3ntin · 2024-07-11T08:58:34Z

clang/lib/Sema/SemaExpr.cpp

-    return ActOnIntegerConstant(Tok.getLocation(), Val-'0');
+    return ActOnIntegerConstant(
+        Tok.getLocation(),
+        (Tok.getKind() == tok::binary_data) ? Val : Val - '0');


can we move that to PP.getSpellingOfSingleCharacterNumericConstant?

cor3ntin · 2024-07-11T09:02:24Z

clang/lib/Parse/ParseExpr.cpp

@@ -1018,6 +1018,7 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind,

    // primary-expression
  case tok::numeric_constant:


These cases

[[clang::availability( #embed "smth.txt" )]] void f(); struct S { virtual void f() = #embed "zero.bin" ; };

However, it's a bit ambiguous whether these should be supported at all in the proposed C++ wording for embed. I sent a mail to the C++ committee

It's probably fine to ignore that for now, the patch is a great improvement as it is

Fznamznon · 2024-07-15T14:46:46Z

I'm looking through https://gcc.gnu.org/pipermail/gcc-patches/2024-June/655012.html
This patch fixes cases like:

const unsigned char w[] = {
#embed __FILE__ prefix([0] = 42, [15] =) limit(32)
};

And also cases like

void foo (int, int, int, int);
void bar (void) { foo (
#embed __FILE__ limit (4) prefix (172 + ) suffix (+ 2)
); }

as intended, the test case similar to latter is modified by the patch.
There are still problems with #embed "" __if_empty__ ((({{[0[0{0{0(0(0)1)1}1}]]}})))), has_embed, handling of prefix/suffix trying to #embed "", https://godbolt.org/z/PsoTr5f7K and etc. These I consider out of the scope of this patch and will post more details to #95222 .

jakubjelinek · 2024-07-15T14:54:04Z

I meant also e.g. basic tests like gcc/testsuite/c-c++-common/cpp/embed-20.c in
https://gcc.gnu.org/pipermail/gcc-patches/2024-July/657053.html
(and various others too). Just checked that one and it still fails on latest clang trunk using godbolt.

jakubjelinek · 2024-07-16T09:14:20Z

@ThePhD @AaronBallman
And even more importantly (checking on godbolt again):

int a = sizeof (
    #embed __FILE__ limit (1)
);

is 1 in clang as well as clang++ trunk and 4 with clang/clang++ trunk with -save-temps.
I thought there was agreement that at least for C the literals have type int, for C++ it is fuzzy and depends on what will be voted in but in any case, there shouldn't be different code produced between integrated preprocessing and -save-temps.
If C++ requires some cast, it would need to make it clear what that exact cast is, i.e. whether it is supposed to be

12,143,12,16

or

static_cast<unsigned char>(12),static_cast<unsigned char>(143),static_cast<unsigned char>(12),static_cast<unsigned char>(16)

or

(unsigned char)12,(unsigned char)143,(unsigned char)12,(unsigned char)16

or

'\014','\0217','\014','\020'

or whatever else and the preprocessor would need to emit that cast on every literal (unless using some extension like #embed "." gnu::base64("...") that the GCC patchset uses for the inner parts of the longer sequences.
I think the exact type of cast can affect parsing of some of the expressions, e.g.

extern long long p[];
auto foo () {
return
#embed __FILE__ limit (1)
[p];
}

will behave one way with the cast is (unsigned char)12 and differently if it is static_cast(12) or ((unsigned char)12).
Most of the other bugs I'm seeing are also about consistency, with -save-temps it works IMHO correctly while without it misbehaves. The behavior has to be the same.

Fznamznon · 2024-07-16T11:54:58Z

Just checked that one and it still fails on latest clang trunk using godbolt.

Hmm, ok. #99023 . Thanks for pointing this out.

I'm looking into other tests as well. My point was that this particular PR solves its task and needs to proceed.
Other problems likely need another patches.

AaronBallman · 2024-07-16T12:08:42Z

I thought there was agreement that at least for C the literals have type int,

That's correct; this patch gets started on addressing that issue by updating the way we expand the embedded contents but isn't intended to fix the type issue yet (that will be a follow-up patch). When we do the follow-up, we should ensure the embedded elements are of type int (for both C and C++; we don't want different types coming out of the preprocessor depending on language mode as that will make life harder for mixed C and C++ code bases).

AaronBallman

LGTM!

ThePhD · 2024-07-16T13:42:33Z

Just to be clear, I'm editing the C++ wording to also produce integer constant expressions that should be int typed, so both languages should be in sync.

Fznamznon · 2024-07-16T15:02:57Z

FYI #99050

…97274) Summary: Instead of playing "whack a mole" with places where #embed should be expanded as comma-separated list, just inject each byte as a token back into the stream, separated by commas. Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D59822402

Instead of playing "whack a mole" with places where #embed should be expanded as comma-separated list, just inject each byte as a token back into the stream, separated by commas.

bnbarham · 2024-08-06T02:51:59Z

clang/include/clang/Lex/Preprocessor.h

@@ -2123,17 +2123,18 @@ class Preprocessor {
  char
  getSpellingOfSingleCharacterNumericConstant(const Token &Tok,


Both the documentation and name of this function is incorrect now that the underlying functionality has changed - would you mind updating it to something more appropriate @Fznamznon? It used to return the character itself:

/// Given a Token \p Tok that is a numeric constant with length 1, /// return the character.

But now returns the value of the character.

I've updated the comment and the interface in 295e4f4

Thank you @AaronBallman

Thanks @AaronBallman! Would it be worth also changing Spelling to Value or something like that?

These changes were inspired by a post-commit review comment: #97274 (review)

The functionality of this function changed upstream in llvm/llvm-project#97274 and it now returns the value of the character, rather than the character itself.

These changes were inspired by a post-commit review comment: llvm#97274 (review)

This patch adds similar optimizations to the C++ FE as have been implemented earlier in the C FE. The libcpp hunk enables use of CPP_EMBED token even for C++, not just C; the preprocessor guarantees there is always a CPP_NUMBER CPP_COMMA before CPP_EMBED and CPP_COMMA CPP_NUMBER after it which simplifies parsing (unless #embed is more than 2GB, in that case it could be CPP_NUMBER CPP_COMMA CPP_EMBED CPP_COMMA CPP_EMBED CPP_COMMA CPP_EMBED CPP_COMMA CPP_NUMBER etc. with each CPP_EMBED covering at most INT_MAX bytes). Similarly to the C patch, this patch parses it into RAW_DATA_CST tree in the braced initializers (and from there peels into INTEGER_CSTs unless it is an initializer of an std::byte array or integral array with CHAR_BIT element precision), parses CPP_EMBED in cp_parser_expression into just the last INTEGER_CST in it because I think users don't need millions of -Wunused-value warnings because they did useless int a = ( #embed "megabyte.dat" ); and so most of the inner INTEGER_CSTs would be there just for the warning, and in the rest of contexts like template argument list, function argument list, attribute argument list, ...) parse it into a sequence of INTEGER_CSTs (I wrote a range/iterator classes to simplify that). My dumb cat embed-11.c constexpr unsigned char a[] = { #embed "cc1plus" }; const unsigned char *b = a; testcase where cc1plus is 492329008 bytes long when configured --enable-checking=yes,rtl,extra against recent binutils with .base64 gas support results in: time ./xg++ -B ./ -S -O2 embed-11.c real 0m4.350s user 0m2.427s sys 0m0.830s time ./xg++ -B ./ -c -O2 embed-11.c real 0m6.932s user 0m6.034s sys 0m0.888s (compared to running out of memory or very long compilation). On a shorter inclusion, cat embed-12.c constexpr unsigned char a[] = { #embed "xg++" }; const unsigned char *b = a; where xg++ is 15225904 bytes long, this takes using GCC with the #embed patchset except for this patch: time ~/src/gcc/obj36/gcc/xg++ -B ~/src/gcc/obj36/gcc/ -S -O2 embed-12.c real 0m33.190s user 0m32.327s sys 0m0.790s and with this patch: time ./xg++ -B ./ -S -O2 embed-12.c real 0m0.118s user 0m0.090s sys 0m0.028s The patch doesn't change anything on what the first patch in the series introduces even for C++, namely that #embed is expanded (actually or as if) into a sequence of literals like 127,69,76,70,2,1,1,3,0,0,0,0,0,0,0,0,2,0,62,0,1,0,0,0,80,211,64,0,0,0,0,0,64,0,0,0,0,0,0,0,8,253 and so each element has int type. That is how I believe it is in C23, and the different versions of the C++ P1967 paper specified there some casts, P1967R12 in particular "Otherwise, the integral constant expression is the value of std::fgetc’s return is cast to unsigned char." but please see llvm/llvm-project#97274 (comment) comment and whether we really want the preprocessor to preprocess it for C++ as (or as-if) static_cast<unsigned char>(127),static_cast<unsigned char>(69),static_cast<unsigned char>(76),static_cast<unsigned char>(70),static_cast<unsigned char>(2),... i.e. 9 tokens per byte rather than 2, or (unsigned char)127,(unsigned char)69,... or ((unsigned char)127),((unsigned char)69),... etc. Without a literal suffix for unsigned char constant literals it is horrible, plus the incompatibility between C and C++. Sure, we could use the magic form more often for C++ to save the size and do the 9 or how many tokens form only for the boundary constants and use #embed "." __gnu__::__base64__("...") for what is in between if there are at least 2 tokens inside of it. E.g. (unsigned char)127 vs. static_cast<unsigned char>(127) behaves differently if there is constexpr long long p[] = { ... }; ... #embed __FILE__ [p] 2024-12-06 Jakub Jelinek <jakub@redhat.com> libcpp/ * files.cc (finish_embed): Use CPP_EMBED even for C++. gcc/ * tree.h (RAW_DATA_UCHAR_ELT, RAW_DATA_SCHAR_ELT): Define. gcc/cp/ChangeLog: * cp-tree.h (class raw_data_iterator): New type. (class raw_data_range): New type. * parser.cc (cp_parser_postfix_open_square_expression): Handle parsing of CPP_EMBED. (cp_parser_parenthesized_expression_list): Likewise. Use cp_lexer_next_token_is. (cp_parser_expression): Handle parsing of CPP_EMBED. (cp_parser_template_argument_list): Likewise. (cp_parser_initializer_list): Likewise. (cp_parser_oacc_clause_tile): Likewise. (cp_parser_omp_tile_sizes): Likewise. * pt.cc (tsubst_expr): Handle RAW_DATA_CST. * constexpr.cc (reduced_constant_expression_p): Likewise. (raw_data_cst_elt): New function. (find_array_ctor_elt): Handle RAW_DATA_CST. (cxx_eval_array_reference): Likewise. * typeck2.cc (digest_init_r): Emit -Wnarrowing and/or -Wconversion diagnostics. (process_init_constructor_array): Handle RAW_DATA_CST. * decl.cc (maybe_deduce_size_from_array_init): Likewise. (is_direct_enum_init): Fail for RAW_DATA_CST. (cp_maybe_split_raw_data): New function. (consume_init): New function. (reshape_init_array_1): Add VECTOR_P argument. Handle RAW_DATA_CST. (reshape_init_array): Adjust reshape_init_array_1 caller. (reshape_init_vector): Likewise. (reshape_init_class): Handle RAW_DATA_CST. (reshape_init_r): Likewise. gcc/testsuite/ * c-c++-common/cpp/embed-22.c: New test. * c-c++-common/cpp/embed-23.c: New test. * g++.dg/cpp/embed-4.C: New test. * g++.dg/cpp/embed-5.C: New test. * g++.dg/cpp/embed-6.C: New test. * g++.dg/cpp/embed-7.C: New test. * g++.dg/cpp/embed-8.C: New test. * g++.dg/cpp/embed-9.C: New test. * g++.dg/cpp/embed-10.C: New test. * g++.dg/cpp/embed-11.C: New test. * g++.dg/cpp/embed-12.C: New test. * g++.dg/cpp/embed-13.C: New test. * g++.dg/cpp/embed-14.C: New test.

[clang] Inject tokens containing #embed back into token stream

4d5008f

Instead of playing "whack a mole" with places where #embed should be expanded as comma-separated list, just inject each byte as a token back into the stream, separated by commas.

Fznamznon requested review from cor3ntin and AaronBallman July 1, 2024 10:06

llvmbot added clang Clang issues not falling into any other category clang:frontend Language frontend issues, e.g. anything involving "Sema" labels Jul 1, 2024

Fznamznon mentioned this pull request Jul 1, 2024

[clang] Follow-up on #embed implementation #95222

Open

8 tasks

AaronBallman approved these changes Jul 1, 2024

View reviewed changes

clang/test/Preprocessor/embed_codegen.cpp Show resolved Hide resolved

Merge branch 'main' into inject-embed-tokens

d2406e0

Fix format

f34179d

cor3ntin reviewed Jul 8, 2024

View reviewed changes

cor3ntin reviewed Jul 11, 2024

View reviewed changes

Fznamznon added 2 commits July 12, 2024 06:25

Merge branch 'main' into inject-embed-tokens

1603695

Move calculation to getSpellingOfSingleCharacterNumericConstant

e9c0253

Fznamznon requested a review from cor3ntin July 15, 2024 14:46

AaronBallman approved these changes Jul 16, 2024

View reviewed changes

Fznamznon merged commit dba2e66 into llvm:main Jul 16, 2024
7 checks passed

cor3ntin mentioned this pull request Jul 16, 2024

[clang] Fix underlying type of EmbedExpr #99050

Merged

bnbarham reviewed Aug 6, 2024

View reviewed changes

bnbarham mentioned this pull request Aug 6, 2024

[ClangImporter] Correct check to ignore macro swiftlang/swift#75706

Merged

AaronBallman added a commit that referenced this pull request Aug 6, 2024

Correct a comment and update a return type; NFC

295e4f4

These changes were inspired by a post-commit review comment: #97274 (review)

banach-space pushed a commit to banach-space/llvm-project that referenced this pull request Aug 7, 2024

Correct a comment and update a return type; NFC

ff99546

These changes were inspired by a post-commit review comment: llvm#97274 (review)

kstoimenov pushed a commit to kstoimenov/llvm-project that referenced this pull request Aug 15, 2024

Correct a comment and update a return type; NFC

d2ffb17

These changes were inspired by a post-commit review comment: llvm#97274 (review)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[clang] Inject tokens containing #embed back into token stream #97274

[clang] Inject tokens containing #embed back into token stream #97274

Fznamznon commented Jul 1, 2024

llvmbot commented Jul 1, 2024

github-actions bot commented Jul 1, 2024 •

edited

Loading

AaronBallman commented Jul 1, 2024

AaronBallman left a comment

Fznamznon commented Jul 8, 2024 •

edited

Loading

cor3ntin Jul 8, 2024

Fznamznon Jul 8, 2024

Fznamznon Jul 10, 2024

cor3ntin Jul 11, 2024

jakubjelinek commented Jul 9, 2024

Fznamznon commented Jul 9, 2024

jakubjelinek commented Jul 9, 2024

AaronBallman commented Jul 9, 2024

AaronBallman commented Jul 9, 2024

cor3ntin Jul 11, 2024

Fznamznon Jul 12, 2024

cor3ntin Jul 11, 2024

Fznamznon commented Jul 15, 2024

jakubjelinek commented Jul 15, 2024

jakubjelinek commented Jul 16, 2024

Fznamznon commented Jul 16, 2024 •

edited

Loading

AaronBallman commented Jul 16, 2024

AaronBallman left a comment

ThePhD commented Jul 16, 2024

Fznamznon commented Jul 16, 2024

bnbarham Aug 6, 2024

AaronBallman Aug 6, 2024

Fznamznon Aug 6, 2024

bnbarham Aug 6, 2024 •

edited

Loading

		@@ -1018,6 +1018,7 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind,

		// primary-expression
		case tok::numeric_constant:

		@@ -2123,17 +2123,18 @@ class Preprocessor {
		char
		getSpellingOfSingleCharacterNumericConstant(const Token &Tok,

[clang] Inject tokens containing #embed back into token stream #97274

[clang] Inject tokens containing #embed back into token stream #97274

Conversation

Fznamznon commented Jul 1, 2024

llvmbot commented Jul 1, 2024

github-actions bot commented Jul 1, 2024 • edited Loading

AaronBallman commented Jul 1, 2024

AaronBallman left a comment

Choose a reason for hiding this comment

Fznamznon commented Jul 8, 2024 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

jakubjelinek commented Jul 9, 2024

Fznamznon commented Jul 9, 2024

jakubjelinek commented Jul 9, 2024

AaronBallman commented Jul 9, 2024

AaronBallman commented Jul 9, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Fznamznon commented Jul 15, 2024

jakubjelinek commented Jul 15, 2024

jakubjelinek commented Jul 16, 2024

Fznamznon commented Jul 16, 2024 • edited Loading

AaronBallman commented Jul 16, 2024

AaronBallman left a comment

Choose a reason for hiding this comment

ThePhD commented Jul 16, 2024

Fznamznon commented Jul 16, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

bnbarham Aug 6, 2024 • edited Loading

Choose a reason for hiding this comment

github-actions bot commented Jul 1, 2024 •

edited

Loading

Fznamznon commented Jul 8, 2024 •

edited

Loading

Fznamznon commented Jul 16, 2024 •

edited

Loading

bnbarham Aug 6, 2024 •

edited

Loading