-
Notifications
You must be signed in to change notification settings - Fork 12.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[clang] Inject tokens containing #embed back into token stream #97274
Conversation
Instead of playing "whack a mole" with places where #embed should be expanded as comma-separated list, just inject each byte as a token back into the stream, separated by commas.
@llvm/pr-subscribers-clang Author: Mariya Podchishchaeva (Fznamznon) ChangesInstead of playing "whack a mole" with places where #embed should be expanded as comma-separated list, just inject each byte as a token back into the stream, separated by commas. Full diff: https://github.com/llvm/llvm-project/pull/97274.diff 10 Files Affected:
diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def
index 37d570ca5e75b..1bc9c59576f33 100644
--- a/clang/include/clang/Basic/TokenKinds.def
+++ b/clang/include/clang/Basic/TokenKinds.def
@@ -165,6 +165,9 @@ TOK(raw_identifier) // Used only in raw lexing mode.
// C99 6.4.4.2: Floating Constants
TOK(numeric_constant) // 0x123
+// Directly holds numerical value. Used to process C23 #embed.
+TOK(binary_data)
+
// C99 6.4.4: Character Constants
TOK(char_constant) // 'a'
TOK(wide_char_constant) // L'b'
diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h
index e5183a27d2bc5..1b133dde89587 100644
--- a/clang/include/clang/Basic/TokenKinds.h
+++ b/clang/include/clang/Basic/TokenKinds.h
@@ -98,7 +98,7 @@ inline bool isLiteral(TokenKind K) {
return K == tok::numeric_constant || K == tok::char_constant ||
K == tok::wide_char_constant || K == tok::utf8_char_constant ||
K == tok::utf16_char_constant || K == tok::utf32_char_constant ||
- isStringLiteral(K) || K == tok::header_name;
+ isStringLiteral(K) || K == tok::header_name || K == tok::binary_data;
}
/// Return true if this is any of tok::annot_* kinds.
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index be3334b980746..8e30756da2a01 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -2123,8 +2123,9 @@ class Preprocessor {
char
getSpellingOfSingleCharacterNumericConstant(const Token &Tok,
bool *Invalid = nullptr) const {
- assert(Tok.is(tok::numeric_constant) &&
- Tok.getLength() == 1 && "Called on unsupported token");
+ assert((Tok.is(tok::numeric_constant) || Tok.is(tok::binary_data)) &&
+ Tok.getLength() == 1 &&
+ "Called on unsupported token");
assert(!Tok.needsCleaning() && "Token can't need cleaning with length 1");
// If the token is carrying a literal data pointer, just use it.
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 6880fa4bb0b03..7bc2280764c5b 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -2123,7 +2123,7 @@ class Parser : public CodeCompletionHandler {
};
ExprResult ParseInitializerWithPotentialDesignator(DesignatorCompletionInfo);
ExprResult createEmbedExpr();
- void ExpandEmbedDirective(SmallVectorImpl<Expr *> &Exprs);
+ void injectEmbedTokens();
//===--------------------------------------------------------------------===//
// clang Expressions
@@ -3830,7 +3830,6 @@ class Parser : public CodeCompletionHandler {
AnnotateTemplateIdTokenAsType(CXXScopeSpec &SS,
ImplicitTypenameContext AllowImplicitTypename,
bool IsClassName = false);
- void ExpandEmbedIntoTemplateArgList(TemplateArgList &TemplateArgs);
bool ParseTemplateArgumentList(TemplateArgList &TemplateArgs,
TemplateTy Template, SourceLocation OpenLoc);
ParsedTemplateArgument ParseTemplateTemplateArgument();
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index 9fc3cd73f73a0..a3b800a35b55e 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -1018,6 +1018,7 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind,
// primary-expression
case tok::numeric_constant:
+ case tok::binary_data:
// constant: integer-constant
// constant: floating-constant
@@ -1067,18 +1068,9 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind,
}
case tok::annot_embed: {
- // We've met #embed in a context where a single value is expected. Take last
- // element from #embed data as if it were a comma expression.
- EmbedAnnotationData *Data =
- reinterpret_cast<EmbedAnnotationData *>(Tok.getAnnotationValue());
- SourceLocation StartLoc = ConsumeAnnotationToken();
- ASTContext &Context = Actions.getASTContext();
- Res = IntegerLiteral::Create(Context,
- llvm::APInt(CHAR_BIT, Data->BinaryData.back()),
- Context.UnsignedCharTy, StartLoc);
- if (Data->BinaryData.size() > 1)
- Diag(StartLoc, diag::warn_unused_comma_left_operand);
- break;
+ injectEmbedTokens();
+ return ParseCastExpression(ParseKind, isAddressOfOperand, isTypeCast,
+ isVectorLiteral, NotPrimaryExpression);
}
case tok::kw___super:
@@ -3578,15 +3570,29 @@ ExprResult Parser::ParseFoldExpression(ExprResult LHS,
T.getCloseLocation());
}
-void Parser::ExpandEmbedDirective(SmallVectorImpl<Expr *> &Exprs) {
+void Parser::injectEmbedTokens() {
EmbedAnnotationData *Data =
reinterpret_cast<EmbedAnnotationData *>(Tok.getAnnotationValue());
- SourceLocation StartLoc = ConsumeAnnotationToken();
- ASTContext &Context = Actions.getASTContext();
- for (auto Byte : Data->BinaryData) {
- Exprs.push_back(IntegerLiteral::Create(Context, llvm::APInt(CHAR_BIT, Byte),
- Context.UnsignedCharTy, StartLoc));
+ MutableArrayRef<Token> Toks(
+ PP.getPreprocessorAllocator().Allocate<Token>(Data->BinaryData.size() * 2 - 1),
+ Data->BinaryData.size() * 2 - 1);
+ unsigned I = 0;
+ for (auto &Byte : Data->BinaryData) {
+ Toks[I].startToken();
+ Toks[I].setKind(tok::binary_data);
+ Toks[I].setLocation(Tok.getLocation());
+ Toks[I].setLength(1);
+ Toks[I].setLiteralData(&Byte);
+ if (I != ((Data->BinaryData.size() - 1) * 2)) {
+ Toks[I + 1].startToken();
+ Toks[I + 1].setKind(tok::comma);
+ Toks[I + 1].setLocation(Tok.getLocation());
+ }
+ I += 2;
}
+ PP.EnterTokenStream(std::move(Toks), /*DisableMacroExpansion=*/true,
+ /*IsReinject=*/false);
+ ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true);
}
/// ParseExpressionList - Used for C/C++ (argument-)expression-list.
@@ -3624,17 +3630,8 @@ bool Parser::ParseExpressionList(SmallVectorImpl<Expr *> &Exprs,
if (getLangOpts().CPlusPlus11 && Tok.is(tok::l_brace)) {
Diag(Tok, diag::warn_cxx98_compat_generalized_initializer_lists);
Expr = ParseBraceInitializer();
- } else if (Tok.is(tok::annot_embed)) {
- ExpandEmbedDirective(Exprs);
- if (Tok.isNot(tok::comma))
- break;
- Token Comma = Tok;
- ConsumeToken();
- checkPotentialAngleBracketDelimiter(Comma);
- continue;
- } else {
+ } else
Expr = ParseAssignmentExpression();
- }
if (EarlyTypoCorrection)
Expr = Actions.CorrectDelayedTyposInExpr(Expr);
diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp
index 7e30afa2c64a4..a5130f56600e5 100644
--- a/clang/lib/Parse/ParseTemplate.cpp
+++ b/clang/lib/Parse/ParseTemplate.cpp
@@ -1523,19 +1523,6 @@ ParsedTemplateArgument Parser::ParseTemplateArgument() {
ExprArg.get(), Loc);
}
-void Parser::ExpandEmbedIntoTemplateArgList(TemplateArgList &TemplateArgs) {
- EmbedAnnotationData *Data =
- reinterpret_cast<EmbedAnnotationData *>(Tok.getAnnotationValue());
- SourceLocation StartLoc = ConsumeAnnotationToken();
- ASTContext &Context = Actions.getASTContext();
- for (auto Byte : Data->BinaryData) {
- Expr *E = IntegerLiteral::Create(Context, llvm::APInt(CHAR_BIT, Byte),
- Context.UnsignedCharTy, StartLoc);
- TemplateArgs.push_back(
- ParsedTemplateArgument(ParsedTemplateArgument::NonType, E, StartLoc));
- }
-}
-
/// ParseTemplateArgumentList - Parse a C++ template-argument-list
/// (C++ [temp.names]). Returns true if there was an error.
///
@@ -1560,24 +1547,20 @@ bool Parser::ParseTemplateArgumentList(TemplateArgList &TemplateArgs,
do {
PreferredType.enterFunctionArgument(Tok.getLocation(), RunSignatureHelp);
- if (Tok.is(tok::annot_embed)) {
- ExpandEmbedIntoTemplateArgList(TemplateArgs);
- } else {
- ParsedTemplateArgument Arg = ParseTemplateArgument();
- SourceLocation EllipsisLoc;
- if (TryConsumeToken(tok::ellipsis, EllipsisLoc))
- Arg = Actions.ActOnPackExpansion(Arg, EllipsisLoc);
-
- if (Arg.isInvalid()) {
- if (PP.isCodeCompletionReached() && !CalledSignatureHelp)
- RunSignatureHelp();
- return true;
- }
-
- // Save this template argument.
- TemplateArgs.push_back(Arg);
+ ParsedTemplateArgument Arg = ParseTemplateArgument();
+ SourceLocation EllipsisLoc;
+ if (TryConsumeToken(tok::ellipsis, EllipsisLoc))
+ Arg = Actions.ActOnPackExpansion(Arg, EllipsisLoc);
+
+ if (Arg.isInvalid()) {
+ if (PP.isCodeCompletionReached() && !CalledSignatureHelp)
+ RunSignatureHelp();
+ return true;
}
+ // Save this template argument.
+ TemplateArgs.push_back(Arg);
+
// If the next token is a comma, consume it and keep reading
// arguments.
} while (TryConsumeToken(tok::comma));
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index db44cfe1288b6..c5657b2389cd2 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -3722,9 +3722,11 @@ bool Sema::CheckLoopHintExpr(Expr *E, SourceLocation Loc, bool AllowZero) {
ExprResult Sema::ActOnNumericConstant(const Token &Tok, Scope *UDLScope) {
// Fast path for a single digit (which is quite common). A single digit
// cannot have a trigraph, escaped newline, radix prefix, or suffix.
- if (Tok.getLength() == 1) {
+ if (Tok.getLength() == 1 || Tok.getKind() == tok::binary_data) {
const char Val = PP.getSpellingOfSingleCharacterNumericConstant(Tok);
- return ActOnIntegerConstant(Tok.getLocation(), Val-'0');
+ return ActOnIntegerConstant(
+ Tok.getLocation(),
+ (Tok.getKind() == tok::binary_data) ? Val : Val - '0');
}
SmallString<128> SpellingBuffer;
diff --git a/clang/test/Preprocessor/embed_codegen.cpp b/clang/test/Preprocessor/embed_codegen.cpp
index 64110afc162d7..201bf300bc669 100644
--- a/clang/test/Preprocessor/embed_codegen.cpp
+++ b/clang/test/Preprocessor/embed_codegen.cpp
@@ -43,8 +43,9 @@ a
};
// CHECK: store i32 107, ptr %b, align 4
-int b =
+int b = (
#embed<jk.txt>
+ )
;
diff --git a/clang/test/Preprocessor/embed_constexpr.cpp b/clang/test/Preprocessor/embed_constexpr.cpp
index 1cadff76b4890..a7857641a2e8d 100644
--- a/clang/test/Preprocessor/embed_constexpr.cpp
+++ b/clang/test/Preprocessor/embed_constexpr.cpp
@@ -1,5 +1,6 @@
// RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%S/Inputs -verify -Wno-c23-extensions
// RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%S/Inputs -verify -fexperimental-new-constant-interpreter -Wno-c23-extensions
+// expected-no-diagnostics
constexpr int value(int a, int b) {
return a + b;
@@ -46,7 +47,7 @@ int array[
static_assert(sizeof(array) / sizeof(int) == 'j');
constexpr int comma_expr = (
-#embed <jk.txt> // expected-warning {{left operand of comma operator has no effect}}
+#embed <jk.txt>
);
static_assert(comma_expr == 'k');
diff --git a/clang/test/Preprocessor/embed_weird.cpp b/clang/test/Preprocessor/embed_weird.cpp
index 31b622c848d6a..cc73a88e5a657 100644
--- a/clang/test/Preprocessor/embed_weird.cpp
+++ b/clang/test/Preprocessor/embed_weird.cpp
@@ -27,7 +27,7 @@ _Static_assert(
_Static_assert(sizeof(
#embed <single_byte.txt>
) ==
-sizeof(unsigned char)
+sizeof(int)
, ""
);
_Static_assert(sizeof
@@ -35,9 +35,9 @@ _Static_assert(sizeof
, ""
);
_Static_assert(sizeof(
-#embed <jk.txt> // expected-warning {{left operand of comma operator has no effect}}
+#embed <jk.txt>
) ==
-sizeof(unsigned char)
+sizeof(int)
, ""
);
@@ -73,10 +73,10 @@ void do_stuff() {
// Ensure that we don't accidentally allow you to initialize an unsigned char *
// from embedded data; the data is modeled as a string literal internally, but
// is not actually a string literal.
-const unsigned char *ptr =
+const unsigned char *ptr = (
#embed <jk.txt> // expected-warning {{left operand of comma operator has no effect}}
-; // c-error@-2 {{incompatible integer to pointer conversion initializing 'const unsigned char *' with an expression of type 'unsigned char'}} \
- cxx-error@-2 {{cannot initialize a variable of type 'const unsigned char *' with an rvalue of type 'unsigned char'}}
+ ); // c-error@-2 {{incompatible integer to pointer conversion initializing 'const unsigned char *' with an expression of type 'int'}} \
+ cxx-error@-2 {{cannot initialize a variable of type 'const unsigned char *' with an rvalue of type 'int'}}
// However, there are some cases where this is fine and should work.
const unsigned char *null_ptr_1 =
@@ -101,11 +101,10 @@ constexpr unsigned char ch =
;
static_assert(ch == 0);
-void foobar(float x, char y, char z); // cxx-note {{candidate function not viable: requires 3 arguments, but 1 was provided}}
- // c-note@-1 {{declared here}}
-void g1() { foobar((float) // cxx-error {{no matching function for call to 'foobar'}}
-#embed "numbers.txt" limit(3) // expected-warning {{left operand of comma operator has no effect}}
-); // c-error {{too few arguments to function call, expected 3, have 1}}
+void foobar(float x, char y, char z);
+void g1() { foobar((float)
+#embed "numbers.txt" limit(3)
+);
}
#if __cplusplus
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
CC @jakubjelinek for awareness |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In general, I like this! LGTM, though please give others a chance to review before landing.
It's been a week. Are we good to go? @cor3ntin ? |
@@ -1018,6 +1018,7 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, | |||
|
|||
// primary-expression | |||
case tok::numeric_constant: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you adapt the couple other places where we parse a numeric constant?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you please elaborate which other places you meant? I briefly examined all mentions of tok::numeric_constant
and ActOnNumericConstant
in lib/Parse
and modifying any of these doesn't seem to be reasonable for #embed
support.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ping @cor3ntin
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These cases
[[clang::availability(
#embed "smth.txt"
)]] void f();
struct S {
virtual void f() =
#embed "zero.bin"
;
};
However, it's a bit ambiguous whether these should be supported at all in the proposed C++ wording for embed. I sent a mail to the C++ committee
It's probably fine to ignore that for now, the patch is a great improvement as it is
I'd like to point out that most of the testcases I wrote for the GCC implementation still fail with the latest clang on godbolt. |
This patch should help with at least some of them. |
Maybe, but I've add quite a few new testcases too... |
It'd be good to verify that it does help with them or see if there are other issues we're not yet tracking (but please do not copy the tests from the GCC implementation into Clang because of licensing). |
Thank you for letting us know! The plan is to continue to polish this feature through the Clang 19 release. The branch date for us is in two weeks, so the more we can get done before the branch, the better. |
clang/lib/Sema/SemaExpr.cpp
Outdated
return ActOnIntegerConstant(Tok.getLocation(), Val-'0'); | ||
return ActOnIntegerConstant( | ||
Tok.getLocation(), | ||
(Tok.getKind() == tok::binary_data) ? Val : Val - '0'); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we move that to PP.getSpellingOfSingleCharacterNumericConstant?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure.
@@ -1018,6 +1018,7 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, | |||
|
|||
// primary-expression | |||
case tok::numeric_constant: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These cases
[[clang::availability(
#embed "smth.txt"
)]] void f();
struct S {
virtual void f() =
#embed "zero.bin"
;
};
However, it's a bit ambiguous whether these should be supported at all in the proposed C++ wording for embed. I sent a mail to the C++ committee
It's probably fine to ignore that for now, the patch is a great improvement as it is
I'm looking through https://gcc.gnu.org/pipermail/gcc-patches/2024-June/655012.html
And also cases like
as intended, the test case similar to latter is modified by the patch. |
I meant also e.g. basic tests like gcc/testsuite/c-c++-common/cpp/embed-20.c in |
@ThePhD @AaronBallman int a = sizeof (
#embed __FILE__ limit (1)
); is 1 in clang as well as clang++ trunk and 4 with clang/clang++ trunk with -save-temps. 12,143,12,16 or static_cast<unsigned char>(12),static_cast<unsigned char>(143),static_cast<unsigned char>(12),static_cast<unsigned char>(16) or (unsigned char)12,(unsigned char)143,(unsigned char)12,(unsigned char)16 or '\014','\0217','\014','\020' or whatever else and the preprocessor would need to emit that cast on every literal (unless using some extension like #embed "." gnu::base64("...") that the GCC patchset uses for the inner parts of the longer sequences. extern long long p[];
auto foo () {
return
#embed __FILE__ limit (1)
[p];
} will behave one way with the cast is (unsigned char)12 and differently if it is static_cast(12) or ((unsigned char)12). |
Hmm, ok. #99023 . Thanks for pointing this out. I'm looking into other tests as well. My point was that this particular PR solves its task and needs to proceed. |
That's correct; this patch gets started on addressing that issue by updating the way we expand the embedded contents but isn't intended to fix the type issue yet (that will be a follow-up patch). When we do the follow-up, we should ensure the embedded elements are of type |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM!
Just to be clear, I'm editing the C++ wording to also produce integer constant expressions that should be |
FYI #99050 |
…97274) Summary: Instead of playing "whack a mole" with places where #embed should be expanded as comma-separated list, just inject each byte as a token back into the stream, separated by commas. Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D59822402
Instead of playing "whack a mole" with places where #embed should be expanded as comma-separated list, just inject each byte as a token back into the stream, separated by commas.
@@ -2123,17 +2123,18 @@ class Preprocessor { | |||
char | |||
getSpellingOfSingleCharacterNumericConstant(const Token &Tok, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Both the documentation and name of this function is incorrect now that the underlying functionality has changed - would you mind updating it to something more appropriate @Fznamznon? It used to return the character itself:
/// Given a Token \p Tok that is a numeric constant with length 1,
/// return the character.
But now returns the value of the character.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've updated the comment and the interface in 295e4f4
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you @AaronBallman
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks @AaronBallman! Would it be worth also changing Spelling
to Value
or something like that?
These changes were inspired by a post-commit review comment: #97274 (review)
The functionality of this function changed upstream in llvm/llvm-project#97274 and it now returns the value of the character, rather than the character itself.
These changes were inspired by a post-commit review comment: llvm#97274 (review)
These changes were inspired by a post-commit review comment: llvm#97274 (review)
This patch adds similar optimizations to the C++ FE as have been implemented earlier in the C FE. The libcpp hunk enables use of CPP_EMBED token even for C++, not just C; the preprocessor guarantees there is always a CPP_NUMBER CPP_COMMA before CPP_EMBED and CPP_COMMA CPP_NUMBER after it which simplifies parsing (unless #embed is more than 2GB, in that case it could be CPP_NUMBER CPP_COMMA CPP_EMBED CPP_COMMA CPP_EMBED CPP_COMMA CPP_EMBED CPP_COMMA CPP_NUMBER etc. with each CPP_EMBED covering at most INT_MAX bytes). Similarly to the C patch, this patch parses it into RAW_DATA_CST tree in the braced initializers (and from there peels into INTEGER_CSTs unless it is an initializer of an std::byte array or integral array with CHAR_BIT element precision), parses CPP_EMBED in cp_parser_expression into just the last INTEGER_CST in it because I think users don't need millions of -Wunused-value warnings because they did useless int a = ( #embed "megabyte.dat" ); and so most of the inner INTEGER_CSTs would be there just for the warning, and in the rest of contexts like template argument list, function argument list, attribute argument list, ...) parse it into a sequence of INTEGER_CSTs (I wrote a range/iterator classes to simplify that). My dumb cat embed-11.c constexpr unsigned char a[] = { #embed "cc1plus" }; const unsigned char *b = a; testcase where cc1plus is 492329008 bytes long when configured --enable-checking=yes,rtl,extra against recent binutils with .base64 gas support results in: time ./xg++ -B ./ -S -O2 embed-11.c real 0m4.350s user 0m2.427s sys 0m0.830s time ./xg++ -B ./ -c -O2 embed-11.c real 0m6.932s user 0m6.034s sys 0m0.888s (compared to running out of memory or very long compilation). On a shorter inclusion, cat embed-12.c constexpr unsigned char a[] = { #embed "xg++" }; const unsigned char *b = a; where xg++ is 15225904 bytes long, this takes using GCC with the #embed patchset except for this patch: time ~/src/gcc/obj36/gcc/xg++ -B ~/src/gcc/obj36/gcc/ -S -O2 embed-12.c real 0m33.190s user 0m32.327s sys 0m0.790s and with this patch: time ./xg++ -B ./ -S -O2 embed-12.c real 0m0.118s user 0m0.090s sys 0m0.028s The patch doesn't change anything on what the first patch in the series introduces even for C++, namely that #embed is expanded (actually or as if) into a sequence of literals like 127,69,76,70,2,1,1,3,0,0,0,0,0,0,0,0,2,0,62,0,1,0,0,0,80,211,64,0,0,0,0,0,64,0,0,0,0,0,0,0,8,253 and so each element has int type. That is how I believe it is in C23, and the different versions of the C++ P1967 paper specified there some casts, P1967R12 in particular "Otherwise, the integral constant expression is the value of std::fgetc’s return is cast to unsigned char." but please see llvm/llvm-project#97274 (comment) comment and whether we really want the preprocessor to preprocess it for C++ as (or as-if) static_cast<unsigned char>(127),static_cast<unsigned char>(69),static_cast<unsigned char>(76),static_cast<unsigned char>(70),static_cast<unsigned char>(2),... i.e. 9 tokens per byte rather than 2, or (unsigned char)127,(unsigned char)69,... or ((unsigned char)127),((unsigned char)69),... etc. Without a literal suffix for unsigned char constant literals it is horrible, plus the incompatibility between C and C++. Sure, we could use the magic form more often for C++ to save the size and do the 9 or how many tokens form only for the boundary constants and use #embed "." __gnu__::__base64__("...") for what is in between if there are at least 2 tokens inside of it. E.g. (unsigned char)127 vs. static_cast<unsigned char>(127) behaves differently if there is constexpr long long p[] = { ... }; ... #embed __FILE__ [p] 2024-12-06 Jakub Jelinek <jakub@redhat.com> libcpp/ * files.cc (finish_embed): Use CPP_EMBED even for C++. gcc/ * tree.h (RAW_DATA_UCHAR_ELT, RAW_DATA_SCHAR_ELT): Define. gcc/cp/ChangeLog: * cp-tree.h (class raw_data_iterator): New type. (class raw_data_range): New type. * parser.cc (cp_parser_postfix_open_square_expression): Handle parsing of CPP_EMBED. (cp_parser_parenthesized_expression_list): Likewise. Use cp_lexer_next_token_is. (cp_parser_expression): Handle parsing of CPP_EMBED. (cp_parser_template_argument_list): Likewise. (cp_parser_initializer_list): Likewise. (cp_parser_oacc_clause_tile): Likewise. (cp_parser_omp_tile_sizes): Likewise. * pt.cc (tsubst_expr): Handle RAW_DATA_CST. * constexpr.cc (reduced_constant_expression_p): Likewise. (raw_data_cst_elt): New function. (find_array_ctor_elt): Handle RAW_DATA_CST. (cxx_eval_array_reference): Likewise. * typeck2.cc (digest_init_r): Emit -Wnarrowing and/or -Wconversion diagnostics. (process_init_constructor_array): Handle RAW_DATA_CST. * decl.cc (maybe_deduce_size_from_array_init): Likewise. (is_direct_enum_init): Fail for RAW_DATA_CST. (cp_maybe_split_raw_data): New function. (consume_init): New function. (reshape_init_array_1): Add VECTOR_P argument. Handle RAW_DATA_CST. (reshape_init_array): Adjust reshape_init_array_1 caller. (reshape_init_vector): Likewise. (reshape_init_class): Handle RAW_DATA_CST. (reshape_init_r): Likewise. gcc/testsuite/ * c-c++-common/cpp/embed-22.c: New test. * c-c++-common/cpp/embed-23.c: New test. * g++.dg/cpp/embed-4.C: New test. * g++.dg/cpp/embed-5.C: New test. * g++.dg/cpp/embed-6.C: New test. * g++.dg/cpp/embed-7.C: New test. * g++.dg/cpp/embed-8.C: New test. * g++.dg/cpp/embed-9.C: New test. * g++.dg/cpp/embed-10.C: New test. * g++.dg/cpp/embed-11.C: New test. * g++.dg/cpp/embed-12.C: New test. * g++.dg/cpp/embed-13.C: New test. * g++.dg/cpp/embed-14.C: New test.
Instead of playing "whack a mole" with places where #embed should be expanded as comma-separated list, just inject each byte as a token back into the stream, separated by commas.