Skip to content

Commit 2b8c692

Browse files
committed
[Windows] Convert from UTF-8 to UTF-16 when writing to a Windows console
Summary: Calling WriteConsoleW is the most reliable way to print Unicode characters to a Windows console. If binary data gets printed to the console, attempting to re-encode it shouldn't be a problem, since garbage in can produce garbage out. This breaks printing strings in the local codepage, which WriteConsoleA knows how to handle. For example, this can happen when user source code is encoded with the local codepage, and an LLVM tool quotes it while emitting a caret diagnostic. This is unfortunate, but well-behaved tools should validate that their input is UTF-8 and escape non-UTF-8 characters before sending them to raw_fd_ostream. Clang already does this, but not all LLVM tools do this. One drawback to the current implementation is that printing a string a byte at a time doesn't work. Consider this LLVM code: for (char C : MyStr) outs() << C; Because outs() is now unbuffered, we will try to convert each byte to UTF-16, which will fail. However, this already didn't work, so I think we may as well update callers that do that as we find them to print complete portions of strings. You can see a real example of this in my patch to SourceMgr.cpp. Fixes PR38669 and PR36267. Reviewers: zturner, efriedma Subscribers: llvm-commits, hiraditya Differential Revision: https://reviews.llvm.org/D51558 llvm-svn: 341433
1 parent 2768b52 commit 2b8c692

File tree

4 files changed

+92
-27
lines changed

4 files changed

+92
-27
lines changed

llvm/include/llvm/Support/raw_ostream.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -367,12 +367,16 @@ class raw_fd_ostream : public raw_pwrite_stream {
367367
int FD;
368368
bool ShouldClose;
369369

370+
bool SupportsSeeking;
371+
372+
/// True if this fd refers to a Windows console device. Mintty and other
373+
/// terminal emulators are TTYs, but they are not consoles.
374+
bool IsWindowsConsole = false;
375+
370376
std::error_code EC;
371377

372378
uint64_t pos;
373379

374-
bool SupportsSeeking;
375-
376380
/// See raw_ostream::write_impl.
377381
void write_impl(const char *Ptr, size_t Size) override;
378382

llvm/lib/Support/Locale.cpp

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,24 +7,11 @@ namespace sys {
77
namespace locale {
88

99
int columnWidth(StringRef Text) {
10-
#ifdef _WIN32
11-
return Text.size();
12-
#else
1310
return llvm::sys::unicode::columnWidthUTF8(Text);
14-
#endif
1511
}
1612

1713
bool isPrint(int UCS) {
18-
#ifdef _WIN32
19-
// Restrict characters that we'll try to print to the lower part of ASCII
20-
// except for the control characters (0x20 - 0x7E). In general one can not
21-
// reliably output code points U+0080 and higher using narrow character C/C++
22-
// output functions in Windows, because the meaning of the upper 128 codes is
23-
// determined by the active code page in the console.
24-
return ' ' <= UCS && UCS <= '~';
25-
#else
2614
return llvm::sys::unicode::isPrintable(UCS);
27-
#endif
2815
}
2916

3017
} // namespace locale

llvm/lib/Support/SourceMgr.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -345,12 +345,18 @@ static void buildFixItLine(std::string &CaretLine, std::string &FixItLine,
345345
static void printSourceLine(raw_ostream &S, StringRef LineContents) {
346346
// Print out the source line one character at a time, so we can expand tabs.
347347
for (unsigned i = 0, e = LineContents.size(), OutCol = 0; i != e; ++i) {
348-
if (LineContents[i] != '\t') {
349-
S << LineContents[i];
350-
++OutCol;
351-
continue;
348+
size_t NextTab = LineContents.find('\t', i);
349+
// If there were no tabs left, print the rest, we are done.
350+
if (NextTab == StringRef::npos) {
351+
S << LineContents.drop_front(i);
352+
break;
352353
}
353354

355+
// Otherwise, print from i to NextTab.
356+
S << LineContents.slice(i, NextTab);
357+
OutCol += NextTab - i;
358+
i = NextTab;
359+
354360
// If we have a tab, emit at least one space, then round up to 8 columns.
355361
do {
356362
S << ' ';

llvm/lib/Support/raw_ostream.cpp

Lines changed: 76 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
#endif
6161

6262
#ifdef _WIN32
63+
#include "llvm/Support/ConvertUTF.h"
6364
#include "Windows/WindowsSupport.h"
6465
#endif
6566

@@ -567,6 +568,12 @@ raw_fd_ostream::raw_fd_ostream(int fd, bool shouldClose, bool unbuffered)
567568
if (FD <= STDERR_FILENO)
568569
ShouldClose = false;
569570

571+
#ifdef _WIN32
572+
// Check if this is a console device. This is not equivalent to isatty.
573+
IsWindowsConsole =
574+
::GetFileType((HANDLE)::_get_osfhandle(fd)) == FILE_TYPE_CHAR;
575+
#endif
576+
570577
// Get the starting position.
571578
off_t loc = ::lseek(FD, 0, SEEK_CUR);
572579
#ifdef _WIN32
@@ -609,10 +616,68 @@ raw_fd_ostream::~raw_fd_ostream() {
609616
/*GenCrashDiag=*/false);
610617
}
611618

619+
#if defined(_WIN32)
620+
// The most reliable way to print unicode in a Windows console is with
621+
// WriteConsoleW. To use that, first transcode from UTF-8 to UTF-16. This
622+
// assumes that LLVM programs always print valid UTF-8 to the console. The data
623+
// might not be UTF-8 for two major reasons:
624+
// 1. The program is printing binary (-filetype=obj -o -), in which case it
625+
// would have been gibberish anyway.
626+
// 2. The program is printing text in a semi-ascii compatible codepage like
627+
// shift-jis or cp1252.
628+
//
629+
// Most LLVM programs don't produce non-ascii text unless they are quoting
630+
// user source input. A well-behaved LLVM program should either validate that
631+
// the input is UTF-8 or transcode from the local codepage to UTF-8 before
632+
// quoting it. If they don't, this may mess up the encoding, but this is still
633+
// probably the best compromise we can make.
634+
static bool write_console_impl(int FD, StringRef Data) {
635+
SmallVector<wchar_t, 256> WideText;
636+
637+
// Fall back to ::write if it wasn't valid UTF-8.
638+
if (auto EC = sys::windows::UTF8ToUTF16(Data, WideText))
639+
return false;
640+
641+
// On Windows 7 and earlier, WriteConsoleW has a low maximum amount of data
642+
// that can be written to the console at a time.
643+
size_t MaxWriteSize = WideText.size();
644+
if (!RunningWindows8OrGreater())
645+
MaxWriteSize = 32767;
646+
647+
size_t WCharsWritten = 0;
648+
do {
649+
size_t WCharsToWrite =
650+
std::min(MaxWriteSize, WideText.size() - WCharsWritten);
651+
DWORD ActuallyWritten;
652+
bool Success =
653+
::WriteConsoleW((HANDLE)::_get_osfhandle(FD), &WideText[WCharsWritten],
654+
WCharsToWrite, &ActuallyWritten,
655+
/*Reserved=*/nullptr);
656+
657+
// The most likely reason for WriteConsoleW to fail is that FD no longer
658+
// points to a console. Fall back to ::write. If this isn't the first loop
659+
// iteration, something is truly wrong.
660+
if (!Success)
661+
return false;
662+
663+
WCharsWritten += ActuallyWritten;
664+
} while (WCharsWritten != WideText.size());
665+
return true;
666+
}
667+
#endif
668+
612669
void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) {
613670
assert(FD >= 0 && "File already closed.");
614671
pos += Size;
615672

673+
#if defined(_WIN32)
674+
// If this is a Windows console device, try re-encoding from UTF-8 to UTF-16
675+
// and using WriteConsoleW. If that fails, fall back to plain write().
676+
if (IsWindowsConsole)
677+
if (write_console_impl(FD, StringRef(Ptr, Size)))
678+
return;
679+
#endif
680+
616681
// The maximum write size is limited to INT32_MAX. A write
617682
// greater than SSIZE_MAX is implementation-defined in POSIX,
618683
// and Windows _write requires 32 bit input.
@@ -622,12 +687,6 @@ void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) {
622687
// It is observed that Linux returns EINVAL for a very large write (>2G).
623688
// Make it a reasonably small value.
624689
MaxWriteSize = 1024 * 1024 * 1024;
625-
#elif defined(_WIN32)
626-
// Writing a large size of output to Windows console returns ENOMEM. It seems
627-
// that, prior to Windows 8, WriteFile() is redirecting to WriteConsole(), and
628-
// the latter has a size limit (66000 bytes or less, depending on heap usage).
629-
if (::_isatty(FD) && !RunningWindows8OrGreater())
630-
MaxWriteSize = 32767;
631690
#endif
632691

633692
do {
@@ -696,8 +755,17 @@ void raw_fd_ostream::pwrite_impl(const char *Ptr, size_t Size,
696755
}
697756

698757
size_t raw_fd_ostream::preferred_buffer_size() const {
699-
#if !defined(_MSC_VER) && !defined(__MINGW32__) && !defined(__minix)
700-
// Windows and Minix have no st_blksize.
758+
#if defined(_WIN32)
759+
// Disable buffering for console devices. Console output is re-encoded from
760+
// UTF-8 to UTF-16 on Windows, and buffering it would require us to split the
761+
// buffer on a valid UTF-8 codepoint boundary. Terminal buffering is disabled
762+
// below on most other OSs, so do the same thing on Windows and avoid that
763+
// complexity.
764+
if (IsWindowsConsole)
765+
return 0;
766+
return raw_ostream::preferred_buffer_size();
767+
#elif !defined(__minix)
768+
// Minix has no st_blksize.
701769
assert(FD >= 0 && "File not yet open!");
702770
struct stat statbuf;
703771
if (fstat(FD, &statbuf) != 0)

0 commit comments

Comments
 (0)