Skip to content

Commit 4d655b0

Browse files
committed
Fix Unicode crashes redirected cout
1 parent bf96760 commit 4d655b0

File tree

1 file changed

+57
-4
lines changed

1 file changed

+57
-4
lines changed

Diff for: include/pybind11/iostream.h

+57-4
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@
1616
#include <string>
1717
#include <memory>
1818
#include <iostream>
19+
#include <cstring>
20+
#include <iterator>
21+
#include <algorithm>
1922

2023
PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
2124
PYBIND11_NAMESPACE_BEGIN(detail)
@@ -38,6 +41,47 @@ class pythonbuf : public std::streambuf {
3841
return sync() == 0 ? traits_type::not_eof(c) : traits_type::eof();
3942
}
4043

44+
// Computes how many bytes at the end of the buffer are part of an
45+
// incomplete sequence of UTF-8 bytes.
46+
// Precondition: pbase() < pptr()
47+
size_t utf8_remainder() const {
48+
const auto rbase = std::reverse_iterator<char *>(pbase());
49+
const auto rpptr = std::reverse_iterator<char *>(pptr());
50+
auto is_ascii = [](char c) {
51+
return (static_cast<unsigned char>(c) & 0x80) == 0x00;
52+
};
53+
auto is_leading = [](char c) {
54+
return (static_cast<unsigned char>(c) & 0xC0) == 0xC0;
55+
};
56+
auto is_leading_2b = [](char c) {
57+
return static_cast<unsigned char>(c) <= 0xDF;
58+
};
59+
auto is_leading_3b = [](char c) {
60+
return static_cast<unsigned char>(c) <= 0xEF;
61+
};
62+
// If the last character is ASCII, there are no incomplete code points
63+
if (is_ascii(*rpptr))
64+
return 0;
65+
// Otherwise, work back from the end of the buffer and find the first
66+
// UTF-8 leading byte
67+
const auto rpend = rbase - rpptr >= 3 ? rpptr + 3 : rbase;
68+
const auto leading = std::find_if(rpptr, rpend, is_leading);
69+
const auto dist = static_cast<size_t>(leading - rpptr);
70+
size_t remainder = 0;
71+
72+
if (dist == 0)
73+
remainder = 1; // 1-byte code point is impossible
74+
else if (dist == 1)
75+
remainder = is_leading_2b(*leading) ? 0 : dist + 1;
76+
else if (dist == 2)
77+
remainder = is_leading_3b(*leading) ? 0 : dist + 1;
78+
// else if (dist >= 3), at least 4 bytes before encountering an UTF-8
79+
// leading byte, either no remainder or invalid UTF-8.
80+
// Invalid UTF-8 will cause an exception later when converting
81+
// to a Python string, so that's not handled here.
82+
return remainder;
83+
}
84+
4185
// This function must be non-virtual to be called in a destructor. If the
4286
// rare MSVC test failure shows up with this version, then this should be
4387
// simplified to a fully qualified call.
@@ -48,13 +92,22 @@ class pythonbuf : public std::streambuf {
4892
gil_scoped_acquire tmp;
4993

5094
// This subtraction cannot be negative, so dropping the sign.
51-
str line(pbase(), static_cast<size_t>(pptr() - pbase()));
95+
auto size = static_cast<size_t>(pptr() - pbase());
96+
size_t remainder = utf8_remainder();
97+
98+
if (size > remainder) {
99+
str line(pbase(), size - remainder);
100+
pywrite(line);
101+
pyflush();
102+
}
52103

53-
pywrite(line);
54-
pyflush();
104+
// Placed inside gil_scoped_acquire as a mutex to avoid a race.
55105

56-
// Placed inside gil_scoped_aquire as a mutex to avoid a race
106+
// Copy the remainder at the end of the buffer to the beginning:
107+
if (remainder > 0)
108+
std::memmove(pbase(), pptr() - remainder, remainder);
57109
setp(pbase(), epptr());
110+
pbump(static_cast<int>(remainder));
58111
}
59112

60113
}

0 commit comments

Comments
 (0)