Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pystring: disable PyString::data on big-endian targets #1834

Merged
merged 1 commit into from
Aug 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- `PySequence::get_slice` now returns `PyResult<&PySequence>` instead of `PyResult<&PyAny>`. [#1829](https://github.com/PyO3/pyo3/pull/1829)
- Deprecate `PyTuple::split_from`. [#1804](https://github.com/PyO3/pyo3/pull/1804)
- Deprecate `PyTuple::slice`, new method `PyTuple::get_slice` added with `usize` indices. [#1828](https://github.com/PyO3/pyo3/pull/1828)
- Mark `PyString::data` as `unsafe` and disable it and some supporting PyUnicode FFI APIs (which depend on a C bitfield) on big-endian targets. [#1834](https://github.com/PyO3/pyo3/pull/1834)

## [0.14.3] - 2021-08-22

Expand Down
68 changes: 44 additions & 24 deletions src/ffi/cpython/unicodeobject.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,29 +50,34 @@ pub struct PyASCIIObject {
pub wstr: *mut wchar_t,
}

/// Interacting with the bitfield is not actually well-defined, so we mark these APIs unsafe.
///
/// In addition, they are disabled on big-endian architectures to restrict this to most "common"
/// platforms, which are at least tested on CI and appear to be sound.
#[cfg(not(target_endian = "big"))]
impl PyASCIIObject {
#[inline]
pub fn interned(&self) -> c_uint {
pub unsafe fn interned(&self) -> c_uint {
self.state & 3
}

#[inline]
pub fn kind(&self) -> c_uint {
pub unsafe fn kind(&self) -> c_uint {
(self.state >> 2) & 7
}

#[inline]
pub fn compact(&self) -> c_uint {
pub unsafe fn compact(&self) -> c_uint {
(self.state >> 5) & 1
}

#[inline]
pub fn ascii(&self) -> c_uint {
pub unsafe fn ascii(&self) -> c_uint {
(self.state >> 6) & 1
}

#[inline]
pub fn ready(&self) -> c_uint {
pub unsafe fn ready(&self) -> c_uint {
(self.state >> 7) & 1
}
}
Expand Down Expand Up @@ -114,6 +119,7 @@ pub const SSTATE_INTERNED_MORTAL: c_uint = 1;
pub const SSTATE_INTERNED_IMMORTAL: c_uint = 2;

#[inline]
#[cfg(not(target_endian = "big"))]
pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint {
debug_assert!(PyUnicode_Check(op) != 0);
debug_assert!(PyUnicode_IS_READY(op) != 0);
Expand All @@ -122,11 +128,13 @@ pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint {
}

#[inline]
#[cfg(not(target_endian = "big"))]
pub unsafe fn PyUnicode_IS_COMPACT(op: *mut PyObject) -> c_uint {
(*(op as *mut PyASCIIObject)).compact()
}

#[inline]
#[cfg(not(target_endian = "big"))]
pub unsafe fn PyUnicode_IS_COMPACT_ASCII(op: *mut PyObject) -> c_uint {
if (*(op as *mut PyASCIIObject)).ascii() != 0 && PyUnicode_IS_COMPACT(op) != 0 {
1
Expand All @@ -144,21 +152,25 @@ pub const PyUnicode_2BYTE_KIND: c_uint = 2;
pub const PyUnicode_4BYTE_KIND: c_uint = 4;

#[inline]
#[cfg(not(target_endian = "big"))]
pub unsafe fn PyUnicode_1BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS1 {
PyUnicode_DATA(op) as *mut Py_UCS1
}

#[inline]
#[cfg(not(target_endian = "big"))]
pub unsafe fn PyUnicode_2BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS2 {
PyUnicode_DATA(op) as *mut Py_UCS2
}

#[inline]
#[cfg(not(target_endian = "big"))]
pub unsafe fn PyUnicode_4BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS4 {
PyUnicode_DATA(op) as *mut Py_UCS4
}

#[inline]
#[cfg(not(target_endian = "big"))]
pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint {
debug_assert!(PyUnicode_Check(op) != 0);
debug_assert!(PyUnicode_IS_READY(op) != 0);
Expand All @@ -167,6 +179,7 @@ pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint {
}

#[inline]
#[cfg(not(target_endian = "big"))]
pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void {
if PyUnicode_IS_ASCII(op) != 0 {
(op as *mut PyASCIIObject).offset(1) as *mut c_void
Expand All @@ -176,13 +189,15 @@ pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void {
}

#[inline]
#[cfg(not(target_endian = "big"))]
pub unsafe fn _PyUnicode_NONCOMPACT_DATA(op: *mut PyObject) -> *mut c_void {
debug_assert!(!(*(op as *mut PyUnicodeObject)).data.any.is_null());

(*(op as *mut PyUnicodeObject)).data.any
}

#[inline]
#[cfg(not(target_endian = "big"))]
pub unsafe fn PyUnicode_DATA(op: *mut PyObject) -> *mut c_void {
debug_assert!(PyUnicode_Check(op) != 0);

Expand All @@ -206,13 +221,15 @@ pub unsafe fn PyUnicode_GET_LENGTH(op: *mut PyObject) -> Py_ssize_t {
}

#[inline]
#[cfg(not(target_endian = "big"))]
pub unsafe fn PyUnicode_IS_READY(op: *mut PyObject) -> c_uint {
(*(op as *mut PyASCIIObject)).ready()
}

#[cfg(not(Py_3_12))]
#[cfg_attr(Py_3_10, deprecated(note = "Python 3.10"))]
#[inline]
#[cfg(not(target_endian = "big"))]
pub unsafe fn PyUnicode_READY(op: *mut PyObject) -> c_int {
debug_assert!(PyUnicode_Check(op) != 0);

Expand Down Expand Up @@ -481,6 +498,7 @@ extern "C" {
// skipped _PyUnicode_ScanIdentifier

#[cfg(test)]
#[cfg(not(target_endian = "big"))]
mod tests {
use super::*;
use crate::types::PyString;
Expand All @@ -498,30 +516,32 @@ mod tests {
wstr: std::ptr::null_mut() as *mut wchar_t,
};

assert_eq!(o.interned(), 0);
assert_eq!(o.kind(), 0);
assert_eq!(o.compact(), 0);
assert_eq!(o.ascii(), 0);
assert_eq!(o.ready(), 0);
unsafe {
assert_eq!(o.interned(), 0);
assert_eq!(o.kind(), 0);
assert_eq!(o.compact(), 0);
assert_eq!(o.ascii(), 0);
assert_eq!(o.ready(), 0);

for i in 0..4 {
o.state = i;
assert_eq!(o.interned(), i);
}
for i in 0..4 {
o.state = i;
assert_eq!(o.interned(), i);
}

for i in 0..8 {
o.state = i << 2;
assert_eq!(o.kind(), i);
}
for i in 0..8 {
o.state = i << 2;
assert_eq!(o.kind(), i);
}

o.state = 1 << 5;
assert_eq!(o.compact(), 1);
o.state = 1 << 5;
assert_eq!(o.compact(), 1);

o.state = 1 << 6;
assert_eq!(o.ascii(), 1);
o.state = 1 << 6;
assert_eq!(o.ascii(), 1);

o.state = 1 << 7;
assert_eq!(o.ready(), 1);
o.state = 1 << 7;
assert_eq!(o.ready(), 1);
}
}

#[test]
Expand Down
81 changes: 48 additions & 33 deletions src/types/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ use std::str;
///
/// Python internally stores strings in various representations. This enumeration
/// represents those variations.
#[cfg(not(Py_LIMITED_API))]
#[cfg_attr(docsrs, doc(cfg(not(Py_LIMITED_API))))]
#[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
#[cfg_attr(docsrs, doc(cfg(not(any(Py_LIMITED_API, target_endian = "big")))))]
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum PyStringData<'a> {
/// UCS1 representation.
Expand All @@ -31,7 +31,7 @@ pub enum PyStringData<'a> {
Ucs4(&'a [u32]),
}

#[cfg(not(Py_LIMITED_API))]
#[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
impl<'a> PyStringData<'a> {
/// Obtain the raw bytes backing this instance as a [u8] slice.
pub fn as_bytes(&self) -> &[u8] {
Expand Down Expand Up @@ -211,16 +211,28 @@ impl PyString {

/// Obtains the raw data backing the Python string.
///
/// If the Python string object was created through legacy APIs, its internal
/// storage format will be canonicalized before data is returned.
#[cfg(not(Py_LIMITED_API))]
#[cfg_attr(docsrs, doc(cfg(not(Py_LIMITED_API))))]
pub fn data(&self) -> PyResult<PyStringData<'_>> {
/// If the Python string object was created through legacy APIs, its internal storage format
/// will be canonicalized before data is returned.
///
/// # Safety
///
/// This function implementation relies on manually decoding a C bitfield. In practice, this
/// works well on common little-endian architectures such as x86_64, where the bitfield has a
/// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on
/// x86_64 platforms.
///
/// By using this API, you accept responsibility for testing that PyStringData behaves as
/// expected on the targets where you plan to distribute your software.
///
/// For example, it is known not to work on big-endian platforms.
#[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
#[cfg_attr(docsrs, doc(cfg(not(any(Py_LIMITED_API, target_endian = "big")))))]
pub unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
let ptr = self.as_ptr();

if cfg!(not(Py_3_12)) {
#[allow(deprecated)]
let ready = unsafe { ffi::PyUnicode_READY(ptr) };
let ready = ffi::PyUnicode_READY(ptr);
if ready != 0 {
// Exception was created on failure.
return Err(crate::PyErr::api_call_failed(self.py()));
Expand All @@ -230,20 +242,23 @@ impl PyString {
// The string should be in its canonical form after calling `PyUnicode_READY()`.
// And non-canonical form not possible after Python 3.12. So it should be safe
// to call these APIs.
let length = unsafe { ffi::PyUnicode_GET_LENGTH(ptr) } as usize;
let raw_data = unsafe { ffi::PyUnicode_DATA(ptr) };
let kind = unsafe { ffi::PyUnicode_KIND(ptr) };
let length = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
let raw_data = ffi::PyUnicode_DATA(ptr);
let kind = ffi::PyUnicode_KIND(ptr);

match kind {
ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(unsafe {
std::slice::from_raw_parts(raw_data as *const u8, length)
})),
ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(unsafe {
std::slice::from_raw_parts(raw_data as *const u16, length)
})),
ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(unsafe {
std::slice::from_raw_parts(raw_data as *const u32, length)
})),
ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(std::slice::from_raw_parts(
raw_data as *const u8,
length,
))),
ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(std::slice::from_raw_parts(
raw_data as *const u16,
length,
))),
ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(std::slice::from_raw_parts(
raw_data as *const u32,
length,
))),
_ => unreachable!(),
}
}
Expand Down Expand Up @@ -465,11 +480,11 @@ mod tests {
}

#[test]
#[cfg(not(Py_LIMITED_API))]
#[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
fn test_string_data_ucs1() {
Python::with_gil(|py| {
let s = PyString::new(py, "hello, world");
let data = s.data().unwrap();
let data = unsafe { s.data().unwrap() };

assert_eq!(data, PyStringData::Ucs1(b"hello, world"));
assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world"));
Expand All @@ -478,7 +493,7 @@ mod tests {
}

#[test]
#[cfg(not(Py_LIMITED_API))]
#[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
fn test_string_data_ucs1_invalid() {
Python::with_gil(|py| {
// 0xfe is not allowed in UTF-8.
Expand All @@ -492,7 +507,7 @@ mod tests {
};
assert!(!ptr.is_null());
let s: &PyString = unsafe { py.from_owned_ptr(ptr) };
let data = s.data().unwrap();
let data = unsafe { s.data().unwrap() };
assert_eq!(data, PyStringData::Ucs1(b"f\xfe"));
let err = data.to_string(py).unwrap_err();
assert_eq!(err.ptype(py), PyUnicodeDecodeError::type_object(py));
Expand All @@ -504,12 +519,12 @@ mod tests {
}

#[test]
#[cfg(not(Py_LIMITED_API))]
#[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
fn test_string_data_ucs2() {
Python::with_gil(|py| {
let s = py.eval("'foo\\ud800'", None, None).unwrap();
let py_string = s.cast_as::<PyString>().unwrap();
let data = py_string.data().unwrap();
let data = unsafe { py_string.data().unwrap() };

assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800]));
assert_eq!(
Expand All @@ -520,7 +535,7 @@ mod tests {
}

#[test]
#[cfg(not(Py_LIMITED_API))]
#[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
fn test_string_data_ucs2_invalid() {
Python::with_gil(|py| {
// U+FF22 (valid) & U+d800 (never valid)
Expand All @@ -534,7 +549,7 @@ mod tests {
};
assert!(!ptr.is_null());
let s: &PyString = unsafe { py.from_owned_ptr(ptr) };
let data = s.data().unwrap();
let data = unsafe { s.data().unwrap() };
assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800]));
let err = data.to_string(py).unwrap_err();
assert_eq!(err.ptype(py), PyUnicodeDecodeError::type_object(py));
Expand All @@ -546,20 +561,20 @@ mod tests {
}

#[test]
#[cfg(not(Py_LIMITED_API))]
#[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
fn test_string_data_ucs4() {
Python::with_gil(|py| {
let s = "哈哈🐈";
let py_string = PyString::new(py, s);
let data = py_string.data().unwrap();
let data = unsafe { py_string.data().unwrap() };

assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008]));
assert_eq!(data.to_string_lossy(), Cow::Owned::<str>(s.to_string()));
})
}

#[test]
#[cfg(not(Py_LIMITED_API))]
#[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
fn test_string_data_ucs4_invalid() {
Python::with_gil(|py| {
// U+20000 (valid) & U+d800 (never valid)
Expand All @@ -573,7 +588,7 @@ mod tests {
};
assert!(!ptr.is_null());
let s: &PyString = unsafe { py.from_owned_ptr(ptr) };
let data = s.data().unwrap();
let data = unsafe { s.data().unwrap() };
assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800]));
let err = data.to_string(py).unwrap_err();
assert_eq!(err.ptype(py), PyUnicodeDecodeError::type_object(py));
Expand Down