PyO3 · messense · Aug 27, 2021 · Aug 26, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -26,6 +26,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - `PySequence::get_slice` now returns `PyResult<&PySequence>` instead of `PyResult<&PyAny>`. [#1829](https://github.com/PyO3/pyo3/pull/1829)
 - Deprecate `PyTuple::split_from`. [#1804](https://github.com/PyO3/pyo3/pull/1804)
 - Deprecate `PyTuple::slice`, new method `PyTuple::get_slice` added with `usize` indices. [#1828](https://github.com/PyO3/pyo3/pull/1828)
+- Mark `PyString::data` as `unsafe` and disable it and some supporting PyUnicode FFI APIs (which depend on a C bitfield) on big-endian targets. [#1834](https://github.com/PyO3/pyo3/pull/1834)
 
 ## [0.14.3] - 2021-08-22
 

diff --git a/src/ffi/cpython/unicodeobject.rs b/src/ffi/cpython/unicodeobject.rs
@@ -50,29 +50,34 @@ pub struct PyASCIIObject {
     pub wstr: *mut wchar_t,
 }
 
+/// Interacting with the bitfield is not actually well-defined, so we mark these APIs unsafe.
+///
+/// In addition, they are disabled on big-endian architectures to restrict this to most "common"
+/// platforms, which are at least tested on CI and appear to be sound.
+#[cfg(not(target_endian = "big"))]
 impl PyASCIIObject {
     #[inline]
-    pub fn interned(&self) -> c_uint {
+    pub unsafe fn interned(&self) -> c_uint {
         self.state & 3
     }
 
     #[inline]
-    pub fn kind(&self) -> c_uint {
+    pub unsafe fn kind(&self) -> c_uint {
         (self.state >> 2) & 7
     }
 
     #[inline]
-    pub fn compact(&self) -> c_uint {
+    pub unsafe fn compact(&self) -> c_uint {
         (self.state >> 5) & 1
     }
 
     #[inline]
-    pub fn ascii(&self) -> c_uint {
+    pub unsafe fn ascii(&self) -> c_uint {
         (self.state >> 6) & 1
     }
 
     #[inline]
-    pub fn ready(&self) -> c_uint {
+    pub unsafe fn ready(&self) -> c_uint {
         (self.state >> 7) & 1
     }
 }
@@ -114,6 +119,7 @@ pub const SSTATE_INTERNED_MORTAL: c_uint = 1;
 pub const SSTATE_INTERNED_IMMORTAL: c_uint = 2;
 
 #[inline]
+#[cfg(not(target_endian = "big"))]
 pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint {
     debug_assert!(PyUnicode_Check(op) != 0);
     debug_assert!(PyUnicode_IS_READY(op) != 0);
@@ -122,11 +128,13 @@ pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint {
 }
 
 #[inline]
+#[cfg(not(target_endian = "big"))]
 pub unsafe fn PyUnicode_IS_COMPACT(op: *mut PyObject) -> c_uint {
     (*(op as *mut PyASCIIObject)).compact()
 }
 
 #[inline]
+#[cfg(not(target_endian = "big"))]
 pub unsafe fn PyUnicode_IS_COMPACT_ASCII(op: *mut PyObject) -> c_uint {
     if (*(op as *mut PyASCIIObject)).ascii() != 0 && PyUnicode_IS_COMPACT(op) != 0 {
         1
@@ -144,21 +152,25 @@ pub const PyUnicode_2BYTE_KIND: c_uint = 2;
 pub const PyUnicode_4BYTE_KIND: c_uint = 4;
 
 #[inline]
+#[cfg(not(target_endian = "big"))]
 pub unsafe fn PyUnicode_1BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS1 {
     PyUnicode_DATA(op) as *mut Py_UCS1
 }
 
 #[inline]
+#[cfg(not(target_endian = "big"))]
 pub unsafe fn PyUnicode_2BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS2 {
     PyUnicode_DATA(op) as *mut Py_UCS2
 }
 
 #[inline]
+#[cfg(not(target_endian = "big"))]
 pub unsafe fn PyUnicode_4BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS4 {
     PyUnicode_DATA(op) as *mut Py_UCS4
 }
 
 #[inline]
+#[cfg(not(target_endian = "big"))]
 pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint {
     debug_assert!(PyUnicode_Check(op) != 0);
     debug_assert!(PyUnicode_IS_READY(op) != 0);
@@ -167,6 +179,7 @@ pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint {
 }
 
 #[inline]
+#[cfg(not(target_endian = "big"))]
 pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void {
     if PyUnicode_IS_ASCII(op) != 0 {
         (op as *mut PyASCIIObject).offset(1) as *mut c_void
@@ -176,13 +189,15 @@ pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void {
 }
 
 #[inline]
+#[cfg(not(target_endian = "big"))]
 pub unsafe fn _PyUnicode_NONCOMPACT_DATA(op: *mut PyObject) -> *mut c_void {
     debug_assert!(!(*(op as *mut PyUnicodeObject)).data.any.is_null());
 
     (*(op as *mut PyUnicodeObject)).data.any
 }
 
 #[inline]
+#[cfg(not(target_endian = "big"))]
 pub unsafe fn PyUnicode_DATA(op: *mut PyObject) -> *mut c_void {
     debug_assert!(PyUnicode_Check(op) != 0);
 
@@ -206,13 +221,15 @@ pub unsafe fn PyUnicode_GET_LENGTH(op: *mut PyObject) -> Py_ssize_t {
 }
 
 #[inline]
+#[cfg(not(target_endian = "big"))]
 pub unsafe fn PyUnicode_IS_READY(op: *mut PyObject) -> c_uint {
     (*(op as *mut PyASCIIObject)).ready()
 }
 
 #[cfg(not(Py_3_12))]
 #[cfg_attr(Py_3_10, deprecated(note = "Python 3.10"))]
 #[inline]
+#[cfg(not(target_endian = "big"))]
 pub unsafe fn PyUnicode_READY(op: *mut PyObject) -> c_int {
     debug_assert!(PyUnicode_Check(op) != 0);
 
@@ -481,6 +498,7 @@ extern "C" {
 // skipped _PyUnicode_ScanIdentifier
 
 #[cfg(test)]
+#[cfg(not(target_endian = "big"))]
 mod tests {
     use super::*;
     use crate::types::PyString;
@@ -498,30 +516,32 @@ mod tests {
             wstr: std::ptr::null_mut() as *mut wchar_t,
         };
 
-        assert_eq!(o.interned(), 0);
-        assert_eq!(o.kind(), 0);
-        assert_eq!(o.compact(), 0);
-        assert_eq!(o.ascii(), 0);
-        assert_eq!(o.ready(), 0);
+        unsafe {
+            assert_eq!(o.interned(), 0);
+            assert_eq!(o.kind(), 0);
+            assert_eq!(o.compact(), 0);
+            assert_eq!(o.ascii(), 0);
+            assert_eq!(o.ready(), 0);
 
-        for i in 0..4 {
-            o.state = i;
-            assert_eq!(o.interned(), i);
-        }
+            for i in 0..4 {
+                o.state = i;
+                assert_eq!(o.interned(), i);
+            }
 
-        for i in 0..8 {
-            o.state = i << 2;
-            assert_eq!(o.kind(), i);
-        }
+            for i in 0..8 {
+                o.state = i << 2;
+                assert_eq!(o.kind(), i);
+            }
 
-        o.state = 1 << 5;
-        assert_eq!(o.compact(), 1);
+            o.state = 1 << 5;
+            assert_eq!(o.compact(), 1);
 
-        o.state = 1 << 6;
-        assert_eq!(o.ascii(), 1);
+            o.state = 1 << 6;
+            assert_eq!(o.ascii(), 1);
 
-        o.state = 1 << 7;
-        assert_eq!(o.ready(), 1);
+            o.state = 1 << 7;
+            assert_eq!(o.ready(), 1);
+        }
     }
 
     #[test]

diff --git a/src/types/string.rs b/src/types/string.rs
@@ -17,8 +17,8 @@ use std::str;
 ///
 /// Python internally stores strings in various representations. This enumeration
 /// represents those variations.
-#[cfg(not(Py_LIMITED_API))]
-#[cfg_attr(docsrs, doc(cfg(not(Py_LIMITED_API))))]
+#[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
+#[cfg_attr(docsrs, doc(cfg(not(any(Py_LIMITED_API, target_endian = "big")))))]
 #[derive(Clone, Copy, Debug, PartialEq)]
 pub enum PyStringData<'a> {
     /// UCS1 representation.
@@ -31,7 +31,7 @@ pub enum PyStringData<'a> {
     Ucs4(&'a [u32]),
 }
 
-#[cfg(not(Py_LIMITED_API))]
+#[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
 impl<'a> PyStringData<'a> {
     /// Obtain the raw bytes backing this instance as a [u8] slice.
     pub fn as_bytes(&self) -> &[u8] {
@@ -211,16 +211,28 @@ impl PyString {
 
     /// Obtains the raw data backing the Python string.
     ///
-    /// If the Python string object was created through legacy APIs, its internal
-    /// storage format will be canonicalized before data is returned.
-    #[cfg(not(Py_LIMITED_API))]
-    #[cfg_attr(docsrs, doc(cfg(not(Py_LIMITED_API))))]
-    pub fn data(&self) -> PyResult<PyStringData<'_>> {
+    /// If the Python string object was created through legacy APIs, its internal storage format
+    /// will be canonicalized before data is returned.
+    ///
+    /// # Safety
+    ///
+    /// This function implementation relies on manually decoding a C bitfield. In practice, this
+    /// works well on common little-endian architectures such as x86_64, where the bitfield has a
+    /// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on
+    /// x86_64 platforms.
+    ///
+    /// By using this API, you accept responsibility for testing that PyStringData behaves as
+    /// expected on the targets where you plan to distribute your software.
+    ///
+    /// For example, it is known not to work on big-endian platforms.
+    #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
+    #[cfg_attr(docsrs, doc(cfg(not(any(Py_LIMITED_API, target_endian = "big")))))]
+    pub unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
         let ptr = self.as_ptr();
 
         if cfg!(not(Py_3_12)) {
             #[allow(deprecated)]
-            let ready = unsafe { ffi::PyUnicode_READY(ptr) };
+            let ready = ffi::PyUnicode_READY(ptr);
             if ready != 0 {
                 // Exception was created on failure.
                 return Err(crate::PyErr::api_call_failed(self.py()));
@@ -230,20 +242,23 @@ impl PyString {
         // The string should be in its canonical form after calling `PyUnicode_READY()`.
         // And non-canonical form not possible after Python 3.12. So it should be safe
         // to call these APIs.
-        let length = unsafe { ffi::PyUnicode_GET_LENGTH(ptr) } as usize;
-        let raw_data = unsafe { ffi::PyUnicode_DATA(ptr) };
-        let kind = unsafe { ffi::PyUnicode_KIND(ptr) };
+        let length = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
+        let raw_data = ffi::PyUnicode_DATA(ptr);
+        let kind = ffi::PyUnicode_KIND(ptr);
 
         match kind {
-            ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(unsafe {
-                std::slice::from_raw_parts(raw_data as *const u8, length)
-            })),
-            ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(unsafe {
-                std::slice::from_raw_parts(raw_data as *const u16, length)
-            })),
-            ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(unsafe {
-                std::slice::from_raw_parts(raw_data as *const u32, length)
-            })),
+            ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(std::slice::from_raw_parts(
+                raw_data as *const u8,
+                length,
+            ))),
+            ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(std::slice::from_raw_parts(
+                raw_data as *const u16,
+                length,
+            ))),
+            ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(std::slice::from_raw_parts(
+                raw_data as *const u32,
+                length,
+            ))),
             _ => unreachable!(),
         }
     }
@@ -465,11 +480,11 @@ mod tests {
     }
 
     #[test]
-    #[cfg(not(Py_LIMITED_API))]
+    #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
     fn test_string_data_ucs1() {
         Python::with_gil(|py| {
             let s = PyString::new(py, "hello, world");
-            let data = s.data().unwrap();
+            let data = unsafe { s.data().unwrap() };
 
             assert_eq!(data, PyStringData::Ucs1(b"hello, world"));
             assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world"));
@@ -478,7 +493,7 @@ mod tests {
     }
 
     #[test]
-    #[cfg(not(Py_LIMITED_API))]
+    #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
     fn test_string_data_ucs1_invalid() {
         Python::with_gil(|py| {
             // 0xfe is not allowed in UTF-8.
@@ -492,7 +507,7 @@ mod tests {
             };
             assert!(!ptr.is_null());
             let s: &PyString = unsafe { py.from_owned_ptr(ptr) };
-            let data = s.data().unwrap();
+            let data = unsafe { s.data().unwrap() };
             assert_eq!(data, PyStringData::Ucs1(b"f\xfe"));
             let err = data.to_string(py).unwrap_err();
             assert_eq!(err.ptype(py), PyUnicodeDecodeError::type_object(py));
@@ -504,12 +519,12 @@ mod tests {
     }
 
     #[test]
-    #[cfg(not(Py_LIMITED_API))]
+    #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
     fn test_string_data_ucs2() {
         Python::with_gil(|py| {
             let s = py.eval("'foo\\ud800'", None, None).unwrap();
             let py_string = s.cast_as::<PyString>().unwrap();
-            let data = py_string.data().unwrap();
+            let data = unsafe { py_string.data().unwrap() };
 
             assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800]));
             assert_eq!(
@@ -520,7 +535,7 @@ mod tests {
     }
 
     #[test]
-    #[cfg(not(Py_LIMITED_API))]
+    #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
     fn test_string_data_ucs2_invalid() {
         Python::with_gil(|py| {
             // U+FF22 (valid) & U+d800 (never valid)
@@ -534,7 +549,7 @@ mod tests {
             };
             assert!(!ptr.is_null());
             let s: &PyString = unsafe { py.from_owned_ptr(ptr) };
-            let data = s.data().unwrap();
+            let data = unsafe { s.data().unwrap() };
             assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800]));
             let err = data.to_string(py).unwrap_err();
             assert_eq!(err.ptype(py), PyUnicodeDecodeError::type_object(py));
@@ -546,20 +561,20 @@ mod tests {
     }
 
     #[test]
-    #[cfg(not(Py_LIMITED_API))]
+    #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
     fn test_string_data_ucs4() {
         Python::with_gil(|py| {
             let s = "哈哈🐈";
             let py_string = PyString::new(py, s);
-            let data = py_string.data().unwrap();
+            let data = unsafe { py_string.data().unwrap() };
 
             assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008]));
             assert_eq!(data.to_string_lossy(), Cow::Owned::<str>(s.to_string()));
         })
     }
 
     #[test]
-    #[cfg(not(Py_LIMITED_API))]
+    #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))]
     fn test_string_data_ucs4_invalid() {
         Python::with_gil(|py| {
             // U+20000 (valid) & U+d800 (never valid)
@@ -573,7 +588,7 @@ mod tests {
             };
             assert!(!ptr.is_null());
             let s: &PyString = unsafe { py.from_owned_ptr(ptr) };
-            let data = s.data().unwrap();
+            let data = unsafe { s.data().unwrap() };
             assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800]));
             let err = data.to_string(py).unwrap_err();
             assert_eq!(err.ptype(py), PyUnicodeDecodeError::type_object(py));