From 131b151a778f7613559a3db4c984eabcfa0fc676 Mon Sep 17 00:00:00 2001 From: David Hewitt <1939362+davidhewitt@users.noreply.github.com> Date: Thu, 26 Aug 2021 19:29:20 +0100 Subject: [PATCH 1/3] pystring: disable `PyString::data` on big-endian targets --- CHANGELOG.md | 6 +++ src/ffi/cpython/unicodeobject.rs | 68 +++++++++++++++++---------- src/types/string.rs | 81 +++++++++++++++++++------------- 3 files changed, 98 insertions(+), 57 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad4a980e6cd..802be1ddb88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,12 @@ PyO3 versions, please see the [migration guide](https://pyo3.rs/latest/migration The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Changed + +- Mark `PyString::data` as `unsafe` and disable it and some supporting PyUnicode FFI APIs (which depend on a C bitfield) on big-endian targets. [#1834](https://github.com/PyO3/pyo3/pull/1834) + ## [0.14.3] - 2021-08-22 ### Added diff --git a/src/ffi/cpython/unicodeobject.rs b/src/ffi/cpython/unicodeobject.rs index 7c61243513b..1fbd1aac08f 100644 --- a/src/ffi/cpython/unicodeobject.rs +++ b/src/ffi/cpython/unicodeobject.rs @@ -50,29 +50,34 @@ pub struct PyASCIIObject { pub wstr: *mut wchar_t, } +/// Interacting with the bitfield is not actually well-defined, so we mark these APIs unsafe. +/// +/// In addition, they are disabled on big-endian architectures to restrict this to most "common" +/// platforms, which are at least tested on CI and appear to be sound. +#[cfg(not(target_endian = "big"))] impl PyASCIIObject { #[inline] - pub fn interned(&self) -> c_uint { + pub unsafe fn interned(&self) -> c_uint { self.state & 3 } #[inline] - pub fn kind(&self) -> c_uint { + pub unsafe fn kind(&self) -> c_uint { (self.state >> 2) & 7 } #[inline] - pub fn compact(&self) -> c_uint { + pub unsafe fn compact(&self) -> c_uint { (self.state >> 5) & 1 } #[inline] - pub fn ascii(&self) -> c_uint { + pub unsafe fn ascii(&self) -> c_uint { (self.state >> 6) & 1 } #[inline] - pub fn ready(&self) -> c_uint { + pub unsafe fn ready(&self) -> c_uint { (self.state >> 7) & 1 } } @@ -114,6 +119,7 @@ pub const SSTATE_INTERNED_MORTAL: c_uint = 1; pub const SSTATE_INTERNED_IMMORTAL: c_uint = 2; #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint { debug_assert!(PyUnicode_Check(op) != 0); debug_assert!(PyUnicode_IS_READY(op) != 0); @@ -122,11 +128,13 @@ pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint { } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_IS_COMPACT(op: *mut PyObject) -> c_uint { (*(op as *mut PyASCIIObject)).compact() } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_IS_COMPACT_ASCII(op: *mut PyObject) -> c_uint { if (*(op as *mut PyASCIIObject)).ascii() != 0 && PyUnicode_IS_COMPACT(op) != 0 { 1 @@ -144,21 +152,25 @@ pub const PyUnicode_2BYTE_KIND: c_uint = 2; pub const PyUnicode_4BYTE_KIND: c_uint = 4; #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_1BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS1 { PyUnicode_DATA(op) as *mut Py_UCS1 } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_2BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS2 { PyUnicode_DATA(op) as *mut Py_UCS2 } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_4BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS4 { PyUnicode_DATA(op) as *mut Py_UCS4 } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint { debug_assert!(PyUnicode_Check(op) != 0); debug_assert!(PyUnicode_IS_READY(op) != 0); @@ -167,6 +179,7 @@ pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint { } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void { if PyUnicode_IS_ASCII(op) != 0 { (op as *mut PyASCIIObject).offset(1) as *mut c_void @@ -176,6 +189,7 @@ pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void { } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn _PyUnicode_NONCOMPACT_DATA(op: *mut PyObject) -> *mut c_void { debug_assert!(!(*(op as *mut PyUnicodeObject)).data.any.is_null()); @@ -183,6 +197,7 @@ pub unsafe fn _PyUnicode_NONCOMPACT_DATA(op: *mut PyObject) -> *mut c_void { } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_DATA(op: *mut PyObject) -> *mut c_void { debug_assert!(PyUnicode_Check(op) != 0); @@ -206,6 +221,7 @@ pub unsafe fn PyUnicode_GET_LENGTH(op: *mut PyObject) -> Py_ssize_t { } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_IS_READY(op: *mut PyObject) -> c_uint { (*(op as *mut PyASCIIObject)).ready() } @@ -213,6 +229,7 @@ pub unsafe fn PyUnicode_IS_READY(op: *mut PyObject) -> c_uint { #[cfg(not(Py_3_12))] #[cfg_attr(Py_3_10, deprecated(note = "Python 3.10"))] #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_READY(op: *mut PyObject) -> c_int { debug_assert!(PyUnicode_Check(op) != 0); @@ -481,6 +498,7 @@ extern "C" { // skipped _PyUnicode_ScanIdentifier #[cfg(test)] +#[cfg(not(target_endian = "big"))] mod tests { use super::*; use crate::types::PyString; @@ -498,30 +516,32 @@ mod tests { wstr: std::ptr::null_mut() as *mut wchar_t, }; - assert_eq!(o.interned(), 0); - assert_eq!(o.kind(), 0); - assert_eq!(o.compact(), 0); - assert_eq!(o.ascii(), 0); - assert_eq!(o.ready(), 0); + unsafe { + assert_eq!(o.interned(), 0); + assert_eq!(o.kind(), 0); + assert_eq!(o.compact(), 0); + assert_eq!(o.ascii(), 0); + assert_eq!(o.ready(), 0); - for i in 0..4 { - o.state = i; - assert_eq!(o.interned(), i); - } + for i in 0..4 { + o.state = i; + assert_eq!(o.interned(), i); + } - for i in 0..8 { - o.state = i << 2; - assert_eq!(o.kind(), i); - } + for i in 0..8 { + o.state = i << 2; + assert_eq!(o.kind(), i); + } - o.state = 1 << 5; - assert_eq!(o.compact(), 1); + o.state = 1 << 5; + assert_eq!(o.compact(), 1); - o.state = 1 << 6; - assert_eq!(o.ascii(), 1); + o.state = 1 << 6; + assert_eq!(o.ascii(), 1); - o.state = 1 << 7; - assert_eq!(o.ready(), 1); + o.state = 1 << 7; + assert_eq!(o.ready(), 1); + } } #[test] diff --git a/src/types/string.rs b/src/types/string.rs index 9ac1a149839..520f584578e 100644 --- a/src/types/string.rs +++ b/src/types/string.rs @@ -17,8 +17,8 @@ use std::str; /// /// Python internally stores strings in various representations. This enumeration /// represents those variations. -#[cfg(not(Py_LIMITED_API))] -#[cfg_attr(docsrs, doc(cfg(not(Py_LIMITED_API))))] +#[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] +#[cfg_attr(docsrs, doc(cfg(not(any(Py_LIMITED_API, target_endian = "big")))))] #[derive(Clone, Copy, Debug, PartialEq)] pub enum PyStringData<'a> { /// UCS1 representation. @@ -31,7 +31,7 @@ pub enum PyStringData<'a> { Ucs4(&'a [u32]), } -#[cfg(not(Py_LIMITED_API))] +#[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] impl<'a> PyStringData<'a> { /// Obtain the raw bytes backing this instance as a [u8] slice. pub fn as_bytes(&self) -> &[u8] { @@ -211,16 +211,28 @@ impl PyString { /// Obtains the raw data backing the Python string. /// - /// If the Python string object was created through legacy APIs, its internal - /// storage format will be canonicalized before data is returned. - #[cfg(not(Py_LIMITED_API))] - #[cfg_attr(docsrs, doc(cfg(not(Py_LIMITED_API))))] - pub fn data(&self) -> PyResult> { + /// If the Python string object was created through legacy APIs, its internal storage format + /// will be canonicalized before data is returned. + /// + /// # Safety + /// + /// This function implementation relies on manually decoding a C bitfield. In practice, this + /// works well on common little-endian architectures such as x86_64, where the bitfield has a + /// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on + /// x86_64 platforms. + /// + /// By using this API, you accept responsibility for testing that PyStringData behaves as + /// expected on the targets where you plan to distribute your software. + /// + /// For example, it is known not to work on big-endian platforms. + #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] + #[cfg_attr(docsrs, doc(cfg(not(any(Py_LIMITED_API, target_endian = "big")))))] + pub unsafe fn data(&self) -> PyResult> { let ptr = self.as_ptr(); if cfg!(not(Py_3_12)) { #[allow(deprecated)] - let ready = unsafe { ffi::PyUnicode_READY(ptr) }; + let ready = ffi::PyUnicode_READY(ptr); if ready != 0 { // Exception was created on failure. return Err(crate::PyErr::fetch(self.py())); @@ -230,20 +242,23 @@ impl PyString { // The string should be in its canonical form after calling `PyUnicode_READY()`. // And non-canonical form not possible after Python 3.12. So it should be safe // to call these APIs. - let length = unsafe { ffi::PyUnicode_GET_LENGTH(ptr) } as usize; - let raw_data = unsafe { ffi::PyUnicode_DATA(ptr) }; - let kind = unsafe { ffi::PyUnicode_KIND(ptr) }; + let length = ffi::PyUnicode_GET_LENGTH(ptr) as usize; + let raw_data = ffi::PyUnicode_DATA(ptr); + let kind = ffi::PyUnicode_KIND(ptr); match kind { - ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(unsafe { - std::slice::from_raw_parts(raw_data as *const u8, length) - })), - ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(unsafe { - std::slice::from_raw_parts(raw_data as *const u16, length) - })), - ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(unsafe { - std::slice::from_raw_parts(raw_data as *const u32, length) - })), + ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(std::slice::from_raw_parts( + raw_data as *const u8, + length, + ))), + ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(std::slice::from_raw_parts( + raw_data as *const u16, + length, + ))), + ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(std::slice::from_raw_parts( + raw_data as *const u32, + length, + ))), _ => unreachable!(), } } @@ -465,11 +480,11 @@ mod tests { } #[test] - #[cfg(not(Py_LIMITED_API))] + #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] fn test_string_data_ucs1() { Python::with_gil(|py| { let s = PyString::new(py, "hello, world"); - let data = s.data().unwrap(); + let data = unsafe { s.data().unwrap() }; assert_eq!(data, PyStringData::Ucs1(b"hello, world")); assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world")); @@ -478,7 +493,7 @@ mod tests { } #[test] - #[cfg(not(Py_LIMITED_API))] + #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] fn test_string_data_ucs1_invalid() { Python::with_gil(|py| { // 0xfe is not allowed in UTF-8. @@ -492,7 +507,7 @@ mod tests { }; assert!(!ptr.is_null()); let s: &PyString = unsafe { py.from_owned_ptr(ptr) }; - let data = s.data().unwrap(); + let data = unsafe { s.data().unwrap() }; assert_eq!(data, PyStringData::Ucs1(b"f\xfe")); let err = data.to_string(py).unwrap_err(); assert_eq!(err.ptype(py), PyUnicodeDecodeError::type_object(py)); @@ -504,12 +519,12 @@ mod tests { } #[test] - #[cfg(not(Py_LIMITED_API))] + #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] fn test_string_data_ucs2() { Python::with_gil(|py| { let s = py.eval("'foo\\ud800'", None, None).unwrap(); let py_string = s.cast_as::().unwrap(); - let data = py_string.data().unwrap(); + let data = unsafe { py_string.data().unwrap() }; assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800])); assert_eq!( @@ -520,7 +535,7 @@ mod tests { } #[test] - #[cfg(not(Py_LIMITED_API))] + #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] fn test_string_data_ucs2_invalid() { Python::with_gil(|py| { // U+FF22 (valid) & U+d800 (never valid) @@ -534,7 +549,7 @@ mod tests { }; assert!(!ptr.is_null()); let s: &PyString = unsafe { py.from_owned_ptr(ptr) }; - let data = s.data().unwrap(); + let data = unsafe { s.data().unwrap() }; assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800])); let err = data.to_string(py).unwrap_err(); assert_eq!(err.ptype(py), PyUnicodeDecodeError::type_object(py)); @@ -546,12 +561,12 @@ mod tests { } #[test] - #[cfg(not(Py_LIMITED_API))] + #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] fn test_string_data_ucs4() { Python::with_gil(|py| { let s = "哈哈🐈"; let py_string = PyString::new(py, s); - let data = py_string.data().unwrap(); + let data = unsafe { py_string.data().unwrap() }; assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008])); assert_eq!(data.to_string_lossy(), Cow::Owned::(s.to_string())); @@ -559,7 +574,7 @@ mod tests { } #[test] - #[cfg(not(Py_LIMITED_API))] + #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] fn test_string_data_ucs4_invalid() { Python::with_gil(|py| { // U+20000 (valid) & U+d800 (never valid) @@ -573,7 +588,7 @@ mod tests { }; assert!(!ptr.is_null()); let s: &PyString = unsafe { py.from_owned_ptr(ptr) }; - let data = s.data().unwrap(); + let data = unsafe { s.data().unwrap() }; assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800])); let err = data.to_string(py).unwrap_err(); assert_eq!(err.ptype(py), PyUnicodeDecodeError::type_object(py)); From fa9254384856d3f7a16212fdaa2b66845b574532 Mon Sep 17 00:00:00 2001 From: David Hewitt <1939362+davidhewitt@users.noreply.github.com> Date: Sat, 28 Aug 2021 08:12:47 +0100 Subject: [PATCH 2/3] ci: pin parking_lot to 0.11.1 on MSRV --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b671a5a65df..9e13fccbb60 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -138,6 +138,8 @@ jobs: cargo update -p indexmap --precise 1.6.2 cargo update -p hashbrown:0.11.2 --precise 0.9.1 cargo update -p bitflags --precise 1.2.1 + cargo update -p parking_lot --precise 0.11.1 + cargo update -p parking_lot_core --precise 0.8.3 - name: Build docs run: cargo doc --no-deps --no-default-features --features "${{ steps.settings.outputs.all_additive_features }}" From d544c07bbca5e63bd0e719684520dc116015aad7 Mon Sep 17 00:00:00 2001 From: David Hewitt <1939362+davidhewitt@users.noreply.github.com> Date: Sat, 28 Aug 2021 08:52:14 +0100 Subject: [PATCH 3/3] release: 0.14.4 --- CHANGELOG.md | 5 +++-- Cargo.toml | 6 +++--- README.md | 4 ++-- pyo3-build-config/Cargo.toml | 2 +- pyo3-macros-backend/Cargo.toml | 4 ++-- pyo3-macros/Cargo.toml | 4 ++-- 6 files changed, 13 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 802be1ddb88..51c849ac960 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ PyO3 versions, please see the [migration guide](https://pyo3.rs/latest/migration The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [0.14.4] - 2021-08-29 ### Changed @@ -902,7 +902,8 @@ Yanked - Initial release -[unreleased]: https://github.com/pyo3/pyo3/compare/v0.14.3...HEAD +[unreleased]: https://github.com/pyo3/pyo3/compare/v0.14.4...HEAD +[0.14.4]: https://github.com/pyo3/pyo3/compare/v0.14.3...v0.14.4 [0.14.3]: https://github.com/pyo3/pyo3/compare/v0.14.2...v0.14.3 [0.14.2]: https://github.com/pyo3/pyo3/compare/v0.14.1...v0.14.2 [0.14.1]: https://github.com/pyo3/pyo3/compare/v0.14.0...v0.14.1 diff --git a/Cargo.toml b/Cargo.toml index 386de8df9eb..a3b22e7894c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pyo3" -version = "0.14.3" +version = "0.14.4" description = "Bindings to Python interpreter" authors = ["PyO3 Project and Contributors "] readme = "README.md" @@ -24,7 +24,7 @@ num-bigint = { version = "0.4", optional = true } num-complex = { version = ">= 0.2, < 0.5", optional = true } # must stay at 0.1.x for Rust 1.41 compatibility paste = { version = "0.1.18", optional = true } -pyo3-macros = { path = "pyo3-macros", version = "=0.14.3", optional = true } +pyo3-macros = { path = "pyo3-macros", version = "=0.14.4", optional = true } unindent = { version = "0.1.4", optional = true } hashbrown = { version = ">= 0.9, < 0.12", optional = true } indexmap = { version = ">= 1.6, < 1.8", optional = true } @@ -42,7 +42,7 @@ pyo3 = { path = ".", default-features = false, features = ["macros", "auto-initi serde_json = "1.0.61" [build-dependencies] -pyo3-build-config = { path = "pyo3-build-config", version = "0.14.3" } +pyo3-build-config = { path = "pyo3-build-config", version = "0.14.4" } [features] default = ["macros"] diff --git a/README.md b/README.md index ce14db4969a..5d4f45d4641 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ name = "string_sum" crate-type = ["cdylib"] [dependencies.pyo3] -version = "0.14.3" +version = "0.14.4" features = ["extension-module"] ``` @@ -108,7 +108,7 @@ Start a new project with `cargo new` and add `pyo3` to the `Cargo.toml` like th ```toml [dependencies.pyo3] -version = "0.14.3" +version = "0.14.4" features = ["auto-initialize"] ``` diff --git a/pyo3-build-config/Cargo.toml b/pyo3-build-config/Cargo.toml index dc1de0d482c..5dbbe638c4c 100644 --- a/pyo3-build-config/Cargo.toml +++ b/pyo3-build-config/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pyo3-build-config" -version = "0.14.3" +version = "0.14.4" description = "Build configuration for the PyO3 ecosystem" authors = ["PyO3 Project and Contributors "] keywords = ["pyo3", "python", "cpython", "ffi"] diff --git a/pyo3-macros-backend/Cargo.toml b/pyo3-macros-backend/Cargo.toml index ebe108984aa..0d3fc54f58b 100644 --- a/pyo3-macros-backend/Cargo.toml +++ b/pyo3-macros-backend/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pyo3-macros-backend" -version = "0.14.3" +version = "0.14.4" description = "Code generation for PyO3 package" authors = ["PyO3 Project and Contributors "] keywords = ["pyo3", "python", "cpython", "ffi"] @@ -16,7 +16,7 @@ edition = "2018" [dependencies] quote = { version = "1", default-features = false } proc-macro2 = { version = "1", default-features = false } -pyo3-build-config = { path = "../pyo3-build-config", version = "0.14.3" } +pyo3-build-config = { path = "../pyo3-build-config", version = "0.14.4" } [dependencies.syn] version = "1" diff --git a/pyo3-macros/Cargo.toml b/pyo3-macros/Cargo.toml index a14589977ff..81d8db4370c 100644 --- a/pyo3-macros/Cargo.toml +++ b/pyo3-macros/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pyo3-macros" -version = "0.14.3" +version = "0.14.4" description = "Proc macros for PyO3 package" authors = ["PyO3 Project and Contributors "] keywords = ["pyo3", "python", "cpython", "ffi"] @@ -16,4 +16,4 @@ proc-macro = true [dependencies] quote = "1" syn = { version = "1", features = ["full", "extra-traits"] } -pyo3-macros-backend = { path = "../pyo3-macros-backend", version = "=0.14.3" } +pyo3-macros-backend = { path = "../pyo3-macros-backend", version = "=0.14.4" }