Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix handling of invalid utf-8 sequences in PyString::to_string_lossy #642

Merged
merged 1 commit into from
Oct 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

* FFI compatibility for PEP 590 Vectorcall.

### Fixed

* Fix handling of invalid utf-8 sequences in `PyString::as_bytes` [#639](https://github.com/PyO3/pyo3/pull/639)
and `PyString::to_string_lossy` [#642](https://github.com/PyO3/pyo3/pull/642).

## [0.8.1]

### Added
Expand Down
5 changes: 5 additions & 0 deletions examples/rustapi_module/src/buf_and_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ impl BytesExtractor {
let rust_string: String = string.extract().unwrap();
Ok(rust_string.len())
}

pub fn from_str_lossy(&mut self, string: &PyString) -> PyResult<usize> {
    // Convert lossily first, then take ownership of the result so that an
    // owned `String` is produced for the lossy conversion path.
    let lossy = string.to_string_lossy();
    let owned: String = lossy.into_owned();
    // Report the byte length of the converted string.
    Ok(owned.len())
}
}

#[pymodule]
Expand Down
6 changes: 6 additions & 0 deletions examples/rustapi_module/tests/test_buf_and_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,22 @@ def memory_diff(f):

message_b = b'\\(-"-;) Praying that memory leak would not happen..'
message_s = '\\(-"-;) Praying that memory leak would not happen..'
message_surrogate = '\\(-"-;) Praying that memory leak would not happen.. \ud800'

# Zero-argument wrapper that feeds the bytes fixture to extractor.from_bytes,
# so it can be handed to memory_diff as a callable.
def from_bytes():
extractor.from_bytes(message_b)

# Zero-argument wrapper that feeds the plain str fixture to extractor.from_str,
# so it can be handed to memory_diff as a callable.
def from_str():
extractor.from_str(message_s)

# Zero-argument wrapper that feeds the fixture ending in a lone surrogate
# (\ud800) to extractor.from_str_lossy, so it can be handed to memory_diff.
def from_str_lossy():
extractor.from_str_lossy(message_surrogate)

# Running the memory_diff to warm-up the garbage collector
memory_diff(from_bytes)
memory_diff(from_str)
memory_diff(from_str_lossy)

assert memory_diff(from_bytes) == 0
assert memory_diff(from_str) == 0
assert memory_diff(from_str_lossy) == 0
42 changes: 38 additions & 4 deletions src/types/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
use crate::conversion::FromPyObject;
use crate::conversion::{PyTryFrom, ToPyObject};
use crate::err::{PyErr, PyResult};
use crate::gil;
use crate::instance::PyNativeType;
use crate::object::PyObject;
use crate::types::PyAny;
Expand All @@ -11,8 +12,10 @@ use crate::IntoPy;
use crate::Python;
use crate::{ffi, FromPy};
use std::borrow::Cow;
use std::ffi::CStr;
use std::ops::Index;
use std::os::raw::c_char;
use std::ptr::NonNull;
use std::slice::SliceIndex;
use std::str;

Expand Down Expand Up @@ -87,10 +90,29 @@ impl PyString {
/// Unpaired surrogates and invalid UTF-8 sequences are
/// replaced with U+FFFD REPLACEMENT CHARACTER.
pub fn to_string_lossy(&self) -> Cow<str> {
    match self.to_string() {
        // `to_string` succeeded, so nothing needs to be replaced.
        Ok(s) => s,
        // `to_string` failed (e.g. the string holds an unpaired surrogate).
        // Re-encode with the `surrogatepass` error handler so the surrogates
        // are emitted as raw bytes instead of raising, then let
        // `String::from_utf8_lossy` turn those invalid bytes into U+FFFD.
        Err(_) => {
            let encoding = CStr::from_bytes_with_nul(b"utf-8\0")
                .expect("literal is NUL-terminated with no interior NUL");
            let errors = CStr::from_bytes_with_nul(b"surrogatepass\0")
                .expect("literal is NUL-terminated with no interior NUL");
            // SAFETY: `self.0` is a live Python string object, both C strings
            // are NUL-terminated statics, and the bytes object is checked for
            // null before its buffer is read. The buffer is only read while
            // the bytes object is still alive (it is registered for a later
            // DECREF, not released here).
            unsafe {
                let py_bytes = ffi::PyUnicode_AsEncodedString(
                    self.0.as_ptr(),
                    encoding.as_ptr(),
                    errors.as_ptr(),
                );
                // We have a valid PyString and `surrogatepass` lets surrogates
                // through, so encoding is expected to succeed.
                let py_bytes = NonNull::new(py_bytes)
                    .expect("encoding a valid PyString with `surrogatepass` should not fail");
                // ensure DECREF will be called
                gil::register_pointer(py_bytes);
                let buffer = ffi::PyBytes_AsString(py_bytes.as_ptr()) as *const u8;
                debug_assert!(!buffer.is_null());
                let length = ffi::PyBytes_Size(py_bytes.as_ptr()) as usize;
                let bytes = std::slice::from_raw_parts(buffer, length);
                // Always hand back an owned string: the slice above has an
                // unbounded lifetime and points into the temporary bytes
                // object, so the result must never borrow from it.
                Cow::Owned(String::from_utf8_lossy(bytes).into_owned())
            }
        }
    }
}
}

Expand Down Expand Up @@ -308,6 +330,18 @@ mod test {
assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap());
}

#[test]
fn test_to_string_lossy() {
    let guard = Python::acquire_gil();
    let py = guard.python();
    // Build a Python str containing a lone high surrogate (\ud800), which
    // cannot be represented as valid UTF-8.
    let evaluated = py
        .eval(r#"'🐈 Hello \ud800World'"#, None, None)
        .unwrap();
    let object: PyObject = evaluated.into();
    let surrogate_string = <PyString as PyTryFrom>::try_from(object.as_ref(py)).unwrap();
    // The surrogate is encoded as three bytes, each of which is replaced by
    // its own U+FFFD in the lossy result.
    let lossy = surrogate_string.to_string_lossy();
    assert_eq!(lossy, "🐈 Hello ���World");
}

#[test]
fn test_bytes_index() {
let gil = Python::acquire_gil();
Expand Down