Skip to content

Commit

Permalink
Merge pull request #642 from Alexander-N/to_string_lossy
Browse files Browse the repository at this point in the history
Fix handling of invalid utf-8 sequences in PyString::to_string_lossy
  • Loading branch information
kngwyu authored Oct 25, 2019
2 parents 34843d7 + defa430 commit 30485f3
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 4 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

* FFI compatibility for PEP 590 Vectorcall.

### Fixed

* Fix handling of invalid utf-8 sequences in `PyString::as_bytes` [#639](https://github.com/PyO3/pyo3/pull/639)
and `PyString::to_string_lossy` [#642](https://github.com/PyO3/pyo3/pull/642).

## [0.8.1]

### Added
Expand Down
5 changes: 5 additions & 0 deletions examples/rustapi_module/src/buf_and_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ impl BytesExtractor {
let rust_string: String = string.extract().unwrap();
Ok(rust_string.len())
}

pub fn from_str_lossy(&mut self, string: &PyString) -> PyResult<usize> {
let rust_string_lossy: String = string.to_string_lossy().to_string();
Ok(rust_string_lossy.len())
}
}

#[pymodule]
Expand Down
6 changes: 6 additions & 0 deletions examples/rustapi_module/tests/test_buf_and_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,22 @@ def memory_diff(f):

message_b = b'\\(-"-;) Praying that memory leak would not happen..'
message_s = '\\(-"-;) Praying that memory leak would not happen..'
message_surrogate = '\\(-"-;) Praying that memory leak would not happen.. \ud800'

def from_bytes():
extractor.from_bytes(message_b)

def from_str():
extractor.from_str(message_s)

def from_str_lossy():
extractor.from_str_lossy(message_surrogate)

# Running the memory_diff to warm-up the garbage collector
memory_diff(from_bytes)
memory_diff(from_str)
memory_diff(from_str_lossy)

assert memory_diff(from_bytes) == 0
assert memory_diff(from_str) == 0
assert memory_diff(from_str_lossy) == 0
42 changes: 38 additions & 4 deletions src/types/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
use crate::conversion::FromPyObject;
use crate::conversion::{PyTryFrom, ToPyObject};
use crate::err::{PyErr, PyResult};
use crate::gil;
use crate::instance::PyNativeType;
use crate::object::PyObject;
use crate::types::PyAny;
Expand All @@ -11,8 +12,10 @@ use crate::IntoPy;
use crate::Python;
use crate::{ffi, FromPy};
use std::borrow::Cow;
use std::ffi::CStr;
use std::ops::Index;
use std::os::raw::c_char;
use std::ptr::NonNull;
use std::slice::SliceIndex;
use std::str;

Expand Down Expand Up @@ -87,10 +90,29 @@ impl PyString {
/// Unpaired surrogates invalid UTF-8 sequences are
/// replaced with U+FFFD REPLACEMENT CHARACTER.
pub fn to_string_lossy(&self) -> Cow<str> {
// TODO: Handle error of `as_bytes`
// see https://github.com/PyO3/pyo3/pull/634
let bytes = self.as_bytes().unwrap();
String::from_utf8_lossy(bytes)
match self.to_string() {
Ok(s) => s,
Err(_) => {
unsafe {
let py_bytes = ffi::PyUnicode_AsEncodedString(
self.0.as_ptr(),
CStr::from_bytes_with_nul(b"utf-8\0").unwrap().as_ptr(),
CStr::from_bytes_with_nul(b"surrogatepass\0")
.unwrap()
.as_ptr(),
);
// Since we have a valid PyString and replace any surrogates, assume success.
debug_assert!(!py_bytes.is_null());
// ensure DECREF will be called
gil::register_pointer(NonNull::new(py_bytes).unwrap());
let buffer = ffi::PyBytes_AsString(py_bytes) as *const u8;
debug_assert!(!buffer.is_null());
let length = ffi::PyBytes_Size(py_bytes) as usize;
let bytes = std::slice::from_raw_parts(buffer, length);
String::from_utf8_lossy(bytes)
}
}
}
}
}

Expand Down Expand Up @@ -308,6 +330,18 @@ mod test {
assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap());
}

#[test]
fn test_to_string_lossy() {
let gil = Python::acquire_gil();
let py = gil.python();
let obj: PyObject = py
.eval(r#"'🐈 Hello \ud800World'"#, None, None)
.unwrap()
.into();
let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
assert_eq!(py_string.to_string_lossy(), "🐈 Hello ���World");
}

#[test]
fn test_bytes_index() {
let gil = Python::acquire_gil();
Expand Down

0 comments on commit 30485f3

Please sign in to comment.