Skip to content

Commit 0ce656c

Browse files
committed
Add ensure_ascii option
1 parent 9a25aa6 commit 0ce656c

File tree

6 files changed

+144
-12
lines changed

6 files changed

+144
-12
lines changed

Diff for: python/pydantic_core/_pydantic_core.pyi

+6
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,7 @@ class SchemaSerializer:
345345
value: Any,
346346
*,
347347
indent: int | None = None,
348+
ensure_ascii: bool = False,
348349
include: _IncEx | None = None,
349350
exclude: _IncEx | None = None,
350351
by_alias: bool | None = None,
@@ -363,6 +364,8 @@ class SchemaSerializer:
363364
Arguments:
364365
value: The Python object to serialize.
365366
indent: If `None`, the JSON will be compact, otherwise it will be pretty-printed with the indent provided.
367+
ensure_ascii: If `True`, the output is guaranteed to have all incoming non-ASCII characters escaped.
368+
If `False` (the default), these characters will be outputted as-is.
366369
include: A set of fields to include, if `None` all fields are included.
367370
exclude: A set of fields to exclude, if `None` no fields are excluded.
368371
by_alias: Whether to use the alias names of fields.
@@ -390,6 +393,7 @@ def to_json(
390393
value: Any,
391394
*,
392395
indent: int | None = None,
396+
ensure_ascii: bool = False,
393397
include: _IncEx | None = None,
394398
exclude: _IncEx | None = None,
395399
# Note: In Pydantic 2.11, the default value of `by_alias` on `SchemaSerializer` was changed from `True` to `None`,
@@ -414,6 +418,8 @@ def to_json(
414418
Arguments:
415419
value: The Python object to serialize.
416420
indent: If `None`, the JSON will be compact, otherwise it will be pretty-printed with the indent provided.
421+
ensure_ascii: If `True`, the output is guaranteed to have all incoming non-ASCII characters escaped.
422+
If `False` (the default), these characters will be outputted as-is.
417423
include: A set of fields to include, if `None` all fields are included.
418424
exclude: A set of fields to exclude, if `None` no fields are excluded.
419425
by_alias: Whether to use the alias names of fields.

Diff for: src/serializers/mod.rs

+16-4
Original file line numberDiff line numberDiff line change
@@ -155,14 +155,15 @@ impl SchemaSerializer {
155155
}
156156

157157
#[allow(clippy::too_many_arguments)]
158-
#[pyo3(signature = (value, *, indent = None, include = None, exclude = None, by_alias = None,
158+
#[pyo3(signature = (value, *, indent = None, ensure_ascii = false, include = None, exclude = None, by_alias = None,
159159
exclude_unset = false, exclude_defaults = false, exclude_none = false, round_trip = false, warnings = WarningsArg::Bool(true),
160160
fallback = None, serialize_as_any = false, context = None))]
161161
pub fn to_json(
162162
&self,
163163
py: Python,
164164
value: &Bound<'_, PyAny>,
165165
indent: Option<usize>,
166+
ensure_ascii: Option<bool>,
166167
include: Option<&Bound<'_, PyAny>>,
167168
exclude: Option<&Bound<'_, PyAny>>,
168169
by_alias: Option<bool>,
@@ -204,6 +205,7 @@ impl SchemaSerializer {
204205
exclude,
205206
&extra,
206207
indent,
208+
ensure_ascii.unwrap_or(false),
207209
self.expected_json_size.load(Ordering::Relaxed),
208210
)?;
209211

@@ -239,14 +241,15 @@ impl SchemaSerializer {
239241

240242
#[allow(clippy::too_many_arguments)]
241243
#[pyfunction]
242-
#[pyo3(signature = (value, *, indent = None, include = None, exclude = None, by_alias = true,
243-
exclude_none = false, round_trip = false, timedelta_mode = "iso8601", bytes_mode = "utf8",
244+
#[pyo3(signature = (value, *, indent = None, ensure_ascii = false, include = None, exclude = None,
245+
by_alias = true, exclude_none = false, round_trip = false, timedelta_mode = "iso8601", bytes_mode = "utf8",
244246
inf_nan_mode = "constants", serialize_unknown = false, fallback = None, serialize_as_any = false,
245247
context = None))]
246248
pub fn to_json(
247249
py: Python,
248250
value: &Bound<'_, PyAny>,
249251
indent: Option<usize>,
252+
ensure_ascii: Option<bool>,
250253
include: Option<&Bound<'_, PyAny>>,
251254
exclude: Option<&Bound<'_, PyAny>>,
252255
by_alias: bool,
@@ -274,7 +277,16 @@ pub fn to_json(
274277
context,
275278
);
276279
let serializer = type_serializers::any::AnySerializer.into();
277-
let bytes = to_json_bytes(value, &serializer, include, exclude, &extra, indent, 1024)?;
280+
let bytes = to_json_bytes(
281+
value,
282+
&serializer,
283+
include,
284+
exclude,
285+
&extra,
286+
indent,
287+
ensure_ascii.unwrap_or(false),
288+
1024,
289+
)?;
278290
state.final_check(py)?;
279291
let py_bytes = PyBytes::new(py, &bytes);
280292
Ok(py_bytes.into())

Diff for: src/serializers/shared.rs

+84-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use std::borrow::Cow;
22
use std::fmt::Debug;
3+
use std::io::{self, Write};
34

45
use pyo3::exceptions::PyTypeError;
56
use pyo3::prelude::*;
@@ -9,7 +10,7 @@ use pyo3::{intern, PyTraverseError, PyVisit};
910

1011
use enum_dispatch::enum_dispatch;
1112
use serde::Serialize;
12-
use serde_json::ser::PrettyFormatter;
13+
use serde_json::ser::{Formatter, PrettyFormatter};
1314

1415
use crate::build_tools::py_schema_err;
1516
use crate::build_tools::py_schema_error_type;
@@ -349,6 +350,70 @@ impl Serialize for PydanticSerializer<'_> {
349350
}
350351
}
351352

353+
struct EscapeNonAsciiFormatter;
354+
355+
impl Formatter for EscapeNonAsciiFormatter {
356+
fn write_string_fragment<W: ?Sized + Write>(&mut self, writer: &mut W, fragment: &str) -> io::Result<()> {
357+
for ch in fragment.chars() {
358+
if ch.is_ascii() {
359+
writer.write_all(ch.encode_utf8(&mut [0; 4]).as_bytes())?;
360+
} else {
361+
for escape in ch.encode_utf16(&mut [0; 2]) {
362+
write!(writer, "\\u{escape:04x}")?;
363+
}
364+
}
365+
}
366+
Ok(())
367+
}
368+
}
369+
370+
struct EscapeNonAsciiPrettyFormatter<'a> {
371+
pretty: PrettyFormatter<'a>,
372+
escape_non_ascii: EscapeNonAsciiFormatter,
373+
}
374+
375+
impl<'a> EscapeNonAsciiPrettyFormatter<'a> {
376+
pub fn with_indent(indent: &'a [u8]) -> Self {
377+
Self {
378+
pretty: PrettyFormatter::with_indent(indent),
379+
escape_non_ascii: EscapeNonAsciiFormatter,
380+
}
381+
}
382+
}
383+
384+
// Generates a method named `$method` that forwards the call to the same-named
// method on the field `self.$field`, returning its `io::Result<()>`.
macro_rules! defer {
    // Method that takes only the writer.
    ($field:ident, $method:ident) => {
        fn $method<W: ?Sized + io::Write>(&mut self, writer: &mut W) -> io::Result<()> {
            self.$field.$method(writer)
        }
    };
    // Method that takes the writer plus one extra argument of type `$arg`.
    ($field:ident, $method:ident, $arg:ty) => {
        fn $method<W: ?Sized + io::Write>(&mut self, writer: &mut W, value: $arg) -> io::Result<()> {
            self.$field.$method(writer, value)
        }
    };
}
402+
403+
impl<'a> Formatter for EscapeNonAsciiPrettyFormatter<'a> {
404+
defer!(escape_non_ascii, write_string_fragment, &str);
405+
defer!(pretty, begin_array);
406+
defer!(pretty, end_array);
407+
defer!(pretty, begin_array_value, bool);
408+
defer!(pretty, end_array_value);
409+
defer!(pretty, begin_object);
410+
defer!(pretty, end_object);
411+
defer!(pretty, begin_object_key, bool);
412+
defer!(pretty, end_object_key);
413+
defer!(pretty, begin_object_value);
414+
defer!(pretty, end_object_value);
415+
}
416+
352417
#[allow(clippy::too_many_arguments)]
353418
pub(crate) fn to_json_bytes(
354419
value: &Bound<'_, PyAny>,
@@ -357,25 +422,40 @@ pub(crate) fn to_json_bytes(
357422
exclude: Option<&Bound<'_, PyAny>>,
358423
extra: &Extra,
359424
indent: Option<usize>,
425+
ensure_ascii: bool,
360426
expected_json_size: usize,
361427
) -> PyResult<Vec<u8>> {
362428
let serializer = PydanticSerializer::new(value, serializer, include, exclude, extra);
363429

364430
let writer: Vec<u8> = Vec::with_capacity(expected_json_size);
365-
let bytes = match indent {
366-
Some(indent) => {
431+
432+
let bytes = match (indent, ensure_ascii) {
433+
(Some(indent), true) => {
434+
let indent = vec![b' '; indent];
435+
let formatter = EscapeNonAsciiPrettyFormatter::with_indent(&indent);
436+
let mut ser = PythonSerializer::with_formatter(writer, formatter);
437+
serializer.serialize(&mut ser).map_err(se_err_py_err)?;
438+
ser.into_inner()
439+
}
440+
(Some(indent), false) => {
367441
let indent = vec![b' '; indent];
368442
let formatter = PrettyFormatter::with_indent(&indent);
369443
let mut ser = PythonSerializer::with_formatter(writer, formatter);
370444
serializer.serialize(&mut ser).map_err(se_err_py_err)?;
371445
ser.into_inner()
372446
}
373-
None => {
447+
(None, true) => {
448+
let mut ser = PythonSerializer::with_formatter(writer, EscapeNonAsciiFormatter);
449+
serializer.serialize(&mut ser).map_err(se_err_py_err)?;
450+
ser.into_inner()
451+
}
452+
(None, false) => {
374453
let mut ser = PythonSerializer::new(writer);
375454
serializer.serialize(&mut ser).map_err(se_err_py_err)?;
376455
ser.into_inner()
377456
}
378457
};
458+
379459
Ok(bytes)
380460
}
381461

Diff for: src/serializers/type_serializers/json.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ impl TypeSerializer for JsonSerializer {
5454
extra: &Extra,
5555
) -> PyResult<PyObject> {
5656
if extra.round_trip {
57-
let bytes = to_json_bytes(value, &self.serializer, include, exclude, extra, None, 0)?;
57+
let bytes = to_json_bytes(value, &self.serializer, include, exclude, extra, None, false, 0)?;
5858
let py = value.py();
5959
let s = from_utf8(&bytes).map_err(|e| utf8_py_error(py, e, &bytes))?;
6060
Ok(PyString::new(py, s).into())
@@ -65,7 +65,7 @@ impl TypeSerializer for JsonSerializer {
6565

6666
fn json_key<'a>(&self, key: &'a Bound<'_, PyAny>, extra: &Extra) -> PyResult<Cow<'a, str>> {
6767
if extra.round_trip {
68-
let bytes = to_json_bytes(key, &self.serializer, None, None, extra, None, 0)?;
68+
let bytes = to_json_bytes(key, &self.serializer, None, None, extra, None, false, 0)?;
6969
let py = key.py();
7070
let s = from_utf8(&bytes).map_err(|e| utf8_py_error(py, e, &bytes))?;
7171
Ok(Cow::Owned(s.to_string()))
@@ -83,8 +83,8 @@ impl TypeSerializer for JsonSerializer {
8383
extra: &Extra,
8484
) -> Result<S::Ok, S::Error> {
8585
if extra.round_trip {
86-
let bytes =
87-
to_json_bytes(value, &self.serializer, include, exclude, extra, None, 0).map_err(py_err_se_err)?;
86+
let bytes = to_json_bytes(value, &self.serializer, include, exclude, extra, None, false, 0)
87+
.map_err(py_err_se_err)?;
8888
match from_utf8(&bytes) {
8989
Ok(s) => serializer.serialize_str(s),
9090
Err(e) => Err(Error::custom(e.to_string())),

Diff for: tests/serializers/test_string.py

+32
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,38 @@ def test_str():
2323
assert json.loads(json_emoji) == 'emoji 💩'
2424

2525

26+
# Tests borrowed from:
27+
# - https://github.com/python/cpython/blob/d87e7f35/Lib/test/test_json/test_encode_basestring_ascii.py
28+
# - https://github.com/python/cpython/blob/d87e7f35/Lib/test/test_json/test_unicode.py
29+
@pytest.mark.parametrize(
    ['input', 'expected'],
    [
        # Mixed: JSON-special characters, BMP non-ASCII, control characters and punctuation.
        (
            '/\\"\ucafe\ubabe\uab98\ufcde\ubcda\uef4a\x08\x0c\n\r\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?',
            '"/\\\\\\"\\ucafe\\ubabe\\uab98\\ufcde\\ubcda\\uef4a\\b\\f\\n\\r\\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?"',
        ),
        # Non-ASCII characters inside the Basic Multilingual Plane: one escape each.
        ('\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'),
        ('\u03b1\u03a9', '"\\u03b1\\u03a9"'),
        # Character outside the BMP: escaped as a UTF-16 surrogate pair.
        ('\U0001d120', '"\\ud834\\udd20"'),
        # Pure-ASCII inputs: only the regular JSON escapes apply.
        ('controls', '"controls"'),
        ('\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'),
        (
            '{"object with 1 member":["array with 1 element"]}',
            '"{\\"object with 1 member\\":[\\"array with 1 element\\"]}"',
        ),
        (' s p a c e d ', '" s p a c e d "'),
        ("`1~!@#$%^&*()_+-={':[,]}|;.</>?", '"`1~!@#$%^&*()_+-={\':[,]}|;.</>?"'),
    ],
)
def test_str_ensure_ascii(input: str, expected: str) -> None:
    """`to_json(..., ensure_ascii=True)` escapes every non-ASCII character as `\\uXXXX`.

    Each case pairs a Python string with the exact JSON text expected for it. The
    cases that repeated an earlier input/expected pair by value have been dropped,
    since they exercised the same assertion twice.
    """
    v = SchemaSerializer(core_schema.str_schema())
    output = v.to_json(input, ensure_ascii=True).decode('utf-8')
    assert output == expected
    # Independently of the hand-written literal, the escaped text must decode
    # back to the original string.
    assert json.loads(output) == input
56+
57+
2658
def test_huge_str():
2759
v = SchemaSerializer(core_schema.int_schema())
2860
msg = r"Expected `int` - serialized value may not be as expected \[input_value='123456789012345678901234...89012345678901234567890', input_type=str\]"

Diff for: tests/test_json.py

+2
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,8 @@ def test_to_json():
218218
assert to_json([1, 2]) == b'[1,2]'
219219
assert to_json([1, 2], indent=2) == b'[\n 1,\n 2\n]'
220220
assert to_json([1, b'x']) == b'[1,"x"]'
221+
assert to_json(['à', 'é']).decode('utf-8') == '["à","é"]'
222+
assert to_json(['à', 'é'], indent=2).decode('utf-8') == '[\n "à",\n "é"\n]'
221223

222224
# kwargs required
223225
with pytest.raises(TypeError, match=r'to_json\(\) takes 1 positional arguments but 2 were given'):

0 commit comments

Comments
 (0)