Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check iterable max_length as we validate #602

Closed
wants to merge 17 commits into from
Closed
58 changes: 57 additions & 1 deletion benches/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ extern crate test;
use test::{black_box, Bencher};

use pyo3::prelude::*;
use pyo3::types::{PyDict, PyString};
use pyo3::types::{PyDict, PySet, PyString};

use _pydantic_core::SchemaValidator;

Expand Down Expand Up @@ -265,6 +265,7 @@ fn dict_python(bench: &mut Bencher) {
.collect::<Vec<String>>()
.join(", ")
);
dbg!(code.clone());
let input = py.eval(&code, None, None).unwrap();
let input = black_box(input);
bench.iter(|| {
Expand Down Expand Up @@ -696,3 +697,58 @@ class Foo(Enum):
}
})
}

/// Number of integer objects placed in the input `Vec` by each of the `constructing_pyset_*` benchmarks.
const COLLECTION_SIZE: usize = 100_000;

/// Benchmarks building a `PySet` by first staging references to every input object in a
/// `Vec` created with `Vec::new()` (no reserved capacity) and then handing that `Vec`'s
/// iterator to `PySet::new`.
#[bench]
fn constructing_pyset_from_vec_without_capacity(bench: &mut Bencher) {
    Python::with_gil(|py| {
        // Input objects are created once, outside the measured closure.
        let items: Vec<PyObject> = (0..COLLECTION_SIZE).map(|n| n.to_object(py)).collect();

        bench.iter(|| {
            black_box({
                let mut staged = Vec::new();
                // Element-wise pushes are kept so that the Vec's growth is part of the measured work.
                items.iter().for_each(|item| staged.push(item));
                PySet::new(py, staged.iter()).unwrap()
            })
        })
    })
}

/// Benchmarks building a `PySet` by first staging references to every input object in a
/// `Vec` pre-sized with `Vec::with_capacity(COLLECTION_SIZE)` and then handing that `Vec`'s
/// iterator to `PySet::new`.
#[bench]
fn constructing_pyset_from_vec_with_capacity(bench: &mut Bencher) {
    Python::with_gil(|py| {
        // Input objects are created once, outside the measured closure.
        let items: Vec<PyObject> = (0..COLLECTION_SIZE).map(|n| n.to_object(py)).collect();

        bench.iter(|| {
            black_box({
                let mut staged = Vec::with_capacity(COLLECTION_SIZE);
                // Same element-wise fill as the "without capacity" variant; only the allocation differs.
                items.iter().for_each(|item| staged.push(item));
                PySet::new(py, staged.iter()).unwrap()
            })
        })
    })
}

/// Benchmarks building a `PySet` without an intermediate `Vec`: an empty set is created
/// and every input object is inserted with `PySet::add`.
#[bench]
fn constructing_pyset_from_vec_directly(bench: &mut Bencher) {
    Python::with_gil(|py| {
        // Input objects are created once, outside the measured closure.
        let items: Vec<PyObject> = (0..COLLECTION_SIZE).map(|n| n.to_object(py)).collect();

        bench.iter(|| {
            black_box({
                // Start from an empty set (built from an empty Vec) and fill it in place.
                let set = PySet::new(py, &Vec::<i64>::new()).unwrap();
                items.iter().for_each(|item| set.add(item).unwrap());
                set
            })
        })
    })
}
14 changes: 0 additions & 14 deletions pydantic_core/core_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -1188,7 +1188,6 @@ class ListSchema(TypedDict, total=False):
min_length: int
max_length: int
strict: bool
allow_any_iter: bool
ref: str
metadata: Any
serialization: IncExSeqOrElseSerSchema
Expand All @@ -1200,7 +1199,6 @@ def list_schema(
min_length: int | None = None,
max_length: int | None = None,
strict: bool | None = None,
allow_any_iter: bool | None = None,
ref: str | None = None,
metadata: Any = None,
serialization: IncExSeqOrElseSerSchema | None = None,
Expand All @@ -1221,7 +1219,6 @@ def list_schema(
min_length: The value must be a list with at least this many items
max_length: The value must be a list with at most this many items
strict: The value must be a list with exactly this many items
allow_any_iter: Whether the value can be any iterable
ref: optional unique identifier of the schema, used to reference the schema in other places
metadata: Any other information you want to include with the schema, not used by pydantic-core
serialization: Custom serialization schema
Expand All @@ -1232,7 +1229,6 @@ def list_schema(
min_length=min_length,
max_length=max_length,
strict=strict,
allow_any_iter=allow_any_iter,
ref=ref,
metadata=metadata,
serialization=serialization,
Expand Down Expand Up @@ -1353,7 +1349,6 @@ class SetSchema(TypedDict, total=False):
items_schema: CoreSchema
min_length: int
max_length: int
generator_max_length: int
strict: bool
ref: str
metadata: Any
Expand All @@ -1365,7 +1360,6 @@ def set_schema(
*,
min_length: int | None = None,
max_length: int | None = None,
generator_max_length: int | None = None,
strict: bool | None = None,
ref: str | None = None,
metadata: Any = None,
Expand All @@ -1388,9 +1382,6 @@ def set_schema(
items_schema: The value must be a set with items that match this schema
min_length: The value must be a set with at least this many items
max_length: The value must be a set with at most this many items
generator_max_length: At most this many items will be read from a generator before failing validation
This is important because generators can be infinite, and even with a `max_length` on the set,
an infinite generator could run forever without producing more than `max_length` distinct items.
strict: The value must be a set with exactly this many items
ref: optional unique identifier of the schema, used to reference the schema in other places
metadata: Any other information you want to include with the schema, not used by pydantic-core
Expand All @@ -1401,7 +1392,6 @@ def set_schema(
items_schema=items_schema,
min_length=min_length,
max_length=max_length,
generator_max_length=generator_max_length,
strict=strict,
ref=ref,
metadata=metadata,
Expand All @@ -1414,7 +1404,6 @@ class FrozenSetSchema(TypedDict, total=False):
items_schema: CoreSchema
min_length: int
max_length: int
generator_max_length: int
strict: bool
ref: str
metadata: Any
Expand All @@ -1426,7 +1415,6 @@ def frozenset_schema(
*,
min_length: int | None = None,
max_length: int | None = None,
generator_max_length: int | None = None,
strict: bool | None = None,
ref: str | None = None,
metadata: Any = None,
Expand All @@ -1449,7 +1437,6 @@ def frozenset_schema(
items_schema: The value must be a frozenset with items that match this schema
min_length: The value must be a frozenset with at least this many items
max_length: The value must be a frozenset with at most this many items
generator_max_length: The value must generate a frozenset with at most this many items
strict: The value must be a frozenset with exactly this many items
ref: optional unique identifier of the schema, used to reference the schema in other places
metadata: Any other information you want to include with the schema, not used by pydantic-core
Expand All @@ -1460,7 +1447,6 @@ def frozenset_schema(
items_schema=items_schema,
min_length=min_length,
max_length=max_length,
generator_max_length=generator_max_length,
strict=strict,
ref=ref,
metadata=metadata,
Expand Down
210 changes: 210 additions & 0 deletions src/input/generic_iterable.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
use crate::errors::{py_err_string, ErrorType, ValError, ValResult};

use super::parse_json::{JsonInput, JsonObject};
use pyo3::{
exceptions::PyTypeError,
types::{
PyByteArray, PyBytes, PyDict, PyFrozenSet, PyIterator, PyList, PyMapping, PySequence, PySet, PyString, PyTuple,
},
PyAny, PyErr, PyResult, Python, ToPyObject,
};

/// A borrowed handle to one of the iterable input kinds this module knows how to walk.
///
/// Each variant wraps a reference to a concrete pyo3 type (or to JSON data), so that
/// `len`, `into_sequence_iterator` and `into_mapping_items_iterator` can dispatch on
/// the kind of input.
#[derive(Debug)]
pub enum GenericIterable<'a> {
/// A Python `list`.
List(&'a PyList),
/// A Python `tuple`.
Tuple(&'a PyTuple),
/// A Python `set`.
Set(&'a PySet),
/// A Python `frozenset`.
FrozenSet(&'a PyFrozenSet),
/// A Python `dict`.
Dict(&'a PyDict),
// Treat dict values / keys / items as generic iterators
// since PyPy doesn't export the concrete types
DictKeys(&'a PyIterator),
DictValues(&'a PyIterator),
DictItems(&'a PyIterator),
/// An object held as a `PyMapping`.
Mapping(&'a PyMapping),
/// A Python `str`.
String(&'a PyString),
/// A Python `bytes`.
Bytes(&'a PyBytes),
/// A Python `bytearray`.
PyByteArray(&'a PyByteArray),
/// An object held as a `PySequence`.
Sequence(&'a PySequence),
/// An object held as a `PyIterator`.
Iterator(&'a PyIterator),
/// A borrowed slice of `JsonInput` values.
JsonArray(&'a [JsonInput]),
/// A borrowed `JsonObject`.
JsonObject(&'a JsonObject),
}

/// A `(key, value)` pair of borrowed Python objects, the item type yielded by
/// `GenericIterable::into_mapping_items_iterator`.
type PyMappingItems<'a> = (&'a PyAny, &'a PyAny);

/// Converts one iteration result into a `(key, value)` pair.
///
/// An `Err` is passed through untouched. An `Ok` object is extracted as a 2-tuple of
/// Python objects; if that extraction fails, the extraction error is returned instead.
#[inline(always)]
fn extract_items(item: PyResult<&PyAny>) -> PyResult<PyMappingItems<'_>> {
    item.and_then(|obj| obj.extract::<PyMappingItems>())
}

#[inline(always)]
fn map_err<'data>(py: Python<'data>, err: PyErr, input: &'data PyAny) -> ValError<'data> {
ValError::new(
ErrorType::IterationError {
error: py_err_string(py, err),
},
input,
)
}

impl<'a, 'py: 'a> GenericIterable<'a> {
pub fn len(&self) -> Option<usize> {
match &self {
GenericIterable::List(iter) => Some(iter.len()),
GenericIterable::Tuple(iter) => Some(iter.len()),
GenericIterable::Set(iter) => Some(iter.len()),
GenericIterable::FrozenSet(iter) => Some(iter.len()),
GenericIterable::Dict(iter) => Some(iter.len()),
GenericIterable::DictKeys(iter) => iter.len().ok(),
GenericIterable::DictValues(iter) => iter.len().ok(),
GenericIterable::DictItems(iter) => iter.len().ok(),
GenericIterable::Mapping(iter) => iter.len().ok(),
GenericIterable::String(iter) => iter.len().ok(),
GenericIterable::Bytes(iter) => iter.len().ok(),
GenericIterable::PyByteArray(iter) => Some(iter.len()),
GenericIterable::Sequence(iter) => iter.len().ok(),
GenericIterable::Iterator(iter) => iter.len().ok(),
GenericIterable::JsonArray(iter) => Some(iter.len()),
GenericIterable::JsonObject(iter) => Some(iter.len()),
}
}
pub fn into_sequence_iterator(
self,
py: Python<'py>,
) -> PyResult<Box<dyn Iterator<Item = ValResult<'a, &'a PyAny>> + 'a>> {
match self {
GenericIterable::List(iter) => Ok(Box::new(iter.iter().map(Ok))),
GenericIterable::Tuple(iter) => Ok(Box::new(iter.iter().map(Ok))),
GenericIterable::Set(iter) => Ok(Box::new(iter.iter().map(Ok))),
GenericIterable::FrozenSet(iter) => Ok(Box::new(iter.iter().map(Ok))),
// Note that this iterates over only the keys, just like doing iter({}) in Python
GenericIterable::Dict(iter) => Ok(Box::new(iter.iter().map(|(k, _)| Ok(k)))),
GenericIterable::DictKeys(iter) => Ok(Box::new(
iter.map(move |r| r.map_err(|e| map_err(py, e, iter.as_ref()))),
)),
GenericIterable::DictValues(iter) => Ok(Box::new(
iter.map(move |r| r.map_err(|e| map_err(py, e, iter.as_ref()))),
)),
GenericIterable::DictItems(iter) => Ok(Box::new(
iter.map(move |r| r.map_err(|e| map_err(py, e, iter.as_ref()))),
)),
// Note that this iterates over only the keys, just like doing iter({}) in Python
GenericIterable::Mapping(iter) => Ok(Box::new(
iter.keys()?
.iter()?
.map(move |r| r.map_err(|e| map_err(py, e, iter.as_ref()))),
)),
GenericIterable::String(iter) => Ok(Box::new(
iter.iter()?.map(move |r| r.map_err(|e| map_err(py, e, iter.as_ref()))),
)),
GenericIterable::Bytes(iter) => Ok(Box::new(
iter.iter()?.map(move |r| r.map_err(|e| map_err(py, e, iter.as_ref()))),
)),
GenericIterable::PyByteArray(iter) => Ok(Box::new(
iter.iter()?.map(move |r| r.map_err(|e| map_err(py, e, iter.as_ref()))),
)),
GenericIterable::Sequence(iter) => Ok(Box::new(
iter.iter()?.map(move |r| r.map_err(|e| map_err(py, e, iter.as_ref()))),
)),
GenericIterable::Iterator(iter) => Ok(Box::new(
iter.map(move |r| r.map_err(|e| map_err(py, e, iter.as_ref()))),
)),
GenericIterable::JsonArray(iter) => Ok(Box::new(iter.iter().map(move |v| {
let v = v.to_object(py);
Ok(v.into_ref(py))
}))),
// Note that this iterates over only the keys, just like doing iter({}) in Python, just for consistency
GenericIterable::JsonObject(iter) => Ok(Box::new(
iter.iter().map(move |(k, _)| Ok(k.to_object(py).into_ref(py))),
)),
}
}

pub fn into_mapping_items_iterator(
self,
py: Python<'a>,
) -> PyResult<Box<dyn Iterator<Item = ValResult<'a, PyMappingItems<'a>>> + 'a>> {
match self {
GenericIterable::List(iter) => {
Ok(Box::new(iter.iter().map(move |v| {
extract_items(Ok(v)).map_err(|e| map_err(py, e, iter.as_ref()))
})))
}
GenericIterable::Tuple(iter) => {
Ok(Box::new(iter.iter().map(move |v| {
extract_items(Ok(v)).map_err(|e| map_err(py, e, iter.as_ref()))
})))
}
GenericIterable::Set(iter) => {
Ok(Box::new(iter.iter().map(move |v| {
extract_items(Ok(v)).map_err(|e| map_err(py, e, iter.as_ref()))
})))
}
GenericIterable::FrozenSet(iter) => {
Ok(Box::new(iter.iter().map(move |v| {
extract_items(Ok(v)).map_err(|e| map_err(py, e, iter.as_ref()))
})))
}
// Note that we iterate over (key, value), unlike doing iter({}) in Python
GenericIterable::Dict(iter) => Ok(Box::new(iter.iter().map(Ok))),
// Keys or values can be tuples
GenericIterable::DictKeys(iter) => Ok(Box::new(
iter.map(extract_items)
.map(move |r| r.map_err(|e| map_err(py, e, iter.as_ref()))),
)),
GenericIterable::DictValues(iter) => Ok(Box::new(
iter.map(extract_items)
.map(move |r| r.map_err(|e| map_err(py, e, iter.as_ref()))),
)),
GenericIterable::DictItems(iter) => Ok(Box::new(
iter.map(extract_items)
.map(move |r| r.map_err(|e| map_err(py, e, iter.as_ref()))),
)),
// Note that we iterate over (key, value), unlike doing iter({}) in Python
GenericIterable::Mapping(iter) => Ok(Box::new(
iter.items()?
.iter()?
.map(extract_items)
.map(move |r| r.map_err(|e| map_err(py, e, iter.as_ref()))),
)),
// In Python if you do dict("foobar") you get "dictionary update sequence element #0 has length 1; 2 is required"
// This is similar but arguably a better error message
GenericIterable::String(_) => Err(PyTypeError::new_err(
"Expected an iterable of (key, value) pairs, got a string",
)),
GenericIterable::Bytes(_) => Err(PyTypeError::new_err(
"Expected an iterable of (key, value) pairs, got a bytes",
)),
GenericIterable::PyByteArray(_) => Err(PyTypeError::new_err(
"Expected an iterable of (key, value) pairs, got a bytearray",
)),
// Obviously these may be things that are not convertible to a tuple of (Hashable, Any)
// Python fails with a similar error message to above, ours will be slightly different (PyO3 will fail to extract) but similar enough
GenericIterable::Sequence(iter) => Ok(Box::new(
iter.iter()?
.map(extract_items)
.map(move |r| r.map_err(|e| map_err(py, e, iter.as_ref()))),
)),
GenericIterable::Iterator(iter) => Ok(Box::new(
iter.iter()?
.map(extract_items)
.map(move |r| r.map_err(|e| map_err(py, e, iter.as_ref()))),
)),
GenericIterable::JsonArray(iter) => Ok(Box::new(
iter.iter()
.map(move |v| extract_items(Ok(v.to_object(py).into_ref(py))))
.map(move |r| r.map_err(|e| map_err(py, e, iter.to_object(py).into_ref(py)))),
)),
// Note that we iterate over (key, value), unlike doing iter({}) in Python
GenericIterable::JsonObject(iter) => Ok(Box::new(iter.iter().map(move |(k, v)| {
let k = PyString::new(py, k).as_ref();
let v = v.to_object(py).into_ref(py);
Ok((k, v))
}))),
}
}
}
Loading