diff --git a/Cargo.toml b/Cargo.toml
index e4f1780d2914..a80fb76e695f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,6 +32,8 @@ members = [
     "arrow-integration-testing",
     "arrow-ipc",
     "arrow-json",
+    "arrow-memory-size",
+    "arrow-memory-size-derive",
     "arrow-ord",
     "arrow-pyarrow",
     "arrow-row",
@@ -94,6 +96,8 @@ arrow-csv = { version = "57.2.0", path = "./arrow-csv" }
 arrow-data = { version = "57.2.0", path = "./arrow-data" }
 arrow-ipc = { version = "57.2.0", path = "./arrow-ipc" }
 arrow-json = { version = "57.2.0", path = "./arrow-json" }
+arrow-memory-size = { version = "57.2.0", path = "./arrow-memory-size" }
+arrow-memory-size-derive = { version = "57.2.0", path = "./arrow-memory-size-derive" }
 arrow-ord = { version = "57.2.0", path = "./arrow-ord" }
 arrow-pyarrow = { version = "57.2.0", path = "./arrow-pyarrow" }
 arrow-row = { version = "57.2.0", path = "./arrow-row" }
diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml
index 0a1d6fde916c..077a3553a830 100644
--- a/arrow-array/Cargo.toml
+++ b/arrow-array/Cargo.toml
@@ -40,6 +40,7 @@ ahash = { version = "0.8", default-features = false, features = ["runtime-rng"]
 
 [dependencies]
 arrow-buffer = { workspace = true }
+arrow-memory-size = { workspace = true }
 arrow-schema = { workspace = true }
 arrow-data = { workspace = true }
 chrono = { workspace = true }
diff --git a/arrow-array/src/heap_size.rs b/arrow-array/src/heap_size.rs
new file mode 100644
index 000000000000..cdbea2233176
--- /dev/null
+++ b/arrow-array/src/heap_size.rs
@@ -0,0 +1,143 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`HeapSize`] implementations for arrow-array types
+
+use arrow_memory_size::HeapSize;
+
+use crate::Array;
+use crate::types::{ArrowDictionaryKeyType, ArrowPrimitiveType, RunEndIndexType};
+use crate::{
+    BinaryArray, BinaryViewArray, BooleanArray, DictionaryArray, FixedSizeBinaryArray,
+    FixedSizeListArray, LargeBinaryArray, LargeListArray, LargeListViewArray, LargeStringArray,
+    ListArray, ListViewArray, MapArray, NullArray, PrimitiveArray, RunArray, StringArray,
+    StringViewArray, StructArray, UnionArray,
+};
+
+// Note: A blanket implementation `impl<T: Array> HeapSize for T` would be ideal,
+// but is not possible due to Rust's orphan rules (E0210) since HeapSize is defined
+// in a separate crate.
+//
+// Note: HeapSize cannot be implemented for ArrayRef (Arc<dyn Array>) here due to
+// Rust's orphan rules. Use array.get_buffer_memory_size() directly instead.
+
+/// Implements HeapSize for array types that delegate to get_buffer_memory_size()
+macro_rules! impl_heap_size {
+    ($($ty:ty),*) => {
+        $(
+            impl HeapSize for $ty {
+                fn heap_size(&self) -> usize {
+                    self.get_buffer_memory_size()
+                }
+            }
+        )*
+    };
+}
+
+impl_heap_size!(
+    BooleanArray,
+    NullArray,
+    StringArray,
+    LargeStringArray,
+    BinaryArray,
+    LargeBinaryArray,
+    StringViewArray,
+    BinaryViewArray,
+    FixedSizeBinaryArray,
+    ListArray,
+    LargeListArray,
+    ListViewArray,
+    LargeListViewArray,
+    FixedSizeListArray,
+    StructArray,
+    MapArray,
+    UnionArray
+);
+
+// The generic array types cannot go through `impl_heap_size!` because each
+// needs its own type parameter and bound.
+impl<T: ArrowPrimitiveType> HeapSize for PrimitiveArray<T> {
+    fn heap_size(&self) -> usize {
+        self.get_buffer_memory_size()
+    }
+}
+
+impl<K: ArrowDictionaryKeyType> HeapSize for DictionaryArray<K> {
+    fn heap_size(&self) -> usize {
+        self.get_buffer_memory_size()
+    }
+}
+
+impl<R: RunEndIndexType> HeapSize for RunArray<R> {
+    fn heap_size(&self) -> usize {
+        self.get_buffer_memory_size()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::Int32Array;
+
+    #[test]
+    fn test_primitive_array_heap_size() {
+        let array = Int32Array::from(vec![1, 2, 3, 4, 5]);
+        // HeapSize should mirror the Array memory size APIs
+        assert_eq!(array.heap_size(), array.get_buffer_memory_size());
+        assert_eq!(array.total_size(), array.get_array_memory_size());
+    }
+
+    #[test]
+    fn test_string_array_heap_size() {
+        let array = StringArray::from(vec!["hello", "world"]);
+        // Buffer capacities depend on allocator alignment and may vary by platform
+        assert_eq!(array.heap_size(), array.get_buffer_memory_size());
+        assert_eq!(array.total_size(), array.get_array_memory_size());
+    }
+
+    #[test]
+    fn test_boolean_array_heap_size() {
+        let array = BooleanArray::from(vec![true, false, true]);
+        // Packed bits make heap_size small, but struct size is platform-dependent
+        assert_eq!(array.heap_size(), array.get_buffer_memory_size());
+        assert_eq!(array.total_size(), array.get_array_memory_size());
+    }
+
+    #[test]
+    fn test_null_array_heap_size() {
+        let array = NullArray::new(100);
+        // NullArray has no buffers; total size still depends on struct layout
+        assert_eq!(array.heap_size(), array.get_buffer_memory_size());
+        assert_eq!(array.total_size(), array.get_array_memory_size());
+    }
+
+    #[test]
+    fn test_struct_array_heap_size() {
+        use crate::builder::StructBuilder;
+        use arrow_schema::{DataType, Field, Fields};
+
+        let fields = Fields::from(vec![Field::new("a", DataType::Int32, false)]);
+        let mut builder = StructBuilder::from_fields(fields, 10);
+        builder
+            .field_builder::<crate::builder::Int32Builder>(0)
+            .unwrap()
+            .append_value(1);
+        builder.append(true);
+        let array = builder.finish();
+        // StructArray aggregates child buffers; sizes depend on allocator and layout
+        assert_eq!(array.heap_size(), array.get_buffer_memory_size());
+        assert_eq!(array.total_size(), array.get_array_memory_size());
+    }
+}
diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs
index a5f9bf5e711c..70054e73949d 100644
--- a/arrow-array/src/lib.rs
+++ b/arrow-array/src/lib.rs
@@ -262,6 +262,8 @@ pub mod timezone;
 mod trusted_len;
 pub mod types;
 
+mod heap_size;
+
 #[cfg(test)]
 mod tests {
     use crate::builder::*;
diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml
index 02ea49c37c46..4507bb3227ca 100644
--- a/arrow-buffer/Cargo.toml
+++ b/arrow-buffer/Cargo.toml
@@ -39,6 +39,7 @@ all-features = true
 pool = []
 
 [dependencies]
+arrow-memory-size = { workspace = true }
 bytes = { version = "1.4" }
 num-bigint = { version = "0.4.6", default-features = false, features = ["std"] }
 num-traits = { version = "0.2.19", default-features = false, features = ["std"] }
diff --git a/arrow-buffer/src/heap_size.rs b/arrow-buffer/src/heap_size.rs
new file mode 100644
index 000000000000..8dca6e42086b
--- /dev/null
+++ b/arrow-buffer/src/heap_size.rs
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`HeapSize`] implementations for arrow-buffer types
+
+use arrow_memory_size::HeapSize;
+
+use crate::{ArrowNativeType, BooleanBuffer, Buffer, NullBuffer, OffsetBuffer, ScalarBuffer};
+
+// Every impl below reports the `capacity()` of the underlying `Buffer`.
+impl HeapSize for Buffer {
+    fn heap_size(&self) -> usize {
+        self.capacity()
+    }
+}
+
+impl<T: ArrowNativeType> HeapSize for ScalarBuffer<T> {
+    fn heap_size(&self) -> usize {
+        self.inner().capacity()
+    }
+}
+
+impl<O: ArrowNativeType> HeapSize for OffsetBuffer<O> {
+    fn heap_size(&self) -> usize {
+        self.inner().inner().capacity()
+    }
+}
+
+impl HeapSize for NullBuffer {
+    fn heap_size(&self) -> usize {
+        self.buffer().capacity()
+    }
+}
+
+impl HeapSize for BooleanBuffer {
+    fn heap_size(&self) -> usize {
+        self.inner().capacity()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_buffer_heap_size() {
+        let buf = Buffer::from(vec![1u8, 2, 3, 4, 5]);
+        assert!(buf.heap_size() >= 5);
+    }
+
+    #[test]
+    fn test_scalar_buffer_heap_size() {
+        let buf: ScalarBuffer<i32> = vec![1, 2, 3, 4, 5].into();
+        assert!(buf.heap_size() >= 5 * std::mem::size_of::<i32>());
+    }
+
+    #[test]
+    fn test_null_buffer_heap_size() {
+        let buf = NullBuffer::new_null(100);
+        assert!(buf.heap_size() > 0);
+    }
+
+    #[test]
+    fn test_boolean_buffer_heap_size() {
+        let buf = BooleanBuffer::new_set(100);
+        assert!(buf.heap_size() > 0);
+    }
+}
diff --git a/arrow-buffer/src/lib.rs b/arrow-buffer/src/lib.rs
index 230747b8b84a..dcd5950d1681 100644
--- 
a/arrow-buffer/src/lib.rs +++ b/arrow-buffer/src/lib.rs @@ -62,6 +62,8 @@ pub use interval::*; mod arith; +mod heap_size; + #[cfg(feature = "pool")] mod pool; #[cfg(feature = "pool")] diff --git a/arrow-memory-size-derive/Cargo.toml b/arrow-memory-size-derive/Cargo.toml new file mode 100644 index 000000000000..d6bb78282c29 --- /dev/null +++ b/arrow-memory-size-derive/Cargo.toml @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "arrow-memory-size-derive" +version = { workspace = true } +description = "Derive macro for HeapSize trait" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } + +[lib] +proc-macro = true + +[package.metadata.docs.rs] +all-features = true + +[dependencies] +proc-macro2 = { version = "1.0", default-features = false } +quote = { version = "1.0", default-features = false } +syn = { version = "2.0", features = ["extra-traits"] } + +[dev-dependencies] +arrow-memory-size = { workspace = true } diff --git a/arrow-memory-size-derive/LICENSE.txt b/arrow-memory-size-derive/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-memory-size-derive/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-memory-size-derive/NOTICE.txt b/arrow-memory-size-derive/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-memory-size-derive/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-memory-size-derive/README.md b/arrow-memory-size-derive/README.md new file mode 100644 index 000000000000..f60128bef762 --- /dev/null +++ b/arrow-memory-size-derive/README.md @@ -0,0 +1,72 @@ + + +# `arrow-memory-size-derive` + +[![crates.io](https://img.shields.io/crates/v/arrow-memory-size-derive.svg)](https://crates.io/crates/arrow-memory-size-derive) +[![docs.rs](https://img.shields.io/docsrs/arrow-memory-size-derive.svg)](https://docs.rs/arrow-memory-size-derive/latest/arrow_memory_size_derive/) + +Derive macro for the `HeapSize` trait from [`arrow-memory-size`]. 
+
+[`arrow-memory-size`]: https://crates.io/crates/arrow-memory-size
+
+---
+
+## Install
+
+```toml
+[dependencies]
+arrow-memory-size = "57.0.0"
+arrow-memory-size-derive = "57.0.0"
+```
+
+---
+
+## Usage
+
+```rust
+use arrow_memory_size::HeapSize;
+use arrow_memory_size_derive::HeapSize;
+
+#[derive(HeapSize)]
+struct MyStruct {
+    name: String,
+    data: Vec<u8>,
+}
+
+let s = MyStruct {
+    name: "test".to_string(),
+    data: vec![1, 2, 3],
+};
+println!("Heap size: {} bytes", s.heap_size());
+```
+
+## Field Attributes
+
+- `#[heap_size(ignore)]` — Skip this field (contributes 0 to heap size)
+- `#[heap_size(size = N)]` — Use a constant value N
+- `#[heap_size(size_fn = path)]` — Call a custom function `fn(&FieldType) -> usize`
+
+See the [`arrow-memory-size` README](../arrow-memory-size/README.md) for full documentation and examples.
+
+---
+
+## License
+
+Licensed under the Apache License, Version 2.0.
diff --git a/arrow-memory-size-derive/src/lib.rs b/arrow-memory-size-derive/src/lib.rs
new file mode 100644
index 000000000000..747e645d3208
--- /dev/null
+++ b/arrow-memory-size-derive/src/lib.rs
@@ -0,0 +1,457 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Derive macro for the [`HeapSize`] trait.
+//!
+//! This crate provides a `#[derive(HeapSize)]` macro that automatically
+//! implements the `HeapSize` trait for structs and enums.
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! use arrow_memory_size::HeapSize;
+//! use arrow_memory_size_derive::HeapSize;
+//!
+//! #[derive(HeapSize)]
+//! struct MyStruct {
+//!     name: String,
+//!     data: Vec<u8>,
+//!     count: i32,
+//! }
+//! ```
+//!
+//! # Field Attributes
+//!
+//! The derive macro supports several attributes to customize behavior:
+//!
+//! ## `#[heap_size(ignore)]`
+//!
+//! Skip this field entirely (contributes 0 to heap size).
+//!
+//! ```rust,ignore
+//! #[derive(HeapSize)]
+//! struct MyStruct {
+//!     data: Vec<u8>,
+//!     #[heap_size(ignore)]
+//!     cached_hash: u64,  // Not counted
+//! }
+//! ```
+//!
+//! ## `#[heap_size(size = N)]`
+//!
+//! Use a constant value instead of calling `heap_size()`.
+//!
+//! ```rust,ignore
+//! #[derive(HeapSize)]
+//! struct MyStruct {
+//!     #[heap_size(size = 1024)]
+//!     fixed_buffer: *const u8,  // Known to be 1KB
+//! }
+//! ```
+//!
+//! ## `#[heap_size(size_fn = path)]`
+//!
+//! Call a custom function to compute the heap size.
+//! The function must have signature `fn(&FieldType) -> usize`.
+//!
+//! ```rust,ignore
+//! fn custom_size(data: &ExternalType) -> usize {
+//!     data.len() * 8
+//! }
+//!
+//! #[derive(HeapSize)]
+//! struct MyStruct {
+//!     #[heap_size(size_fn = custom_size)]
+//!     external: ExternalType,
+//! }
+//! ```
+//!
+//! # Restrictions
+//!
+//! This macro will emit a compile error if any field contains `Arc` or `Rc`
+//! types (unless the field is ignored), as the semantics for shared references
+//! are complex and should be handled manually.
+
+#![doc(
+    html_logo_url = "https://raw.githubusercontent.com/apache/arrow-rs/refs/heads/main/docs/source/_static/images/Arrow-logo_hex_black-txt_transparent-bg.svg",
+    html_favicon_url = "https://raw.githubusercontent.com/apache/arrow-rs/refs/heads/main/docs/source/_static/images/Arrow-logo_hex_black-txt_transparent-bg.svg"
+)]
+#![cfg_attr(docsrs, feature(doc_cfg))]
+#![warn(missing_docs)]
+
+extern crate proc_macro;
+
+use proc_macro2::TokenStream;
+use quote::quote;
+use syn::{
+    Data, DataEnum, DataStruct, DeriveInput, Expr, Fields, GenericParam, Lit, Path, Type,
+    parse_macro_input,
+};
+
+/// Field attribute configuration parsed from `#[heap_size(...)]`
+#[derive(Default)]
+struct FieldAttr {
+    /// Skip this field (return 0)
+    ignore: bool,
+    /// Use a constant size value
+    size: Option<usize>,
+    /// Use a custom function to compute size
+    size_fn: Option<Path>,
+}
+
+impl FieldAttr {
+    /// Parse every `#[heap_size(...)]` attribute on `field`.
+    ///
+    /// Returns an error for an unknown option, a malformed value, or when
+    /// more than one of `ignore`, `size` and `size_fn` is present.
+    fn parse(field: &syn::Field) -> Result<Self, syn::Error> {
+        let mut attr = FieldAttr::default();
+
+        for a in &field.attrs {
+            if !a.path().is_ident("heap_size") {
+                continue;
+            }
+
+            a.parse_nested_meta(|meta| {
+                if meta.path.is_ident("ignore") {
+                    attr.ignore = true;
+                    Ok(())
+                } else if meta.path.is_ident("size") {
+                    let value: Expr = meta.value()?.parse()?;
+                    if let Expr::Lit(expr_lit) = &value {
+                        if let Lit::Int(lit_int) = &expr_lit.lit {
+                            attr.size = Some(lit_int.base10_parse()?);
+                            return Ok(());
+                        }
+                    }
+                    Err(meta.error("expected integer literal for `size`"))
+                } else if meta.path.is_ident("size_fn") {
+                    let value: Expr = meta.value()?.parse()?;
+                    if let Expr::Path(expr_path) = value {
+                        attr.size_fn = Some(expr_path.path);
+                        return Ok(());
+                    }
+                    Err(meta.error("expected path for `size_fn`"))
+                } else {
+                    Err(meta.error("unknown heap_size attribute"))
+                }
+            })?;
+        }
+
+        // Validate that only one option is set
+        let count = attr.ignore as u8 + attr.size.is_some() as u8 + attr.size_fn.is_some() as u8;
+        if count > 1 {
+            return Err(syn::Error::new_spanned(
+                field,
+                "only one of `ignore`, `size`, or `size_fn` can be specified",
+            ));
+        }
+
+        Ok(attr)
+    }
+}
+
+/// Derive [`HeapSize`] implementations for structs and enums.
+///
+/// This macro generates an implementation of the `HeapSize` trait that
+/// calculates heap memory usage by summing the `heap_size()` of all fields.
+///
+/// # Supported Types
+///
+/// - **Structs with named fields**: sums heap size of all fields
+/// - **Tuple structs**: sums heap size of all tuple elements
+/// - **Unit structs**: returns 0
+/// - **Enums**: matches on variants and sums heap size of variant fields
+///
+/// # Field Attributes
+///
+/// - `#[heap_size(ignore)]` - Skip this field (contributes 0)
+/// - `#[heap_size(size = N)]` - Use constant value N
+/// - `#[heap_size(size_fn = path)]` - Call custom function
+///
+/// # Restrictions
+///
+/// This macro will emit a compile error if any field contains `Arc` or `Rc`
+/// types (unless ignored), as the semantics for shared references are complex
+/// and should be handled manually.
+///
+/// # Example
+///
+/// ```rust,ignore
+/// use arrow_memory_size::HeapSize;
+/// use arrow_memory_size_derive::HeapSize;
+///
+/// #[derive(HeapSize)]
+/// struct MyStruct {
+///     name: String,
+///     data: Vec<u8>,
+///     #[heap_size(ignore)]
+///     cached: u64,
+/// }
+///
+/// let s = MyStruct {
+///     name: "test".to_string(),
+///     data: vec![1, 2, 3],
+///     cached: 0,
+/// };
+/// println!("Heap size: {} bytes", s.heap_size());
+/// ```
+#[proc_macro_derive(HeapSize, attributes(heap_size))]
+pub fn heap_size_derive(input: proc_macro::TokenStream) -> proc_macro::TokenStream {
+    let input: DeriveInput = parse_macro_input!(input as DeriveInput);
+    let name = &input.ident;
+
+    // Check for Arc/Rc in non-ignored fields and emit error if found
+    if let Err(err) = check_no_arc_rc(&input.data) {
+        return err.to_compile_error().into();
+    }
+
+    // Build the generics with HeapSize bounds
+    let generics = add_heap_size_bounds(&input.generics);
+    let (impl_generics, ty_generics, where_clause) = generics.split_for_impl();
+
+    let heap_size_body = match &input.data {
+        Data::Struct(data) => match generate_struct_heap_size(data) {
+            Ok(body) => body,
+            Err(err) => return err.to_compile_error().into(),
+        },
+        Data::Enum(data) => match generate_enum_heap_size(data) {
+            Ok(body) => body,
+            Err(err) => return err.to_compile_error().into(),
+        },
+        Data::Union(_) => {
+            return syn::Error::new_spanned(&input, "HeapSize cannot be derived for unions")
+                .to_compile_error()
+                .into();
+        }
+    };
+
+    // `#[automatically_derived]` marks the impl as macro-generated for
+    // compiler and clippy lints.
+    let expanded = quote! {
+        #[automatically_derived]
+        impl #impl_generics ::arrow_memory_size::HeapSize for #name #ty_generics #where_clause {
+            fn heap_size(&self) -> usize {
+                #heap_size_body
+            }
+        }
+    };
+
+    expanded.into()
+}
+
+/// Check that no non-ignored fields contain Arc or Rc types
+fn check_no_arc_rc(data: &Data) -> Result<(), syn::Error> {
+    match data {
+        Data::Struct(data) => check_fields_no_arc_rc(&data.fields),
+        Data::Enum(data) => data
+            .variants
+            .iter()
+            .try_for_each(|variant| check_fields_no_arc_rc(&variant.fields)),
+        // Unions are rejected separately by `heap_size_derive`.
+        Data::Union(_) => Ok(()),
+    }
+}
+
+/// Check that non-ignored fields don't contain Arc/Rc
+fn check_fields_no_arc_rc(fields: &Fields) -> Result<(), syn::Error> {
+    for field in fields {
+        // Parse attributes to check if field is ignored
+        let attr = FieldAttr::parse(field)?;
+        if attr.ignore {
+            continue; // Skip Arc/Rc check for ignored fields
+        }
+
+        if contains_arc_or_rc(&field.ty) {
+            return Err(syn::Error::new_spanned(
+                &field.ty,
+                "HeapSize cannot be derived for types containing Arc or Rc. \
+                 Use #[heap_size(ignore)] to skip this field, or implement HeapSize manually.",
+            ));
+        }
+    }
+    Ok(())
+}
+
+/// Recursively check if a type contains Arc or Rc
+///
+/// The check is purely syntactic: it looks for a path segment named `Arc` or
+/// `Rc` (so `std::sync::Arc<T>` and a bare `Arc<T>` both match), including
+/// inside generic arguments, tuples, arrays, slices and references.
+fn contains_arc_or_rc(ty: &Type) -> bool {
+    match ty {
+        Type::Path(type_path) => type_path.path.segments.iter().any(|segment| {
+            // `Ident` compares against `&str` directly, so no `String` is allocated.
+            if segment.ident == "Arc" || segment.ident == "Rc" {
+                return true;
+            }
+            match &segment.arguments {
+                syn::PathArguments::AngleBracketed(args) => args.args.iter().any(|arg| {
+                    matches!(arg, syn::GenericArgument::Type(inner) if contains_arc_or_rc(inner))
+                }),
+                _ => false,
+            }
+        }),
+        Type::Tuple(tuple) => tuple.elems.iter().any(contains_arc_or_rc),
+        Type::Array(array) => contains_arc_or_rc(&array.elem),
+        Type::Slice(slice) => contains_arc_or_rc(&slice.elem),
+        Type::Reference(reference) => contains_arc_or_rc(&reference.elem),
+        Type::Paren(paren) => contains_arc_or_rc(&paren.elem),
+        Type::Group(group) => contains_arc_or_rc(&group.elem),
+        _ => false,
+    }
+}
+
+/// Add HeapSize bounds to generic parameters
+fn add_heap_size_bounds(generics: &syn::Generics) -> syn::Generics {
+    let mut generics = generics.clone();
+    for param in &mut generics.params {
+        if let GenericParam::Type(type_param) = param {
+            type_param
+                .bounds
+                .push(syn::parse_quote!(::arrow_memory_size::HeapSize));
+        }
+    }
+    generics
+}
+
+/// Generate the size expression for a single field
+///
+/// `accessor` is the place expression for the field (`self.name`, `self.0`,
+/// or `*binding` inside an enum match arm). It is only emitted when the field
+/// uses `size_fn` or the default `heap_size()` call.
+fn generate_field_size_expr(
+    field: &syn::Field,
+    accessor: TokenStream,
+) -> Result<TokenStream, syn::Error> {
+    let attr = FieldAttr::parse(field)?;
+
+    if attr.ignore {
+        return Ok(quote! { 0 });
+    }
+
+    if let Some(size) = attr.size {
+        return Ok(quote! { #size });
+    }
+
+    if let Some(size_fn) = attr.size_fn {
+        return Ok(quote! { #size_fn(&#accessor) });
+    }
+
+    // Default: call heap_size()
+    Ok(quote! { ::arrow_memory_size::HeapSize::heap_size(&#accessor) })
+}
+
+/// Whether the size expression generated for `field` reads the field's value.
+///
+/// Fields marked `ignore` or `size = N` contribute a constant, so an enum
+/// match arm must not bind them; otherwise the expansion carries an unused
+/// binding.
+fn field_reads_value(field: &syn::Field) -> Result<bool, syn::Error> {
+    let attr = FieldAttr::parse(field)?;
+    Ok(!attr.ignore && attr.size.is_none())
+}
+
+/// Join per-field size expressions with `+`, yielding `0` when there are none.
+fn sum_field_sizes(sizes: &[TokenStream]) -> TokenStream {
+    if sizes.is_empty() {
+        quote! { 0 }
+    } else {
+        quote! { #(#sizes)+* }
+    }
+}
+
+/// Generate heap_size() body for structs
+///
+/// Named fields are accessed as `self.name`, tuple fields as `self.N`. Unit
+/// structs and structs without fields produce `0`.
+fn generate_struct_heap_size(data: &DataStruct) -> Result<TokenStream, syn::Error> {
+    let field_sizes = data
+        .fields
+        .iter()
+        .enumerate()
+        .map(|(i, f)| {
+            let accessor = match &f.ident {
+                Some(name) => quote! { self.#name },
+                None => {
+                    let index = syn::Index::from(i);
+                    quote! { self.#index }
+                }
+            };
+            generate_field_size_expr(f, accessor)
+        })
+        .collect::<Result<Vec<_>, _>>()?;
+
+    Ok(sum_field_sizes(&field_sizes))
+}
+
+/// Generate heap_size() body for enums
+///
+/// Emits a `match self` with one arm per variant. Only fields whose size
+/// expression reads the value are bound; the others are matched with `_`.
+fn generate_enum_heap_size(data: &DataEnum) -> Result<TokenStream, syn::Error> {
+    if data.variants.is_empty() {
+        return Ok(quote! { 0 });
+    }
+
+    let mut match_arms = Vec::with_capacity(data.variants.len());
+
+    for variant in &data.variants {
+        let variant_name = &variant.ident;
+        let arm = match &variant.fields {
+            Fields::Named(fields) => {
+                let mut patterns = Vec::with_capacity(fields.named.len());
+                let mut field_sizes = Vec::with_capacity(fields.named.len());
+                for f in &fields.named {
+                    let name = f
+                        .ident
+                        .as_ref()
+                        .expect("named fields always have an identifier");
+                    patterns.push(if field_reads_value(f)? {
+                        quote! { #name }
+                    } else {
+                        quote! { #name: _ }
+                    });
+                    field_sizes.push(generate_field_size_expr(f, quote! { *#name })?);
+                }
+                let total = sum_field_sizes(&field_sizes);
+                quote! {
+                    Self::#variant_name { #(#patterns),* } => {
+                        #total
+                    }
+                }
+            }
+            Fields::Unnamed(fields) => {
+                let mut patterns = Vec::with_capacity(fields.unnamed.len());
+                let mut field_sizes = Vec::with_capacity(fields.unnamed.len());
+                for (i, f) in fields.unnamed.iter().enumerate() {
+                    let name =
+                        syn::Ident::new(&format!("f{}", i), proc_macro2::Span::call_site());
+                    patterns.push(if field_reads_value(f)? {
+                        quote! { #name }
+                    } else {
+                        quote! { _ }
+                    });
+                    field_sizes.push(generate_field_size_expr(f, quote! { *#name })?);
+                }
+                let total = sum_field_sizes(&field_sizes);
+                quote! {
+                    Self::#variant_name(#(#patterns),*) => {
+                        #total
+                    }
+                }
+            }
+            Fields::Unit => quote! { Self::#variant_name => 0 },
+        };
+        match_arms.push(arm);
+    }
+
+    Ok(quote! {
+        match self {
+            #(#match_arms),*
+        }
+    })
+}
diff --git a/arrow-memory-size-derive/tests/derive_tests.rs b/arrow-memory-size-derive/tests/derive_tests.rs
new file mode 100644
index 000000000000..6bf8eefe2477
--- /dev/null
+++ b/arrow-memory-size-derive/tests/derive_tests.rs
@@ -0,0 +1,355 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! 
Integration tests for the HeapSize derive macro
+
+use arrow_memory_size::HeapSize;
+use arrow_memory_size_derive::HeapSize;
+
+// =============================================================================
+// Basic derive test structures
+// =============================================================================
+
+/// Test struct with named fields
+#[derive(HeapSize)]
+struct HeapSizeNamedFields {
+    name: String,
+    data: Vec<u8>,
+    count: i32,
+    optional: Option<String>,
+}
+
+/// Test tuple struct
+#[derive(HeapSize)]
+struct HeapSizeTuple(String, Vec<u8>, i32);
+
+/// Test unit struct
+#[derive(HeapSize)]
+struct HeapSizeUnit;
+
+/// Test empty struct with named fields
+#[derive(HeapSize)]
+struct HeapSizeEmpty {}
+
+/// Test empty tuple struct
+#[derive(HeapSize)]
+struct HeapSizeEmptyTuple();
+
+/// Test enum with various variant types
+#[derive(HeapSize)]
+enum HeapSizeEnum {
+    Unit,
+    Tuple(String, Vec<u8>),
+    Named { name: String, value: i32 },
+}
+
+/// Test generic struct
+#[derive(HeapSize)]
+struct HeapSizeGeneric<T> {
+    value: T,
+    items: Vec<T>,
+}
+
+/// Test struct with Box
+#[derive(HeapSize)]
+struct HeapSizeWithBox {
+    boxed: Box<String>,
+}
+
+/// Test struct with nested containers
+#[derive(HeapSize)]
+struct HeapSizeNested {
+    data: Vec<Vec<String>>,
+    // NOTE(review): the map's key/value types are not visible in the mangled
+    // source; `String` / `Vec<u8>` is a placeholder — confirm against upstream.
+    map: std::collections::HashMap<String, Vec<u8>>,
+}
+
+// =============================================================================
+// Basic derive tests
+// =============================================================================
+
+#[test]
+fn test_heap_size_named_fields() {
+    let s = HeapSizeNamedFields {
+        name: "hello".to_string(),
+        data: vec![1, 2, 3, 4, 5],
+        count: 42,
+        optional: Some("world".to_string()),
+    };
+
+    let size = s.heap_size();
+    // Should include: String capacity + Vec capacity + Option capacity
+    // "hello" = 5 bytes, vec = 5 bytes, "world" = 5 bytes
+    assert!(size >= 15, "heap_size should be at least 15, got {}", size);
+}
+
+#[test]
+fn test_heap_size_tuple_struct() {
+    let s = HeapSizeTuple("test".to_string(), vec![1, 2, 3], 0);
+    let size = s.heap_size();
+    // "test" = 4 bytes + vec = 3 bytes
+    assert!(size >= 7, "heap_size should be at least 7, got {}", size);
+}
+
+#[test]
+fn test_heap_size_unit_struct() {
+    let s = HeapSizeUnit;
+    assert_eq!(s.heap_size(), 0);
+}
+
+#[test]
+fn test_heap_size_empty_struct() {
+    let s = HeapSizeEmpty {};
+    assert_eq!(s.heap_size(), 0);
+}
+
+#[test]
+fn test_heap_size_empty_tuple() {
+    let s = HeapSizeEmptyTuple();
+    assert_eq!(s.heap_size(), 0);
+}
+
+#[test]
+fn test_heap_size_enum_unit() {
+    let e = HeapSizeEnum::Unit;
+    assert_eq!(e.heap_size(), 0);
+}
+
+#[test]
+fn test_heap_size_enum_tuple() {
+    let e = HeapSizeEnum::Tuple("hello".to_string(), vec![1, 2, 3]);
+    let size = e.heap_size();
+    assert!(size >= 8, "heap_size should be at least 8, got {}", size);
+}
+
+#[test]
+fn test_heap_size_enum_named() {
+    let e = HeapSizeEnum::Named {
+        name: "test".to_string(),
+        value: 42,
+    };
+    let size = e.heap_size();
+    // "test" = 4 bytes, i32 = 0 bytes on heap
+    assert!(size >= 4, "heap_size should be at least 4, got {}", size);
+}
+
+#[test]
+fn test_heap_size_generic() {
+    let s = HeapSizeGeneric {
+        value: "hello".to_string(),
+        items: vec!["a".to_string(), "bb".to_string(), "ccc".to_string()],
+    };
+    let size = s.heap_size();
+    // "hello" = 5, vec has 3 strings with capacity for 3 + "a"(1) + "bb"(2) + "ccc"(3)
+    assert!(size >= 11, "heap_size should be at least 11, got {}", size);
+}
+
+#[test]
+fn test_heap_size_with_box() {
+    let s = HeapSizeWithBox {
+        boxed: Box::new("hello".to_string()),
+    };
+    let size = s.heap_size();
+    // Box overhead (size of String) + String heap allocation
+    assert!(size > 0, "heap_size should be > 0, got {}", size);
+}
+
+#[test]
+fn test_heap_size_nested() {
+    let s = HeapSizeNested {
+        data: vec![
+            vec!["a".to_string()],
+            vec!["b".to_string(), "c".to_string()],
+        ],
+        map: std::collections::HashMap::new(),
+    };
+    let size = s.heap_size();
+    assert!(size > 0, "heap_size should be > 0, got {}", size);
+}
+
+#[test]
+fn test_total_size() {
+    let s = HeapSizeUnit;
+    let total = s.total_size();
+    assert_eq!(total, std::mem::size_of::<HeapSizeUnit>());
+}
+
+// =============================================================================
+// Derive macro attribute tests
+// =============================================================================
+
+/// Test #[heap_size(ignore)] attribute
+#[derive(HeapSize)]
+struct WithIgnore {
+    data: String,
+    #[heap_size(ignore)]
+    _ignored: Vec<u8>,
+}
+
+#[test]
+fn test_heap_size_ignore_attribute() {
+    let s = WithIgnore {
+        data: "hello".to_string(),
+        _ignored: vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10], // 10 bytes, but ignored
+    };
+    let size = s.heap_size();
+    // Should only count the String, not the Vec
+    assert!(
+        size >= 5,
+        "heap_size should be at least 5 for 'hello', got {}",
+        size
+    );
+    // Should be less than if we counted the Vec too
+    assert!(
+        size < 20,
+        "heap_size should not include ignored Vec, got {}",
+        size
+    );
+}
+
+/// Test #[heap_size(size = N)] attribute
+#[derive(HeapSize)]
+struct WithConstantSize {
+    data: String,
+    #[heap_size(size = 1024)]
+    fixed: u64, // primitives normally have 0 heap size
+}
+
+#[test]
+fn test_heap_size_constant_attribute() {
+    let s = WithConstantSize {
+        data: "hello".to_string(),
+        fixed: 42,
+    };
+    let size = s.heap_size();
+    // Should include 5 bytes for "hello" + 1024 constant
+    assert!(
+        size >= 1029,
+        "heap_size should be at least 1029, got {}",
+        size
+    );
+}
+
+/// Custom function for size_fn attribute test
+fn custom_size_fn(v: &Vec<u8>) -> usize {
+    v.len() * 100 // Deliberately different from actual size
+}
+
+/// Test #[heap_size(size_fn = path)] attribute
+#[derive(HeapSize)]
+struct WithSizeFn {
+    data: String,
+    #[heap_size(size_fn = custom_size_fn)]
+    custom: Vec<u8>,
+}
+
+#[test]
+fn test_heap_size_size_fn_attribute() {
+    let s = WithSizeFn {
+        data: "hello".to_string(),
+        custom: vec![1, 2, 3], // 3 elements * 100 = 300
+    };
+    let size = s.heap_size();
+    // Should include 5 bytes for "hello" + 300 from custom_size_fn
+    assert!(
+        size >= 305,
+        "heap_size should be at least 305, got {}",
+        size
+    );
+}
+
+/// Test #[heap_size(ignore)] allows Arc fields
+#[derive(HeapSize)]
+struct WithIgnoredArc {
+    data: String,
+    #[heap_size(ignore)]
+    shared: std::sync::Arc<String>,
+}
+
+#[test]
+fn test_heap_size_ignored_arc() {
+    let s = WithIgnoredArc {
+        data: "hello".to_string(),
+        shared: std::sync::Arc::new("world".to_string()),
+    };
+    let size = s.heap_size();
+    // Should only count the data String, Arc is ignored
+    assert!(size >= 5, "heap_size should be at least 5, got {}", size);
+}
+
+/// Test enum with attributes
+#[derive(HeapSize)]
+#[allow(dead_code)]
+enum EnumWithAttributes {
+    Normal(String),
+    WithIgnored {
+        data: String,
+        #[heap_size(ignore)]
+        ignored: Vec<u8>,
+    },
+    WithConstant {
+        #[heap_size(size = 500)]
+        fixed: u8,
+    },
+}
+
+#[test]
+fn test_enum_with_ignore_attribute() {
+    let e = EnumWithAttributes::WithIgnored {
+        data: "test".to_string(),
+        ignored: vec![1, 2, 3, 4, 5],
+    };
+    let size = e.heap_size();
+    // Should only count "test" (4 bytes)
+    assert!(size >= 4, "heap_size should be at least 4, got {}", size);
+    assert!(
+        size < 15,
+        "heap_size should not include ignored Vec, got {}",
+        size
+    );
+}
+
+#[test]
+fn test_enum_with_constant_attribute() {
+    let e = EnumWithAttributes::WithConstant { fixed: 42 };
+    let size = e.heap_size();
+    assert_eq!(size, 500, "heap_size should be exactly 500, got {}", size);
+}
+
+/// Test tuple struct with attributes
+#[derive(HeapSize)]
+struct TupleWithAttributes(
+    String,
+    #[heap_size(ignore)] Vec<u8>,
+    #[heap_size(size = 200)] u32,
+);
+
+#[test]
+fn test_tuple_struct_with_attributes() {
+    let s = TupleWithAttributes("hello".to_string(), vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 42);
+    let size = s.heap_size();
+    // Should be 5 (string) + 0 (ignored) + 200 (constant) = 205
+    assert!(
+        size >= 205,
+        "heap_size should be at least 205, got {}",
+        
size + ); + assert!( + size < 220, + "heap_size should not include ignored Vec, got {}", + size + ); +} diff --git a/arrow-memory-size/Cargo.toml b/arrow-memory-size/Cargo.toml new file mode 100644 index 000000000000..6fda3a6e2536 --- /dev/null +++ b/arrow-memory-size/Cargo.toml @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "arrow-memory-size" +version = { workspace = true } +description = "Memory size estimation utilities for Apache Arrow" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } + +[lib] +name = "arrow_memory_size" +bench = false + +[package.metadata.docs.rs] +all-features = true + +[dependencies] + +[dev-dependencies] diff --git a/arrow-memory-size/LICENSE.txt b/arrow-memory-size/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-memory-size/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-memory-size/NOTICE.txt b/arrow-memory-size/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-memory-size/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-memory-size/README.md b/arrow-memory-size/README.md new file mode 100644 index 000000000000..69f7701065fd --- /dev/null +++ b/arrow-memory-size/README.md @@ -0,0 +1,231 @@ + + +# `arrow-memory-size` + +[![crates.io](https://img.shields.io/crates/v/arrow-memory-size.svg)](https://crates.io/crates/arrow-memory-size) +[![docs.rs](https://img.shields.io/docsrs/arrow-memory-size.svg)](https://docs.rs/arrow-memory-size/latest/arrow_memory_size/) + +Memory size estimation utilities for [Apache Arrow]. + +This crate provides the `HeapSize` trait for calculating heap memory usage of data structures. + +[Apache Arrow]: https://arrow.apache.org/ + +## Why This Crate? + +Several memory size estimation crates exist in the Rust ecosystem ([deepsize], [get-size2], etc.), but none has emerged as a clear standard. 
Rather than take a dependency on any of them, this crate provides a minimal `HeapSize` trait with a small API surface that can be implemented across the Arrow ecosystem. + +Key motivations: + +- **Minimal API**: Just two methods (`heap_size()` and `total_size()`) +- **Customizable semantics**: Behavior around `Arc`/`Rc` deduplication varies between crates and use cases; having our own trait allows us to make decisions appropriate for Arrow's needs +- **Arrow integration**: Implementations for all Arrow buffer and array types + +[deepsize]: https://github.com/Aeledfyr/deepsize +[get-size2]: https://github.com/bircni/get-size2 + +## Crate Structure + +- **`arrow-memory-size`**: Core trait + standard library implementations +- **`arrow-memory-size-derive`**: `#[derive(HeapSize)]` proc macro +- **`arrow-buffer`**: Implements `HeapSize` for buffer types (`Buffer`, `ScalarBuffer`, etc.) +- **`arrow-array`**: Implements `HeapSize` for array types (`PrimitiveArray`, `StringArray`, etc.) +- **`arrow`**: Re-exports the trait as `arrow::util::HeapSize` and the derive macro as `arrow::util::HeapSizeDerive` + +--- + +## Install + +If you already depend on `arrow`, the trait and derive macro are re-exported: + +```rust +use arrow::util::{HeapSize, HeapSizeDerive}; +``` + +Otherwise, add the crates directly: + +```toml +[dependencies] +arrow-memory-size = "57.2.0" +arrow-memory-size-derive = "57.2.0" # Optional, for derive macro +``` + +--- + +## Quick Start + +### Basic Usage + +```rust +use arrow_memory_size::HeapSize; + +let v: Vec<String> = vec!["hello".to_string(), "world".to_string()]; +let heap_bytes = v.heap_size(); // Only heap allocations +let total_bytes = v.total_size(); // Stack + heap +``` + +### Derive Macro + +```rust +use arrow_memory_size::HeapSize; +use arrow_memory_size_derive::HeapSize; + +#[derive(HeapSize)] +struct MyStruct { + name: String, + data: Vec, + count: i32, +} + +let s = MyStruct { + name: "test".to_string(), + data: vec![1, 2, 3], + count: 42, +}; +println!("Heap size: 
{} bytes", s.heap_size()); +``` + +### Derive Macro Attributes + +The derive macro supports field attributes for customization: + +```rust,ignore +use arrow_memory_size::HeapSize; +use arrow_memory_size_derive::HeapSize; + +fn custom_size(data: &ExternalType) -> usize { + data.len() * 8 +} + +#[derive(HeapSize)] +struct MyStruct { + // Skip this field (contributes 0) + #[heap_size(ignore)] + cached: u64, + + // Use a constant value + #[heap_size(size = 1024)] + fixed_buffer: *const u8, + + // Use a custom function + #[heap_size(size_fn = custom_size)] + external: ExternalType, +} +``` + +**Note:** The derive macro emits a compile error if any field contains `Arc` or `Rc` types, unless the field is marked with `#[heap_size(ignore)]`. This is intentional—shared reference semantics are complex and vary by use case, so they should be handled explicitly. + +--- + +## Supported Types + +### Standard Library (this crate) + +| Type | Notes | +|------|-------| +| Primitives | `bool`, `i8`-`i128`, `isize`, `u8`-`u128`, `usize`, `f32`, `f64` — always 0 | +| `String` | Reports capacity | +| `Vec` | Capacity × element size + nested heap | +| `HashMap` | Approximation based on hashbrown internals | +| `HashSet` | Approximation based on hashbrown internals | +| `BTreeMap` | Approximation with node overhead | +| `BTreeSet` | Approximation with node overhead | +| `Box` | Size of T + nested heap | +| `Arc` | Reference counts + size of T + nested heap | +| `Option` | Nested heap if `Some` | +| Tuples | Up to 12 elements | +| Arrays `[T; N]` | Sum of element heap sizes | +| `Mutex` | Uses `try_lock()`, returns 0 if locked or poisoned | +| `RwLock` | Uses `try_read()`, returns 0 if the lock is unavailable or poisoned | + +### Arrow Buffer Types (arrow-buffer crate) + +| Type | Notes | +|------|-------| +| `Buffer` | Reports capacity | +| `ScalarBuffer` | Reports inner buffer capacity | +| `OffsetBuffer` | Reports inner buffer capacity | +| `NullBuffer` | Reports buffer capacity | +| `BooleanBuffer` | Reports inner buffer capacity | + +### 
Arrow Array Types (arrow-array crate) + +All array types delegate to `get_buffer_memory_size()`: + +- `PrimitiveArray`, `BooleanArray`, `NullArray` +- `StringArray`, `LargeStringArray`, `StringViewArray` +- `BinaryArray`, `LargeBinaryArray`, `BinaryViewArray`, `FixedSizeBinaryArray` +- `ListArray`, `LargeListArray`, `ListViewArray`, `LargeListViewArray`, `FixedSizeListArray` +- `StructArray`, `MapArray` +- `UnionArray` (sparse and dense) +- `DictionaryArray`, `RunArray` + +--- + +## Arc/Rc Handling + +This crate counts the contents of an `Arc` fully each time it appears—shared references will be counted multiple times (`Rc` has no `HeapSize` implementation in this crate). This is a deliberate choice: deduplication requires threading a context/tracker through all calls, which adds API complexity and may not be the right tradeoff for all use cases. + +The derive macro enforces explicit handling by emitting compile errors for `Arc`/`Rc` fields unless they're marked with `#[heap_size(ignore)]`. + +--- + +## Examples + +### Measuring Arrow Arrays + +```rust,ignore +use arrow_memory_size::HeapSize; +use arrow_array::{Int32Array, StringArray}; + +let int_array = Int32Array::from(vec![1, 2, 3, 4, 5]); +println!("Int32Array heap: {} bytes", int_array.heap_size()); + +let string_array = StringArray::from(vec!["hello", "world"]); +println!("StringArray heap: {} bytes", string_array.heap_size()); +``` + +### Complex Nested Structures + +```rust,ignore +use arrow_memory_size::HeapSize; +use arrow_memory_size_derive::HeapSize; +use std::collections::HashMap; + +#[derive(HeapSize)] +struct CacheEntry { + key: String, + data: Vec, + metadata: HashMap, +} + +#[derive(HeapSize)] +struct Cache { + entries: Vec<CacheEntry>, + #[heap_size(ignore)] + stats: CacheStats, // Don't count internal bookkeeping +} +``` + +--- + +## License + +Licensed under the Apache License, Version 2.0. 
diff --git a/arrow-memory-size/src/lib.rs b/arrow-memory-size/src/lib.rs new file mode 100644 index 000000000000..65d8216c5565 --- /dev/null +++ b/arrow-memory-size/src/lib.rs @@ -0,0 +1,688 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Memory size estimation utilities for Apache Arrow +//! +//! This crate provides the [`HeapSize`] trait for calculating heap memory usage +//! of data structures, with implementations for standard library types. +//! +//! For Arrow type implementations, see: +//! - [`arrow-buffer`](https://docs.rs/arrow-buffer) for buffer types +//! - [`arrow-array`](https://docs.rs/arrow-array) for array types +//! +//! # Example +//! +//! ``` +//! use arrow_memory_size::HeapSize; +//! +//! let v: Vec = vec!["hello".to_string(), "world".to_string()]; +//! let heap_bytes = v.heap_size(); +//! let total_bytes = v.total_size(); +//! 
``` + +#![doc( + html_logo_url = "https://raw.githubusercontent.com/apache/arrow-rs/refs/heads/main/docs/source/_static/images/Arrow-logo_hex_black-txt_transparent-bg.svg", + html_favicon_url = "https://raw.githubusercontent.com/apache/arrow-rs/refs/heads/main/docs/source/_static/images/Arrow-logo_hex_black-txt_transparent-bg.svg" +)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![warn(missing_docs)] + +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::sync::{Arc, Mutex, RwLock}; + +/// Trait for calculating the heap memory size of a value. +/// +/// This trait provides methods for calculating how much heap memory +/// a data structure has allocated. This is useful for memory tracking, +/// cache management, and debugging memory usage. +/// +/// # Semantics +/// +/// - [`heap_size`](HeapSize::heap_size): Returns only the bytes allocated on the heap +/// by this value, not including the size of the value itself. +/// - [`total_size`](HeapSize::total_size): Returns the total memory footprint including +/// both the stack size of the value and its heap allocations. +/// +/// # Example +/// +/// ``` +/// use arrow_memory_size::HeapSize; +/// +/// let s = String::from("hello"); +/// assert!(s.heap_size() >= 5); // At least 5 bytes for "hello" +/// assert!(s.total_size() >= s.heap_size() + std::mem::size_of::()); +/// ``` +pub trait HeapSize { + /// Return the size of any bytes allocated on the heap by this object, + /// including heap memory in nested structures. + /// + /// Note that the size of the type itself is not included in the result -- + /// instead, that size is added by the caller (e.g. container) or via + /// [`total_size`](HeapSize::total_size). + fn heap_size(&self) -> usize; + + /// Return the total size of this object including heap allocations + /// and the size of the object itself. 
+ fn total_size(&self) -> usize { + std::mem::size_of_val(self) + self.heap_size() + } +} + +// ============================================================================= +// Standard library implementations +// ============================================================================= + +impl HeapSize for Vec { + fn heap_size(&self) -> usize { + let item_size = std::mem::size_of::(); + // Account for the Vec's buffer capacity + (self.capacity() * item_size) + + // Plus any heap allocations by the contents + self.iter().map(|t| t.heap_size()).sum::() + } +} + +impl HeapSize for HashMap { + fn heap_size(&self) -> usize { + let capacity = self.capacity(); + if capacity == 0 { + return 0; + } + + // HashMap doesn't provide a way to get its heap size, so this is an approximation based on + // the behavior of hashbrown::HashMap as at version 0.16.0, and may become inaccurate + // if the implementation changes. + let key_val_size = std::mem::size_of::<(K, V)>(); + // Overhead for the control tags group, which may be smaller depending on architecture + let group_size = 16; + // 1 byte of metadata stored per bucket. + let metadata_size = 1; + + // Compute the number of buckets for the capacity. 
Based on hashbrown's capacity_to_buckets + let buckets = if capacity < 15 { + let min_cap = match key_val_size { + 0..=1 => 14, + 2..=3 => 7, + _ => 3, + }; + let cap = min_cap.max(capacity); + if cap < 4 { + 4 + } else if cap < 8 { + 8 + } else { + 16 + } + } else { + (capacity.saturating_mul(8) / 7).next_power_of_two() + }; + + group_size + + (buckets * (key_val_size + metadata_size)) + + self.keys().map(|k| k.heap_size()).sum::() + + self.values().map(|v| v.heap_size()).sum::() + } +} + +impl HeapSize for HashSet { + fn heap_size(&self) -> usize { + let capacity = self.capacity(); + if capacity == 0 { + return 0; + } + + // HashSet is implemented as HashMap, so we use similar approximation + let item_size = std::mem::size_of::<(T, ())>(); + let group_size = 16; + let metadata_size = 1; + + let buckets = if capacity < 15 { + let min_cap = match item_size { + 0..=1 => 14, + 2..=3 => 7, + _ => 3, + }; + let cap = min_cap.max(capacity); + if cap < 4 { + 4 + } else if cap < 8 { + 8 + } else { + 16 + } + } else { + (capacity.saturating_mul(8) / 7).next_power_of_two() + }; + + group_size + + (buckets * (item_size + metadata_size)) + + self.iter().map(|item| item.heap_size()).sum::() + } +} + +impl HeapSize for BTreeMap { + fn heap_size(&self) -> usize { + if self.is_empty() { + return 0; + } + + // BTreeMap stores entries in nodes. This is an approximation. + // Each node has some overhead for child pointers and length tracking. + // The B parameter is typically 6 for BTreeMap, meaning nodes can hold 2B-1 = 11 entries. 
+ let entry_size = std::mem::size_of::<(K, V)>(); + let len = self.len(); + + // Approximate: each entry + some per-node overhead + // Nodes are approximately 2/3 full on average after random insertions + let node_overhead_per_entry = 16; // Approximate overhead for pointers and metadata + + (len * (entry_size + node_overhead_per_entry)) + + self.keys().map(|k| k.heap_size()).sum::() + + self.values().map(|v| v.heap_size()).sum::() + } +} + +impl HeapSize for BTreeSet { + fn heap_size(&self) -> usize { + if self.is_empty() { + return 0; + } + + // BTreeSet is implemented as BTreeMap + let entry_size = std::mem::size_of::<(T, ())>(); + let len = self.len(); + let node_overhead_per_entry = 16; + + (len * (entry_size + node_overhead_per_entry)) + + self.iter().map(|item| item.heap_size()).sum::() + } +} + +impl HeapSize for Arc { + fn heap_size(&self) -> usize { + // Arc stores weak and strong counts on the heap alongside an instance of T + 2 * std::mem::size_of::() + std::mem::size_of::() + self.as_ref().heap_size() + } +} + +impl HeapSize for Arc { + fn heap_size(&self) -> usize { + 2 * std::mem::size_of::() + + std::mem::size_of_val(self.as_ref()) + + self.as_ref().heap_size() + } +} + +impl HeapSize for Box { + fn heap_size(&self) -> usize { + std::mem::size_of::() + self.as_ref().heap_size() + } +} + +impl HeapSize for Option { + fn heap_size(&self) -> usize { + self.as_ref().map(|inner| inner.heap_size()).unwrap_or(0) + } +} + +impl HeapSize for String { + fn heap_size(&self) -> usize { + self.capacity() + } +} + +// Primitive types - no heap allocations + +impl HeapSize for bool { + fn heap_size(&self) -> usize { + 0 + } +} + +impl HeapSize for u8 { + fn heap_size(&self) -> usize { + 0 + } +} + +impl HeapSize for u16 { + fn heap_size(&self) -> usize { + 0 + } +} + +impl HeapSize for u32 { + fn heap_size(&self) -> usize { + 0 + } +} + +impl HeapSize for u64 { + fn heap_size(&self) -> usize { + 0 + } +} + +impl HeapSize for u128 { + fn heap_size(&self) -> usize { 
+ 0 + } +} + +impl HeapSize for usize { + fn heap_size(&self) -> usize { + 0 + } +} + +impl HeapSize for i8 { + fn heap_size(&self) -> usize { + 0 + } +} + +impl HeapSize for i16 { + fn heap_size(&self) -> usize { + 0 + } +} + +impl HeapSize for i32 { + fn heap_size(&self) -> usize { + 0 + } +} + +impl HeapSize for i64 { + fn heap_size(&self) -> usize { + 0 + } +} + +impl HeapSize for i128 { + fn heap_size(&self) -> usize { + 0 + } +} + +impl HeapSize for isize { + fn heap_size(&self) -> usize { + 0 + } +} + +impl HeapSize for f32 { + fn heap_size(&self) -> usize { + 0 + } +} + +impl HeapSize for f64 { + fn heap_size(&self) -> usize { + 0 + } +} + +// ============================================================================= +// Tuple implementations (up to 12 elements) +// ============================================================================= + +impl HeapSize for () { + fn heap_size(&self) -> usize { + 0 + } +} + +impl HeapSize for (T0,) { + fn heap_size(&self) -> usize { + self.0.heap_size() + } +} + +impl HeapSize for (T0, T1) { + fn heap_size(&self) -> usize { + self.0.heap_size() + self.1.heap_size() + } +} + +impl HeapSize for (T0, T1, T2) { + fn heap_size(&self) -> usize { + self.0.heap_size() + self.1.heap_size() + self.2.heap_size() + } +} + +impl HeapSize for (T0, T1, T2, T3) { + fn heap_size(&self) -> usize { + self.0.heap_size() + self.1.heap_size() + self.2.heap_size() + self.3.heap_size() + } +} + +impl HeapSize + for (T0, T1, T2, T3, T4) +{ + fn heap_size(&self) -> usize { + self.0.heap_size() + + self.1.heap_size() + + self.2.heap_size() + + self.3.heap_size() + + self.4.heap_size() + } +} + +impl HeapSize + for (T0, T1, T2, T3, T4, T5) +{ + fn heap_size(&self) -> usize { + self.0.heap_size() + + self.1.heap_size() + + self.2.heap_size() + + self.3.heap_size() + + self.4.heap_size() + + self.5.heap_size() + } +} + +impl< + T0: HeapSize, + T1: HeapSize, + T2: HeapSize, + T3: HeapSize, + T4: HeapSize, + T5: HeapSize, + T6: HeapSize, +> 
HeapSize for (T0, T1, T2, T3, T4, T5, T6) +{ + fn heap_size(&self) -> usize { + self.0.heap_size() + + self.1.heap_size() + + self.2.heap_size() + + self.3.heap_size() + + self.4.heap_size() + + self.5.heap_size() + + self.6.heap_size() + } +} + +impl< + T0: HeapSize, + T1: HeapSize, + T2: HeapSize, + T3: HeapSize, + T4: HeapSize, + T5: HeapSize, + T6: HeapSize, + T7: HeapSize, +> HeapSize for (T0, T1, T2, T3, T4, T5, T6, T7) +{ + fn heap_size(&self) -> usize { + self.0.heap_size() + + self.1.heap_size() + + self.2.heap_size() + + self.3.heap_size() + + self.4.heap_size() + + self.5.heap_size() + + self.6.heap_size() + + self.7.heap_size() + } +} + +impl< + T0: HeapSize, + T1: HeapSize, + T2: HeapSize, + T3: HeapSize, + T4: HeapSize, + T5: HeapSize, + T6: HeapSize, + T7: HeapSize, + T8: HeapSize, +> HeapSize for (T0, T1, T2, T3, T4, T5, T6, T7, T8) +{ + fn heap_size(&self) -> usize { + self.0.heap_size() + + self.1.heap_size() + + self.2.heap_size() + + self.3.heap_size() + + self.4.heap_size() + + self.5.heap_size() + + self.6.heap_size() + + self.7.heap_size() + + self.8.heap_size() + } +} + +impl< + T0: HeapSize, + T1: HeapSize, + T2: HeapSize, + T3: HeapSize, + T4: HeapSize, + T5: HeapSize, + T6: HeapSize, + T7: HeapSize, + T8: HeapSize, + T9: HeapSize, +> HeapSize for (T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) +{ + fn heap_size(&self) -> usize { + self.0.heap_size() + + self.1.heap_size() + + self.2.heap_size() + + self.3.heap_size() + + self.4.heap_size() + + self.5.heap_size() + + self.6.heap_size() + + self.7.heap_size() + + self.8.heap_size() + + self.9.heap_size() + } +} + +impl< + T0: HeapSize, + T1: HeapSize, + T2: HeapSize, + T3: HeapSize, + T4: HeapSize, + T5: HeapSize, + T6: HeapSize, + T7: HeapSize, + T8: HeapSize, + T9: HeapSize, + T10: HeapSize, +> HeapSize for (T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) +{ + fn heap_size(&self) -> usize { + self.0.heap_size() + + self.1.heap_size() + + self.2.heap_size() + + self.3.heap_size() + + 
self.4.heap_size() + + self.5.heap_size() + + self.6.heap_size() + + self.7.heap_size() + + self.8.heap_size() + + self.9.heap_size() + + self.10.heap_size() + } +} + +impl< + T0: HeapSize, + T1: HeapSize, + T2: HeapSize, + T3: HeapSize, + T4: HeapSize, + T5: HeapSize, + T6: HeapSize, + T7: HeapSize, + T8: HeapSize, + T9: HeapSize, + T10: HeapSize, + T11: HeapSize, +> HeapSize for (T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11) +{ + fn heap_size(&self) -> usize { + self.0.heap_size() + + self.1.heap_size() + + self.2.heap_size() + + self.3.heap_size() + + self.4.heap_size() + + self.5.heap_size() + + self.6.heap_size() + + self.7.heap_size() + + self.8.heap_size() + + self.9.heap_size() + + self.10.heap_size() + + self.11.heap_size() + } +} + +// ============================================================================= +// Array implementation +// ============================================================================= + +impl HeapSize for [T; N] { + fn heap_size(&self) -> usize { + self.iter().map(|item| item.heap_size()).sum() + } +} + +// ============================================================================= +// Synchronization primitives +// ============================================================================= + +impl HeapSize for Mutex { + fn heap_size(&self) -> usize { + // Try to lock; if poisoned or would block, return 0 as best effort + match self.try_lock() { + Ok(guard) => guard.heap_size(), + Err(_) => 0, + } + } +} + +impl HeapSize for RwLock { + fn heap_size(&self) -> usize { + // Try to read lock; if poisoned or would block, return 0 as best effort + match self.try_read() { + Ok(guard) => guard.heap_size(), + Err(_) => 0, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_string_heap_size() { + let s = String::from("hello"); + assert!(s.heap_size() >= 5); + } + + #[test] + fn test_vec_heap_size() { + let v: Vec = vec![1, 2, 3, 4, 5]; + assert!(v.heap_size() >= 5 * std::mem::size_of::()); + } + + 
#[test] + fn test_nested_vec_heap_size() { + let v: Vec = vec!["hello".to_string(), "world".to_string()]; + let size = v.heap_size(); + // Should include Vec buffer + String heap allocations + assert!(size >= 10); // "hello" + "world" = 10 chars minimum + } + + #[test] + fn test_option_heap_size() { + let some: Option = Some("hello".to_string()); + let none: Option = None; + + assert!(some.heap_size() >= 5); + assert_eq!(none.heap_size(), 0); + } + + #[test] + fn test_box_heap_size() { + let b = Box::new("hello".to_string()); + let size = b.heap_size(); + // Should include String struct size + string data + assert!(size >= std::mem::size_of::() + 5); + } + + #[test] + fn test_primitive_heap_size() { + assert_eq!(42i32.heap_size(), 0); + assert_eq!(3.14f64.heap_size(), 0); + assert_eq!(true.heap_size(), 0); + } + + #[test] + fn test_total_size() { + let s = String::from("hello"); + let total = s.total_size(); + assert_eq!(total, std::mem::size_of::() + s.heap_size()); + } + + #[test] + fn test_tuple_heap_size() { + let t0: () = (); + assert_eq!(t0.heap_size(), 0); + + let t1 = ("hello".to_string(),); + assert!(t1.heap_size() >= 5); + + let t2 = ("hello".to_string(), "world".to_string()); + assert!(t2.heap_size() >= 10); + + let t3 = (1i32, "hello".to_string(), vec![1u8, 2, 3]); + assert!(t3.heap_size() >= 5 + 3); // string + vec + } + + #[test] + fn test_array_heap_size() { + let arr: [i32; 5] = [1, 2, 3, 4, 5]; + assert_eq!(arr.heap_size(), 0); // primitives have no heap + + let arr: [String; 2] = ["hello".to_string(), "world".to_string()]; + assert!(arr.heap_size() >= 10); + + let arr: [Vec; 3] = [vec![1, 2], vec![3, 4, 5], vec![6]]; + assert!(arr.heap_size() >= 6); // at least 6 bytes for elements + } + + #[test] + fn test_mutex_heap_size() { + let m = Mutex::new("hello".to_string()); + assert!(m.heap_size() >= 5); + + let m = Mutex::new(vec![1i32, 2, 3]); + assert!(m.heap_size() >= 3 * std::mem::size_of::()); + } + + #[test] + fn test_rwlock_heap_size() { + let 
rw = RwLock::new("hello".to_string()); + assert!(rw.heap_size() >= 5); + + let rw = RwLock::new(vec![1i32, 2, 3]); + assert!(rw.heap_size() >= 3 * std::mem::size_of::()); + } +} diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index fbb52318d36c..e2d638a039bc 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -47,6 +47,8 @@ arrow-csv = { workspace = true, optional = true } arrow-data = { workspace = true } arrow-ipc = { workspace = true, optional = true } arrow-json = { workspace = true, optional = true } +arrow-memory-size = { workspace = true } +arrow-memory-size-derive = { workspace = true } arrow-ord = { workspace = true } arrow-pyarrow = { workspace = true, optional = true } arrow-row = { workspace = true } diff --git a/arrow/src/util/mod.rs b/arrow/src/util/mod.rs index 2c131669b73e..3f0e8509350d 100644 --- a/arrow/src/util/mod.rs +++ b/arrow/src/util/mod.rs @@ -21,6 +21,12 @@ pub use arrow_buffer::{bit_chunk_iterator, bit_util}; pub use arrow_data::bit_iterator; pub use arrow_data::bit_mask; +/// Re-export of [`arrow_memory_size::HeapSize`] for memory size estimation. +pub use arrow_memory_size::HeapSize; + +/// Re-export of [`arrow_memory_size_derive::HeapSize`] derive macro. 
+pub use arrow_memory_size_derive::HeapSize as HeapSizeDerive; + #[cfg(feature = "test_utils")] pub mod bench_util; #[cfg(feature = "test_utils")] diff --git a/dev/release/README.md b/dev/release/README.md index c89968b3ee69..431130862e85 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -243,6 +243,8 @@ Rust Arrow Crates: (cd arrow-schema && cargo publish) (cd arrow-data && cargo publish) +(cd arrow-memory-size && cargo publish) +(cd arrow-memory-size-derive && cargo publish) (cd arrow-array && cargo publish) (cd arrow-select && cargo publish) (cd arrow-ord && cargo publish) (cd arrow-cast && cargo publish) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index e02cb49874e6..b33d1bce5b1b 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -43,6 +43,7 @@ arrow-csv = { workspace = true, optional = true } arrow-data = { workspace = true, optional = true } arrow-schema = { workspace = true, optional = true } arrow-select = { workspace = true, optional = true } +arrow-memory-size = { workspace = true } arrow-ipc = { workspace = true, optional = true } parquet-geospatial = { workspace = true, optional = true } parquet-variant = { workspace = true, optional = true } diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 30c10e7f2293..9c7bd03a86cc 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -18,6 +18,10 @@ //! Memory calculations for [`ParquetMetadata::memory_size`] //! //! 
[`ParquetMetadata::memory_size`]: crate::file::metadata::ParquetMetaData::memory_size + +// Re-export HeapSize trait from arrow-memory-size for backward compatibility +pub use arrow_memory_size::HeapSize; + use crate::basic::{BoundaryOrder, ColumnOrder, Compression, Encoding, PageType}; use crate::data_type::private::ParquetValueType; use crate::file::metadata::{ @@ -29,103 +33,10 @@ use crate::file::page_index::column_index::{ }; use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; use crate::file::statistics::{Statistics, ValueStatistics}; -use std::collections::HashMap; -use std::sync::Arc; -/// Trait for calculating the size of various containers -pub trait HeapSize { - /// Return the size of any bytes allocated on the heap by this object, - /// including heap memory in those structures - /// - /// Note that the size of the type itself is not included in the result -- - /// instead, that size is added by the caller (e.g. container). - fn heap_size(&self) -> usize; -} - -impl HeapSize for Vec { - fn heap_size(&self) -> usize { - let item_size = std::mem::size_of::(); - // account for the contents of the Vec - (self.capacity() * item_size) + - // add any heap allocations by contents - self.iter().map(|t| t.heap_size()).sum::() - } -} - -impl HeapSize for HashMap { - fn heap_size(&self) -> usize { - let capacity = self.capacity(); - if capacity == 0 { - return 0; - } - - // HashMap doesn't provide a way to get its heap size, so this is an approximation based on - // the behavior of hashbrown::HashMap as at version 0.16.0, and may become inaccurate - // if the implementation changes. - let key_val_size = std::mem::size_of::<(K, V)>(); - // Overhead for the control tags group, which may be smaller depending on architecture - let group_size = 16; - // 1 byte of metadata stored per bucket. - let metadata_size = 1; - - // Compute the number of buckets for the capacity. 
Based on hashbrown's capacity_to_buckets - let buckets = if capacity < 15 { - let min_cap = match key_val_size { - 0..=1 => 14, - 2..=3 => 7, - _ => 3, - }; - let cap = min_cap.max(capacity); - if cap < 4 { - 4 - } else if cap < 8 { - 8 - } else { - 16 - } - } else { - (capacity.saturating_mul(8) / 7).next_power_of_two() - }; - - group_size - + (buckets * (key_val_size + metadata_size)) - + self.keys().map(|k| k.heap_size()).sum::() - + self.values().map(|v| v.heap_size()).sum::() - } -} - -impl HeapSize for Arc { - fn heap_size(&self) -> usize { - // Arc stores weak and strong counts on the heap alongside an instance of T - 2 * std::mem::size_of::() + std::mem::size_of::() + self.as_ref().heap_size() - } -} - -impl HeapSize for Arc { - fn heap_size(&self) -> usize { - 2 * std::mem::size_of::() - + std::mem::size_of_val(self.as_ref()) - + self.as_ref().heap_size() - } -} - -impl HeapSize for Box { - fn heap_size(&self) -> usize { - std::mem::size_of::() + self.as_ref().heap_size() - } -} - -impl HeapSize for Option { - fn heap_size(&self) -> usize { - self.as_ref().map(|inner| inner.heap_size()).unwrap_or(0) - } -} - -impl HeapSize for String { - fn heap_size(&self) -> usize { - self.capacity() - } -} +// ============================================================================= +// Parquet-specific HeapSize implementations +// ============================================================================= impl HeapSize for FileMetaData { fn heap_size(&self) -> usize { @@ -206,6 +117,7 @@ impl HeapSize for SortingColumn { 0 // no heap allocations } } + impl HeapSize for Compression { fn heap_size(&self) -> usize { 0 // no heap allocations @@ -287,43 +199,6 @@ impl HeapSize for ValueStatistics { + self.max_opt().map(T::heap_size).unwrap_or(0) } } -impl HeapSize for bool { - fn heap_size(&self) -> usize { - 0 // no heap allocations - } -} -impl HeapSize for u8 { - fn heap_size(&self) -> usize { - 0 // no heap allocations - } -} -impl HeapSize for i32 { - fn 
heap_size(&self) -> usize { - 0 // no heap allocations - } -} -impl HeapSize for i64 { - fn heap_size(&self) -> usize { - 0 // no heap allocations - } -} - -impl HeapSize for f32 { - fn heap_size(&self) -> usize { - 0 // no heap allocations - } -} -impl HeapSize for f64 { - fn heap_size(&self) -> usize { - 0 // no heap allocations - } -} - -impl HeapSize for usize { - fn heap_size(&self) -> usize { - 0 // no heap allocations - } -} impl HeapSize for BoundaryOrder { fn heap_size(&self) -> usize { diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 9304b6c25a2b..ae3ffca33ce5 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -87,7 +87,7 @@ //! * Same name, different struct //! ``` mod footer_tail; -mod memory; +pub mod memory; mod options; mod parser; mod push_decoder;