diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index d0371d0..2b071f0 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -4,24 +4,17 @@ "name": "Python 3", // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile "image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye", - "features": { - "ghcr.io/devcontainers/features/node:1": {}, - "ghcr.io/devcontainers/features/python:1": {} + "ghcr.io/devcontainers/features/node:1": {} }, - // Features to add to the dev container. More info: https://containers.dev/features. // "features": {}, - // Use 'forwardPorts' to make a list of ports inside the container available locally. // "forwardPorts": [], - // Use 'postCreateCommand' to run commands after the container is created. // "postCreateCommand": "pip3 install --user -r requirements.txt", - // Configure tool-specific properties. // "customizations": {}, - // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. // "remoteUser": "root" -} +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index 05c1df5..2dfab44 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,10 +1,10 @@ { "python.testing.pytestArgs": [ - "-v", - "./python/tests" + "-v" ], "python.testing.pytestEnabled": true, "python.testing.unittestEnabled": false, + "python.testing.cwd": "${workspaceFolder}/python", "jupyter.notebookFileRoot": "${workspaceFolder}/python", // Vitest configuration for TypeScript tests "vitest.root": "./typescript", diff --git a/python/README.md b/python/README.md index 56df366..ad6f072 100644 --- a/python/README.md +++ b/python/README.md @@ -17,12 +17,16 @@ pip install meshly - **`CustomFieldConfig`**: Configuration for custom field encoding/decoding - **`ArrayUtils`**: Utility class for encoding/decoding individual arrays - **`DataHandler`**: Unified interface for reading and writing files or zip archives +- **`CachedAssetLoader`**: Asset loader with disk cache for content-addressable storage +- **`LazyModel`**: Lazy proxy that defers asset loading until field access ### Key Capabilities - Automatic encoding/decoding of numpy array attributes, including nested dictionaries - Custom subclasses with additional array fields are automatically serialized - Custom field encoding via `_get_custom_fields()` override +- **Extract/Reconstruct API** for content-addressable storage with deduplication +- **Lazy loading** with `LazyModel` for deferred asset resolution - Enhanced polygon support with `index_sizes` and VTK-compatible `cell_types` - Mesh markers for boundary conditions, material regions, and geometric features - Mesh operations: triangulate, optimize, simplify, combine, extract @@ -187,116 +191,110 @@ loaded = SceneMesh.load_from_zip("scene.zip") # loaded.materials["wood"] is a MaterialProperties instance ``` -### Nested Packables +### Extract and Reconstruct API -Fields that are themselves `Packable` subclasses are automatically handled: +For content-addressable storage with deduplication, use the `extract()` and `reconstruct()` static methods: ```python -class PhysicsProperties(Packable): - """Physics data as a nested Packable.""" - mass: float = 1.0 - inertia_tensor: np.ndarray # 3x3 matrix +from meshly import Packable + +class SimulationResult(Packable): + """Simulation data with arrays.""" + time: float + temperature: np.ndarray + velocity: np.ndarray + +result = SimulationResult( + time=0.5, + 
temperature=np.array([300.0, 301.0, 302.0], dtype=np.float32), + velocity=np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32) +) -class PhysicsMesh(Mesh): - """Mesh with nested Packable field.""" - physics: Optional[PhysicsProperties] = None +# Extract to serializable data + assets +extracted = Packable.extract(result) +# extracted.data = {"time": 0.5, "temperature": {"$ref": "abc123..."}, "velocity": {"$ref": "def456..."}} +# extracted.assets = {"abc123...": , "def456...": } -# Nested Packables use their own encode/decode methods -mesh = PhysicsMesh( - vertices=vertices, - indices=indices, - physics=PhysicsProperties( - mass=2.5, - inertia_tensor=np.eye(3, dtype=np.float32) - ) -) +# Data is JSON-serializable +import json +json.dumps(extracted.data) # Works! -mesh.save_to_zip("physics_mesh.zip") -loaded = PhysicsMesh.load_from_zip("physics_mesh.zip") -print(loaded.physics.mass) # 2.5 +# Reconstruct from data + assets (eager loading) +rebuilt = Packable.reconstruct(SimulationResult, extracted.data, extracted.assets) +assert rebuilt.time == 0.5 ``` -### Caching Nested Packables +### Lazy Loading with Callable Assets -For large projects with shared nested Packables, use caching to deduplicate data using SHA256 content-addressable storage: +When assets is a callable (or `CachedAssetLoader`), `reconstruct()` returns a `LazyModel` that defers loading: ```python -from meshly import DataHandler +from meshly import Packable, CachedAssetLoader, DataHandler +from meshly.packable import LazyModel + +# Define a fetch function (e.g., from cloud storage) +def fetch_asset(checksum: str) -> bytes: + return cloud_storage.download(checksum) -# Create cache handlers from a directory path -cache_handler = DataHandler.create("/path/to/cache") +# Reconstruct with callable - returns LazyModel +lazy = Packable.reconstruct(SimulationResult, data, fetch_asset) +assert isinstance(lazy, LazyModel) -# Save with caching - nested Packables stored separately by hash -mesh.save_to_zip("mesh.zip", cache_handler=cache_handler) +# No assets loaded yet! +print(lazy.time) # Primitive field - no fetch needed +print(lazy.temperature) # NOW temperature asset is fetched +print(lazy.velocity) # NOW velocity asset is fetched -# Load with caching - nested Packables loaded from cache -loaded = PhysicsMesh.load_from_zip("mesh.zip", cache_handler=cache_handler) +# Get full Pydantic model +model = lazy.resolve() +assert isinstance(model, SimulationResult) ``` -**Deduplication example:** +### CachedAssetLoader for Disk Persistence + +Use `CachedAssetLoader` to cache fetched assets to disk: ```python -# Two meshes sharing identical physics properties -shared_physics = PhysicsProperties(mass=1.0, inertia_tensor=np.eye(3)) +from meshly import CachedAssetLoader, DataHandler -mesh1 = PhysicsMesh(vertices=v1, indices=i1, physics=shared_physics) -mesh2 = PhysicsMesh(vertices=v2, indices=i2, physics=shared_physics) +# Create disk cache +cache = DataHandler.create("/path/to/cache") +loader = CachedAssetLoader(fetch_asset, cache) -# Save both with the same cache handler - physics stored only once! -mesh1.save_to_zip("mesh1.zip", cache_handler=cache_handler) -mesh2.save_to_zip("mesh2.zip", cache_handler=cache_handler) +# First access fetches and caches +lazy = Packable.reconstruct(SimulationResult, data, loader) +temp = lazy.temperature # Fetches from source, saves to cache + +# Subsequent access reads from cache +lazy2 = Packable.reconstruct(SimulationResult, data, loader) +temp2 = lazy2.temperature # Reads from cache, no fetch! 
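+
+# Illustrative continuation (not from the original README): resolve() returns
+# the full Pydantic model, exactly as in the lazy-loading example above;
+# fields whose assets are already cached on disk are read back from the cache
+# rather than re-fetched.
+model2 = lazy2.resolve()
+assert isinstance(model2, SimulationResult)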
``` -**Custom cache handlers:** +### Deduplication with Extract -You can implement custom `DataHandler` subclasses for different storage backends: +Since assets are keyed by SHA256 checksum, identical arrays automatically deduplicate: ```python -from meshly.data_handler import DataHandler -from typing import Optional, List -from pathlib import Path - -class RedisDataHandler(DataHandler): - """Data handler backed by Redis.""" - def __init__(self, redis_client, prefix="packable:"): - super().__init__(source="", rel_path="") - self.redis = redis_client - self.prefix = prefix - - def read_binary(self, subpath) -> bytes: - data = self.redis.get(f"{self.prefix}{subpath}") - if data is None: - raise FileNotFoundError(f"Key not found: {self.prefix}{subpath}") - return data - - def read_text(self, subpath, encoding="utf-8") -> str: - return self.read_binary(subpath).decode(encoding) - - def list_files(self, subpath="", recursive=False) -> List[Path]: - raise NotImplementedError("File listing not supported") +# Two results with same temperature data +result1 = SimulationResult(time=0.0, temperature=shared_temp, velocity=v1) +result2 = SimulationResult(time=1.0, temperature=shared_temp, velocity=v2) +extracted1 = Packable.extract(result1) +extracted2 = Packable.extract(result2) -class RedisWriteHandler(WriteHandler): - """Write handler backed by Redis.""" - def __init__(self, redis_client, prefix="packable:"): - super().__init__(destination="", rel_path="") - self.redis = redis_client - self.prefix = prefix - - def write_binary(self, subpath, content, executable=False) -> None: - data = content if isinstance(content, bytes) else content.read() - self.redis.set(f"{self.prefix}{subpath}", data) - - def write_text(self, subpath, content, executable=False) -> None: - self.redis.set(f"{self.prefix}{subpath}", content.encode('utf-8')) +# Same checksum for temperature - deduplicated! +assert extracted1.data["temperature"] == extracted2.data["temperature"] +``` +**Note**: Direct Packable fields inside another Packable are not supported. 
Use `extract()` and `reconstruct()` for composing Packables, or embed Packables inside typed dicts: -# Usage with Redis -cache_writer = RedisWriteHandler(redis_client) -cache_reader = RedisReadHandler(redis_client) +```python +from typing import Dict -mesh.save_to_zip("mesh.zip", cache_handler=cache_writer) -loaded = PhysicsMesh.load_from_zip("mesh.zip", cache_handler=cache_reader) +class Container(Packable): + name: str + # Dict of Packables is allowed - extract() handles them + items: Dict[str, SimulationResult] = Field(default_factory=dict) ``` ## Architecture @@ -319,9 +317,11 @@ PackableMetadata (base metadata) The `Packable` base class provides: - `save_to_zip()` / `load_from_zip()` - File I/O with compression - `encode()` / `decode()` - In-memory serialization to/from bytes +- `extract()` / `reconstruct()` - Content-addressable storage with `$ref` checksums - `convert_to()` - Convert arrays between numpy and JAX - `_get_custom_fields()` - Override point for custom field encoding - `load_metadata()` - Generic metadata loading with type parameter +- `checksum` - Computed SHA256 checksum property ### Zip File Structure @@ -510,14 +510,14 @@ class CustomFieldConfig(Generic[V, M]): ```python class Packable(BaseModel): # File I/O - def save_to_zip(self, destination, cache_saver=None) -> None + def save_to_zip(self, destination, cache_handler=None) -> None @classmethod - def load_from_zip(cls, source, array_type=None, cache_loader=None) -> T + def load_from_zip(cls, source, array_type=None, cache_handler=None) -> T # In-memory serialization - def encode(self, cache_saver=None) -> bytes + def encode(self, cache_handler=None) -> bytes @classmethod - def decode(cls, buf: bytes, array_type=None, cache_loader=None) -> T + def decode(cls, buf: bytes, array_type=None, cache_handler=None) -> T # Array conversion def convert_to(self, array_type: ArrayType) -> T @@ -593,103 +593,73 @@ class MeshMetadata(PackableMetadata): array_type: ArrayType = "numpy" # "numpy" or "jax" ``` -### Cache Types +### DataHandler -```python -# Type aliases for cache callbacks -CacheLoader = Callable[[str], Optional[bytes]] # hash -> bytes or None -CacheSaver = Callable[[str, bytes], None] # hash, bytes -> None - -# Factory methods to create cache functions from paths -ReadHandler.create_cache_loader(source: PathLike) -> CacheLoader -WriteHandler.create_cache_saver(destination: PathLike) -> CacheSaver -``` - -### Data Handlers - -The `data_handler` module provides abstract interfaces for reading and writing data, supporting both regular files and zip archives. +The `data_handler` module provides a unified interface for reading and writing data, supporting both regular files and zip archives. 
```python -from meshly import ReadHandler, WriteHandler +from meshly import DataHandler -# ReadHandler - Abstract base for reading files -class ReadHandler: +# DataHandler - Unified interface for file I/O +class DataHandler: def __init__(self, source: PathLike | BytesIO, rel_path: str = "") - # Abstract methods (implemented by FileReadHandler, ZipReadHandler) + # Abstract methods (implemented by FileHandler, ZipHandler) def read_text(self, subpath: PathLike, encoding: str = "utf-8") -> str def read_binary(self, subpath: PathLike) -> bytes - def list_files(self, subpath: PathLike = "", recursive: bool = False) -> List[Path] - - # Navigate to subdirectory - def to_path(self, rel_path: str) -> ReadHandler - - # Factory method - automatically creates FileReadHandler or ZipReadHandler - @staticmethod - def create_handler(source: PathLike | BytesIO, rel_path: str = "") -> ReadHandler - - # Create cache loader for nested Packables - @staticmethod - def create_cache_loader(source: PathLike | BytesIO) -> CacheLoader - -# WriteHandler - Abstract base for writing files -class WriteHandler: - def __init__(self, destination: PathLike | BytesIO, rel_path: str = "") - - # Abstract methods (implemented by FileWriteHandler, ZipWriteHandler) def write_text(self, subpath: PathLike, content: str, executable: bool = False) -> None def write_binary(self, subpath: PathLike, content: bytes | BytesIO, executable: bool = False) -> None + def list_files(self, subpath: PathLike = "", recursive: bool = False) -> List[Path] + def exists(self, subpath: PathLike) -> bool + def remove_file(self, subpath: PathLike) -> None # FileHandler only; raises NotImplementedError for ZipHandler # Navigate to subdirectory - def to_path(self, rel_path: str) -> WriteHandler + def to_path(self, rel_path: str) -> DataHandler - # Factory method - automatically creates FileWriteHandler or ZipWriteHandler + # Factory method - automatically creates FileHandler or ZipHandler @staticmethod - def create_handler(destination: PathLike | BytesIO, rel_path: str = "") -> WriteHandler + def create(source: PathLike | BytesIO, rel_path: str = "") -> DataHandler - # Create cache saver for nested Packables - @staticmethod - def create_cache_saver(destination: PathLike | BytesIO) -> CacheSaver - - # Close resources (important for ZipWriteHandler) + # Close resources (important for ZipHandler) def finalize(self) -> None + + # Context manager support (calls finalize() on exit) + def __enter__(self) -> DataHandler + def __exit__(self, exc_type, exc_val, exc_tb) -> bool ``` #### Concrete Implementations ```python -# FileReadHandler - Read from filesystem -handler = FileReadHandler("/path/to/directory") +# FileHandler - Read/write from filesystem +handler = DataHandler.create("/path/to/directory") data = handler.read_binary("subdir/file.bin") -files = handler.list_files("subdir", recursive=True) - -# ZipReadHandler - Read from zip archives -with open("archive.zip", "rb") as f: - handler = ZipReadHandler(BytesIO(f.read())) - metadata = handler.read_text("metadata.json") - array_data = handler.read_binary("arrays/vertices/array.bin") - -# FileWriteHandler - Write to filesystem -handler = FileWriteHandler("/path/to/output") handler.write_text("config.json", '{"version": 1}') -handler.write_binary("data.bin", compressed_bytes) +files = handler.list_files("subdir", recursive=True) -# ZipWriteHandler - Write to zip archives +# ZipHandler - Read/write from zip archives (using context manager) buf = BytesIO() -handler = ZipWriteHandler(buf) 
-handler.write_text("metadata.json", json_string) -handler.write_binary("data.bin", array_bytes) -handler.finalize() # Important: closes the zip file +with DataHandler.create(buf) as handler: + handler.write_text("metadata.json", json_string) + handler.write_binary("data.bin", array_bytes) +# finalize() is automatically called when exiting the context zip_bytes = buf.getvalue() + +# Reading from existing zip +with open("archive.zip", "rb") as f: + with DataHandler.create(BytesIO(f.read())) as handler: + metadata = handler.read_text("metadata.json") + array_data = handler.read_binary("arrays/vertices/array.bin") ``` #### Advanced Usage ```python # Use handlers for custom storage backends -class S3ReadHandler(ReadHandler): - """Custom handler for reading from S3.""" +class S3DataHandler(DataHandler): + """Custom handler for reading/writing from S3.""" def __init__(self, bucket: str, prefix: str = ""): + super().__init__(source="", rel_path="") self.bucket = bucket self.prefix = prefix @@ -697,12 +667,36 @@ class S3ReadHandler(ReadHandler): key = f"{self.prefix}/{subpath}" if self.prefix else str(subpath) return s3_client.get_object(Bucket=self.bucket, Key=key)['Body'].read() + def write_binary(self, subpath: PathLike, content: bytes | BytesIO, executable: bool = False) -> None: + if isinstance(content, BytesIO): + content.seek(0) + content = content.read() + key = f"{self.prefix}/{subpath}" if self.prefix else str(subpath) + s3_client.put_object(Bucket=self.bucket, Key=key, Body=content) + + def exists(self, subpath: PathLike) -> bool: + key = f"{self.prefix}/{subpath}" if self.prefix else str(subpath) + try: + s3_client.head_object(Bucket=self.bucket, Key=key) + return True + except: + return False + # ... implement other methods -# Deterministic zip output (ZipWriteHandler uses fixed timestamps) +# Deterministic zip output (ZipHandler uses fixed timestamps) # This ensures identical content produces identical zip files -handler = ZipWriteHandler(buf) +handler = DataHandler.create(buf) # All files get timestamp (2020, 1, 1, 0, 0, 0) for reproducibility + +# Automatic mode switching for ZipHandler +handler = DataHandler.create(BytesIO()) +# Handler starts in write mode for empty buffer +handler.write_binary("file1.bin", data1) +# Automatically switches to read mode when needed +content = handler.read_binary("file1.bin") +# Switches back to write mode +handler.write_binary("file2.bin", data2) ``` ## Examples diff --git a/python/examples/extract_reconstruct_example.ipynb b/python/examples/extract_reconstruct_example.ipynb new file mode 100644 index 0000000..840192a --- /dev/null +++ b/python/examples/extract_reconstruct_example.ipynb @@ -0,0 +1,743 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ba736f7c", + "metadata": {}, + "source": [ + "# Extract and Reconstruct: Scientific Simulation Example\n", + "\n", + "This notebook demonstrates `Packable.extract()` and `reconstruct()` with a realistic scientific computing scenario:\n", + "\n", + "- A CFD simulation with mesh geometry and field data\n", + "- Nested Pydantic classes containing Packables (Mesh)\n", + "- Content-addressable storage for deduplication" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6f850881", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from meshly import Mesh, Packable\n", + "from pydantic import BaseModel, ConfigDict, Field" + ] + }, + { + "cell_type": "markdown", + "id": "d3ae1bf6", + "metadata": {}, + "source": [ + "## 1. 
Define Scientific Data Structures\n", + "\n", + "We'll model a CFD simulation with:\n", + "- `FieldData`: Scalar/vector field on the mesh (temperature, velocity, etc.)\n", + "- `SimulationSnapshot`: A single timestep with mesh + fields\n", + "- `SimulationCase`: Complete case with metadata and multiple snapshots" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "349483ca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data structures defined\n" + ] + } + ], + "source": [ + "class FieldData(BaseModel):\n", + " \"\"\"A field defined on mesh nodes or cells.\"\"\"\n", + " model_config = ConfigDict(arbitrary_types_allowed=True)\n", + "\n", + " name: str = Field(..., description=\"Field name (e.g., 'temperature', 'velocity')\")\n", + " field_type: str = Field(..., description=\"'scalar', 'vector', or 'tensor'\")\n", + " location: str = Field(\"node\", description=\"'node' or 'cell' centered\")\n", + " data: np.ndarray = Field(..., description=\"Field values\")\n", + " units: str | None = Field(None, description=\"Physical units\")\n", + "\n", + "\n", + "class SimulationSnapshot(BaseModel):\n", + " \"\"\"A single timestep of simulation data.\n", + " \n", + " Note: This is a regular Pydantic BaseModel (not Packable) that contains\n", + " a Mesh (which IS a Packable). This tests the nested Packable extraction.\n", + " \"\"\"\n", + " model_config = ConfigDict(arbitrary_types_allowed=True)\n", + "\n", + " time: float = Field(..., description=\"Simulation time\")\n", + " iteration: int = Field(..., description=\"Iteration number\")\n", + " mesh: Mesh = Field(..., description=\"Computational mesh\")\n", + " fields: dict[str, FieldData] = Field(default_factory=dict, description=\"Field data\")\n", + " residuals: np.ndarray | None = Field(None, description=\"Solver residuals\")\n", + "\n", + "\n", + "class SimulationCase(BaseModel):\n", + " \"\"\"Complete simulation case with multiple snapshots.\"\"\"\n", + " model_config = ConfigDict(arbitrary_types_allowed=True)\n", + "\n", + " name: str = Field(..., description=\"Case name\")\n", + " description: str = Field(\"\", description=\"Case description\")\n", + " solver: str = Field(..., description=\"Solver name\")\n", + " parameters: dict[str, float] = Field(default_factory=dict, description=\"Solver parameters\")\n", + " snapshots: list[SimulationSnapshot] = Field(default_factory=list, description=\"Time snapshots\")\n", + "\n", + "print(\"Data structures defined\")" + ] + }, + { + "cell_type": "markdown", + "id": "bcb88dff", + "metadata": {}, + "source": [ + "## 2. Create Sample Simulation Data\n", + "\n", + "Let's create a simple 2D heat transfer simulation on a quad mesh." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "be109c7d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created mesh: 25 vertices, 16 quads\n" + ] + } + ], + "source": [ + "# Create a simple 2D quad mesh (5x5 grid = 25 nodes, 16 quads)\n", + "nx, ny = 5, 5\n", + "x = np.linspace(0, 1, nx)\n", + "y = np.linspace(0, 1, ny)\n", + "xx, yy = np.meshgrid(x, y)\n", + "\n", + "vertices = np.column_stack([xx.ravel(), yy.ravel(), np.zeros(nx * ny)]).astype(np.float32)\n", + "\n", + "# Create quad indices\n", + "quads = []\n", + "for j in range(ny - 1):\n", + " for i in range(nx - 1):\n", + " n0 = j * nx + i\n", + " n1 = n0 + 1\n", + " n2 = n0 + nx + 1\n", + " n3 = n0 + nx\n", + " quads.append([n0, n1, n2, n3])\n", + "\n", + "indices = np.array(quads, dtype=np.uint32)\n", + "\n", + "mesh = Mesh(vertices=vertices, indices=indices)\n", + "print(f\"Created mesh: {mesh.vertex_count} vertices, {len(indices)} quads\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c7588b21", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created 3 snapshots\n", + " t=0.0: ['temperature', 'velocity']\n", + " t=0.1: ['temperature', 'velocity']\n", + " t=0.2: ['temperature', 'velocity']\n" + ] + } + ], + "source": [ + "# Create simulation snapshots at different times\n", + "def create_snapshot(time: float, iteration: int, mesh: Mesh) -> SimulationSnapshot:\n", + " \"\"\"Create a snapshot with temperature and velocity fields.\"\"\"\n", + " n_nodes = mesh.vertex_count\n", + " coords = mesh.vertices[:, :2] # x, y coordinates\n", + "\n", + " # Temperature: diffusing heat from center\n", + " center = np.array([0.5, 0.5])\n", + " r = np.linalg.norm(coords - center, axis=1)\n", + " temperature = 300 + 100 * np.exp(-r**2 / (0.1 + time))\n", + "\n", + " # Velocity: rotating flow\n", + " vx = -(coords[:, 1] - 0.5)\n", + " vy = (coords[:, 0] - 0.5)\n", + " velocity = np.column_stack([vx, vy, np.zeros(n_nodes)]).astype(np.float32)\n", + "\n", + " # Residuals (solver convergence)\n", + " residuals = np.array([1e-3 / (iteration + 1), 1e-4 / (iteration + 1)], dtype=np.float32)\n", + "\n", + " return SimulationSnapshot(\n", + " time=time,\n", + " iteration=iteration,\n", + " mesh=mesh,\n", + " fields={\n", + " \"temperature\": FieldData(\n", + " name=\"temperature\",\n", + " field_type=\"scalar\",\n", + " location=\"node\",\n", + " data=temperature.astype(np.float32),\n", + " units=\"K\"\n", + " ),\n", + " \"velocity\": FieldData(\n", + " name=\"velocity\",\n", + " field_type=\"vector\",\n", + " location=\"node\",\n", + " data=velocity,\n", + " units=\"m/s\"\n", + " )\n", + " },\n", + " residuals=residuals\n", + " )\n", + "\n", + "# Create snapshots at t=0, 0.1, 0.2\n", + "snapshots = [\n", + " create_snapshot(0.0, 0, mesh),\n", + " create_snapshot(0.1, 100, mesh),\n", + " create_snapshot(0.2, 200, mesh),\n", + "]\n", + "\n", + "print(f\"Created {len(snapshots)} snapshots\")\n", + "for s in snapshots:\n", + " print(f\" t={s.time}: {list(s.fields.keys())}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "93568d04", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Simulation case: heat_transfer_2d\n", + " Solver: simpleFoam\n", + " Parameters: {'dt': 0.001, 'nu': 1e-05, 'alpha': 0.0001}\n", + " Snapshots: 3\n" + ] + } + ], + "source": [ + "# Create the complete simulation case\n", + "case = SimulationCase(\n", + " 
name=\"heat_transfer_2d\",\n", + " description=\"2D heat transfer with rotating flow\",\n", + " solver=\"simpleFoam\",\n", + " parameters={\n", + " \"dt\": 0.001,\n", + " \"nu\": 1e-5,\n", + " \"alpha\": 1e-4,\n", + " },\n", + " snapshots=snapshots\n", + ")\n", + "\n", + "print(f\"Simulation case: {case.name}\")\n", + "print(f\" Solver: {case.solver}\")\n", + "print(f\" Parameters: {case.parameters}\")\n", + "print(f\" Snapshots: {len(case.snapshots)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "9c7048da", + "metadata": {}, + "source": [ + "## 3. Extract the Simulation Data\n", + "\n", + "`Packable.extract()` recursively processes the nested structure:\n", + "- Arrays → `{\"$ref\": checksum, \"$type\": \"array\"}`\n", + "- Nested Mesh (Packable) → `{\"$ref\": checksum, \"$type\": \"packable\", ...}`" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "95533188", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracted data keys: ['name', 'description', 'solver', 'parameters', 'snapshots']\n", + "\n", + "Total assets: 8\n", + "\n", + "Asset sizes:\n", + " 4e71a79c2d0fa381: 1,467 bytes\n", + " 28dc719a0c8c1387: 200 bytes\n", + " 59ffdd6bfac7876a: 250 bytes\n", + " 0c345962a52e7e2c: 133 bytes\n", + " 292cfc23f6777b02: 200 bytes\n", + " 17b38a2f2cbdd0a7: 133 bytes\n", + " 145838c08771e6ef: 201 bytes\n", + " ea37b2590dba4b31: 132 bytes\n" + ] + } + ], + "source": [ + "# Extract the entire simulation case\n", + "extracted = Packable.extract(case)\n", + "\n", + "print(f\"Extracted data keys: {list(extracted.data.keys())}\")\n", + "print(f\"\\nTotal assets: {len(extracted.assets)}\")\n", + "print(\"\\nAsset sizes:\")\n", + "for checksum, data in extracted.assets.items():\n", + " print(f\" {checksum}: {len(data):,} bytes\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ba82742d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracted data structure:\n", + "{\n", + " \"name\": \"heat_transfer_2d\",\n", + " \"description\": \"2D heat transfer with rotating flow\",\n", + " \"solver\": \"simpleFoam\",\n", + " \"parameters\": {\n", + " \"dt\": 0.001,\n", + " \"nu\": 1e-05,\n", + " \"alpha\": 0.0001\n", + " },\n", + " \"snapshots\": [\n", + " {\n", + " \"time\": 0.0,\n", + " \"iteration\": 0,\n", + " \"mesh\": {\n", + " \"$ref\": \"4e71a79c2d0fa381\"\n", + " },\n", + " \"fields\": {\n", + " \"temperature\": {\n", + " \"name\": \"temperature\",\n", + " \"field_type\": \"scalar\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"28dc719a0c8c1387\"\n", + " },\n", + " \"units\": \"K\"\n", + " },\n", + " \"velocity\": {\n", + " \"name\": \"velocity\",\n", + " \"field_type\": \"vector\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"59ffdd6bfac7876a\"\n", + " },\n", + " \"units\": \"m/s\"\n", + " }\n", + " },\n", + " \"residuals\": {\n", + " \"$ref\": \"0c345962a52e7e2c\"\n", + " }\n", + " },\n", + " {\n", + " \"time\": 0.1,\n", + " \"iteration\": 100,\n", + " \"mesh\": {\n", + " \"$ref\": \"4e71a79c2d0fa381\"\n", + " },\n", + " \"fields\": {\n", + " \"temperature\": {\n", + " \"name\": \"temperature\",\n", + " \"field_type\": \"scalar\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"292cfc23f6777b02\"\n", + " },\n", + " \"units\": \"K\"\n", + " },\n", + " \"velocity\": {\n", + " \"name\": \"velocity\",\n", + " \"field_type\": \"vector\",\n", + " \"location\": \"node\",\n", + " 
\"data\": {\n", + " \"$ref\": \"59ffdd6bfac7876a\"\n", + " },\n", + " \"units\": \"m/s\"\n", + " }\n", + " },\n", + " \"residuals\": {\n", + " \"$ref\": \"17b38a2f2cbdd0a7\"\n", + " }\n", + " },\n", + " {\n", + " \"time\": 0.2,\n", + " \"iteration\": 200,\n", + " \"mesh\": {\n", + " \"$ref\": \"4e71a79c2d0fa381\"\n", + " },\n", + " \"fields\": {\n", + " \"temperature\": {\n", + " \"name\": \"temperature\",\n", + " \"field_type\": \"scalar\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"145838c08771e6ef\"\n", + " },\n", + " \"units\": \"K\"\n", + " },\n", + " \"velocity\": {\n", + " \"name\": \"velocity\",\n", + " \"field_type\": \"vector\",\n", + " \"location\": \"node\",\n", + " \n", + "...\n" + ] + } + ], + "source": [ + "# Examine the extracted data structure\n", + "import json\n", + "\n", + "# Pretty print the extracted data (it's JSON-serializable!)\n", + "print(\"Extracted data structure:\")\n", + "print(json.dumps(extracted.data, indent=2)[:2000] + \"\\n...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6977cb53", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mesh reference: {'$ref': '4e71a79c2d0fa381'}\n" + ] + } + ], + "source": [ + "# Look at the first snapshot's mesh reference\n", + "mesh_ref = extracted.data[\"snapshots\"][0][\"mesh\"]\n", + "print(f\"Mesh reference: {mesh_ref}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "bc82716a", + "metadata": {}, + "source": [ + "## 4. Asset Deduplication\n", + "\n", + "Since all snapshots share the same mesh, it's only stored once!" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a251ef65", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mesh checksums: ['4e71a79c2d0fa381', '4e71a79c2d0fa381', '4e71a79c2d0fa381']\n", + "\n", + "All same? True\n", + "\n", + "The mesh is stored only ONCE in assets, saving 2,934 bytes!\n" + ] + } + ], + "source": [ + "# Check mesh references across snapshots\n", + "mesh_refs = [s[\"mesh\"][\"$ref\"] for s in extracted.data[\"snapshots\"]]\n", + "print(f\"Mesh checksums: {mesh_refs}\")\n", + "print(f\"\\nAll same? {len(set(mesh_refs)) == 1}\")\n", + "print(f\"\\nThe mesh is stored only ONCE in assets, saving {(len(mesh_refs)-1) * len(extracted.assets[mesh_refs[0]]):,} bytes!\")" + ] + }, + { + "cell_type": "markdown", + "id": "b732526c", + "metadata": {}, + "source": [ + "## 5. Reconstruct back to SimulationCase" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5c3761f7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Reconstructed case: heat_transfer_2d with 3 snapshots\n", + "Decoded mesh from reconstructed case: 25 vertices, 64 indices\n" + ] + } + ], + "source": [ + "reconstructed_case = Packable.reconstruct(SimulationCase, extracted.data, extracted.assets)\n", + "print(f\"\\nReconstructed case: {reconstructed_case.name} with {len(reconstructed_case.snapshots)} snapshots\")\n", + "\n", + "decoded_mesh = Mesh.decode(reconstructed_case.snapshots[0].mesh.encode())\n", + "print(f\"Decoded mesh from reconstructed case: {decoded_mesh.vertex_count} vertices, {len(decoded_mesh.indices)} indices\")" + ] + }, + { + "cell_type": "markdown", + "id": "ccaf56b9", + "metadata": {}, + "source": [ + "## 6. 
Lazy Loading with CachedAssetLoader\n", + "\n", + "When working with large datasets, you may want to:\n", + "- Load assets on-demand (lazy loading)\n", + "- Cache fetched assets to disk for subsequent runs\n", + "\n", + "`Packable.reconstruct()` supports this via `CachedAssetLoader`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ac9c08e1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Lazy loading with callable ===\n", + "\n", + "LazyModel created, no assets fetched yet. Fetch count: 0\n", + "Type: \n", + "\n", + "Case name: heat_transfer_2d\n", + "Fetch count after accessing name: 0\n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "\n", + "from meshly.data_handler import DataHandler\n", + "from meshly.packable import CachedAssetLoader\n", + "\n", + "# Simulate fetching assets from remote storage\n", + "fetch_count = [0]\n", + "\n", + "def fetch_from_storage(checksum: str) -> bytes:\n", + " \"\"\"Simulate fetching from cloud/remote storage.\"\"\"\n", + " fetch_count[0] += 1\n", + " print(f\" Fetching asset {checksum[:8]}... (fetch #{fetch_count[0]})\")\n", + " return extracted.assets[checksum]\n", + "\n", + "# Using a plain callable - lazy loading, assets fetched on field access\n", + "print(\"=== Lazy loading with callable ===\")\n", + "lazy_case = Packable.reconstruct(SimulationCase, extracted.data, fetch_from_storage)\n", + "\n", + "print(f\"\\nLazyModel created, no assets fetched yet. Fetch count: {fetch_count[0]}\")\n", + "print(f\"Type: {type(lazy_case)}\")\n", + "\n", + "# Access primitive fields - no fetch needed\n", + "print(f\"\\nCase name: {lazy_case.name}\")\n", + "print(f\"Fetch count after accessing name: {fetch_count[0]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "38bd4003", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Accessing first snapshot ===\n", + " Fetching asset 4e71a79c... (fetch #1)\n", + " Fetching asset 28dc719a... (fetch #2)\n", + " Fetching asset 59ffdd6b... (fetch #3)\n", + " Fetching asset 0c345962... (fetch #4)\n", + " Fetching asset 4e71a79c... (fetch #5)\n", + " Fetching asset 292cfc23... (fetch #6)\n", + " Fetching asset 59ffdd6b... (fetch #7)\n", + " Fetching asset 17b38a2f... (fetch #8)\n", + " Fetching asset 4e71a79c... (fetch #9)\n", + " Fetching asset 145838c0... (fetch #10)\n", + " Fetching asset 59ffdd6b... (fetch #11)\n", + " Fetching asset ea37b259... 
(fetch #12)\n", + "Fetch count after accessing snapshots: 12\n", + "\n", + "Snapshot time: 0.0\n", + "Mesh vertices shape: (25, 3)\n", + "\n", + "=== Resolving to full model ===\n", + "Final fetch count: 12\n", + "Resolved type: \n" + ] + } + ], + "source": [ + "# Access a snapshot - this triggers fetching of nested assets\n", + "print(\"=== Accessing first snapshot ===\")\n", + "snapshot = lazy_case.snapshots[0]\n", + "print(f\"Fetch count after accessing snapshots: {fetch_count[0]}\")\n", + "\n", + "# The mesh is fetched when we access it\n", + "print(f\"\\nSnapshot time: {snapshot.time}\")\n", + "print(f\"Mesh vertices shape: {snapshot.mesh.vertices.shape}\")\n", + "\n", + "# To fully resolve and get the actual Pydantic model:\n", + "print(\"\\n=== Resolving to full model ===\")\n", + "resolved_case = lazy_case.resolve()\n", + "print(f\"Final fetch count: {fetch_count[0]}\")\n", + "print(f\"Resolved type: {type(resolved_case)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "46d7b7c0", + "metadata": {}, + "source": [ + "### CachedAssetLoader: Persistent Disk Cache\n", + "\n", + "For repeated access, use `CachedAssetLoader` to cache fetched assets to disk:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "88d9c7be", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== First run: fetching and caching ===\n", + " Fetching asset 4e71a79c... (fetch #1)\n", + " Fetching asset 28dc719a... (fetch #2)\n", + " Fetching asset 59ffdd6b... (fetch #3)\n", + " Fetching asset 0c345962... (fetch #4)\n", + " Fetching asset 292cfc23... (fetch #5)\n", + " Fetching asset 17b38a2f... (fetch #6)\n", + " Fetching asset 145838c0... (fetch #7)\n", + " Fetching asset ea37b259... (fetch #8)\n", + "Assets fetched: 8\n", + "\n", + "=== Second run: reading from cache ===\n", + "Assets fetched from remote: 0 (all served from cache!)\n", + "Resolved case: heat_transfer_2d with 3 snapshots\n" + ] + } + ], + "source": [ + "import tempfile\n", + "\n", + "# Reset fetch counter\n", + "fetch_count[0] = 0\n", + "\n", + "with tempfile.TemporaryDirectory() as tmpdir:\n", + " cache_path = Path(tmpdir) / \"asset_cache\"\n", + "\n", + " # Create cache handler and loader\n", + " cache_handler = DataHandler.create(cache_path)\n", + " loader = CachedAssetLoader(fetch=fetch_from_storage, cache=cache_handler)\n", + "\n", + " print(\"=== First run: fetching and caching ===\")\n", + " lazy1 = Packable.reconstruct(SimulationCase, extracted.data, loader)\n", + " _ = lazy1.resolve() # Fetch all assets\n", + " print(f\"Assets fetched: {fetch_count[0]}\")\n", + "\n", + " # Finalize to persist cache\n", + " cache_handler.finalize()\n", + "\n", + " # Second run with same cache location\n", + " print(\"\\n=== Second run: reading from cache ===\")\n", + " fetch_count[0] = 0\n", + " cache_handler2 = DataHandler.create(cache_path)\n", + " loader2 = CachedAssetLoader(fetch=fetch_from_storage, cache=cache_handler2)\n", + "\n", + " lazy2 = Packable.reconstruct(SimulationCase, extracted.data, loader2)\n", + " resolved2 = lazy2.resolve()\n", + " print(f\"Assets fetched from remote: {fetch_count[0]} (all served from cache!)\")\n", + " print(f\"Resolved case: {resolved2.name} with {len(resolved2.snapshots)} snapshots\")" + ] + }, + { + "cell_type": "markdown", + "id": "1a54dcde", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "`Packable.extract()` is a **static method** that handles:\n", + "\n", + "| Input | Handling |\n", + "|-------|----------|\n", + "| Top-level 
Packable | Expands fields, arrays → refs |\n", + "| Nested Packable (in dict/list/BaseModel) | Becomes `{\"$ref\": checksum}` |\n", + "| NumPy arrays | Becomes `{\"$ref\": checksum}` |\n", + "| BaseModel | Recursively extracts fields |\n", + "| Primitives | Passed through unchanged |\n", + "\n", + "`Packable.reconstruct()` supports three modes:\n", + "\n", + "| AssetProvider | Result | Use Case |\n", + "|--------------|--------|----------|\n", + "| `Dict[str, bytes]` | `TModel` | Eager loading, all assets in memory |\n", + "| `AssetFetcher` | `LazyModel[TModel]` | Lazy per-field loading |\n", + "| `CachedAssetLoader` | `LazyModel[TModel]` | Lazy loading with disk cache |\n", + "\n", + "Key benefits for scientific computing:\n", + "- **Deduplication**: Shared meshes/arrays stored once\n", + "- **Lazy loading**: Load only the fields you need with `LazyModel`\n", + "- **Persistent caching**: `CachedAssetLoader` caches fetched assets to disk\n", + "- **JSON metadata**: Easy to query/index simulation cases\n", + "- **Version control friendly**: Small metadata files, large binary assets" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/examples/mesh_example.ipynb b/python/examples/mesh_example.ipynb index 31ea71e..f9e6cda 100644 --- a/python/examples/mesh_example.ipynb +++ b/python/examples/mesh_example.ipynb @@ -20,13 +20,13 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", + "from pathlib import Path\n", + "\n", "import numpy as np\n", - "from typing import Optional, List\n", - "from pydantic import Field\n", "\n", "# Import the Mesh class\n", - "from meshly import Mesh" + "from meshly import Mesh\n", + "from pydantic import Field" ] }, { @@ -44,13 +44,14 @@ "metadata": {}, "outputs": [], "source": [ - "from pydantic import BaseModel, ConfigDict\n", "from meshly import Packable\n", + "from pydantic import BaseModel, ConfigDict\n", + "\n", "\n", "class MaterialProperties(BaseModel):\n", " \"\"\"Material properties with numpy arrays - demonstrates BaseModel in dict edge case.\"\"\"\n", " model_config = ConfigDict(arbitrary_types_allowed=True)\n", - " \n", + "\n", " name: str = Field(..., description=\"Material name\")\n", " diffuse: np.ndarray = Field(..., description=\"Diffuse color array\")\n", " specular: np.ndarray = Field(..., description=\"Specular color array\")\n", @@ -75,11 +76,11 @@ " \"\"\"\n", " # Add texture coordinates and normals as additional numpy arrays\n", " texture_coords: np.ndarray = Field(..., description=\"Texture coordinates\")\n", - " normals: Optional[np.ndarray] = Field(None, description=\"Vertex normals\")\n", - " \n", + " normals: np.ndarray | None = Field(None, description=\"Vertex normals\")\n", + "\n", " # Add non-array attributes\n", " material_name: str = Field(\"default\", description=\"Material name\")\n", - " tags: List[str] = Field(default_factory=list, description=\"Tags for the mesh\")\n", + " tags: list[str] = Field(default_factory=list, description=\"Tags for the mesh\")\n", "\n", " # Dictionary containing nested dictionaries with arrays\n", " material_data: dict[str, dict[str, np.ndarray]] = Field(\n", @@ -97,13 +98,6 @@ " materials: dict[str, 
MaterialProperties] = Field(\n", " default_factory=dict,\n", " description=\"Dictionary of material name to MaterialProperties (BaseModel with arrays)\"\n", - " )\n", - " \n", - " # Nested Packable field - uses its own encode/decode methods\n", - " # This demonstrates automatic handling of Packable fields within other Packables\n", - " physics: Optional[PhysicsProperties] = Field(\n", - " None,\n", - " description=\"Physics properties as a nested Packable\"\n", " )" ] }, @@ -128,8 +122,7 @@ "Mesh created with 8 vertices and 36 indices\n", "Material name: cube_material\n", "Tags: ['cube', 'example']\n", - "Materials (BaseModel dict): ['cube_material', 'secondary_material']\n", - "Physics (nested Packable): mass=2.5, friction=0.7\n" + "Materials (BaseModel dict): ['cube_material', 'secondary_material']\n" ] } ], @@ -237,8 +230,7 @@ "print(f\"Mesh created with {mesh.vertex_count} vertices and {mesh.index_count} indices\")\n", "print(f\"Material name: {mesh.material_name}\")\n", "print(f\"Tags: {mesh.tags}\")\n", - "print(f\"Materials (BaseModel dict): {list(mesh.materials.keys())}\")\n", - "print(f\"Physics (nested Packable): mass={mesh.physics.mass}, friction={mesh.physics.friction}\")" + "print(f\"Materials (BaseModel dict): {list(mesh.materials.keys())}\")" ] }, { @@ -297,15 +289,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Saved mesh to textured_cube.zip, file size: 7695 bytes\n" + "Saved mesh to textured_cube.zip has 8 vertices and 36 indices\n", + "Decoded mesh has 8 vertices and 36 indices\n" ] } ], "source": [ "# Save the mesh to a zip file\n", - "zip_path = \"textured_cube.zip\"\n", + "zip_path = Path(\"textured_cube.zip\")\n", "mesh.save_to_zip(zip_path)\n", - "print(f\"Saved mesh to {zip_path}, file size: {os.path.getsize(zip_path)} bytes\")" + "assert zip_path.exists()\n", + "print(f\"Saved mesh to {zip_path} has {mesh.vertex_count} vertices and {mesh.index_count} indices\")\n", + "\n", + "\n", + "decoded_mesh = Mesh.decode(mesh.encode())\n", + "print(f\"Decoded mesh has {decoded_mesh.vertex_count} vertices and {decoded_mesh.index_count} indices\")" ] }, { @@ -346,20 +344,7 @@ " type: MaterialProperties\n", " diffuse: [0.2 0.8 0.2]\n", " specular: [0.3 0.3 0.3]\n", - " shininess: 16.0\n", - "\n", - "--- Nested Packable edge case ---\n", - "Physics type: PhysicsProperties\n", - "Physics mass: 2.5\n", - "Physics friction: 0.7\n", - "Physics inertia_tensor:\n", - "[[0.1 0. 0. ]\n", - " [0. 0.1 0. ]\n", - " [0. 0. 0.1]]\n", - "Physics collision_points:\n", - "[[-0.5 -0.5 -0.5]\n", - " [ 0.5 0.5 0.5]\n", - " [ 0. 0. 0. 
]]\n" + " shininess: 16.0\n" ] } ], @@ -377,22 +362,14 @@ "print(f\"Material colors: {loaded_mesh.material_colors}\")\n", "\n", "# Verify the dict[str, BaseModel] edge case was loaded correctly\n", - "print(f\"\\n--- BaseModel dict edge case ---\")\n", + "print(\"\\n--- BaseModel dict edge case ---\")\n", "print(f\"Materials keys: {list(loaded_mesh.materials.keys())}\")\n", "for mat_name, mat in loaded_mesh.materials.items():\n", " print(f\" {mat_name}:\")\n", " print(f\" type: {type(mat).__name__}\")\n", " print(f\" diffuse: {mat.diffuse}\")\n", " print(f\" specular: {mat.specular}\")\n", - " print(f\" shininess: {mat.shininess}\")\n", - "\n", - "# Verify the nested Packable was loaded correctly\n", - "print(f\"\\n--- Nested Packable edge case ---\")\n", - "print(f\"Physics type: {type(loaded_mesh.physics).__name__}\")\n", - "print(f\"Physics mass: {loaded_mesh.physics.mass}\")\n", - "print(f\"Physics friction: {loaded_mesh.physics.friction}\")\n", - "print(f\"Physics inertia_tensor:\\n{loaded_mesh.physics.inertia_tensor}\")\n", - "print(f\"Physics collision_points:\\n{loaded_mesh.physics.collision_points}\")" + " print(f\" shininess: {mat.shininess}\")\n" ] }, { @@ -429,10 +406,10 @@ " # Add bone weights and indices as additional numpy arrays\n", " bone_weights: np.ndarray = Field(..., description=\"Bone weights for each vertex\")\n", " bone_indices: np.ndarray = Field(..., description=\"Bone indices for each vertex\")\n", - " \n", + "\n", " # Add non-array attributes\n", " skeleton_name: str = Field(\"default\", description=\"Skeleton name\")\n", - " animation_names: List[str] = Field(default_factory=list, description=\"Animation names\")\n", + " animation_names: list[str] = Field(default_factory=list, description=\"Animation names\")\n", "\n", "# Create a simple skinned mesh\n", "skinned_mesh = SkinnedMesh(\n", @@ -469,7 +446,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Saved skinned mesh to skinned_cube.zip, file size: 2562 bytes\n", + "Saved skinned mesh to skinned_cube.zip, file size: 2477 bytes\n", "\n", "Loaded skinned mesh: 8 vertices, 36 indices\n", "Skeleton name: human_skeleton\n", @@ -481,9 +458,9 @@ ], "source": [ "# Save the skinned mesh to a zip file\n", - "skinned_zip_path = \"skinned_cube.zip\"\n", + "skinned_zip_path = Path(\"skinned_cube.zip\")\n", "skinned_mesh.save_to_zip(skinned_zip_path)\n", - "print(f\"Saved skinned mesh to {skinned_zip_path}, file size: {os.path.getsize(skinned_zip_path)} bytes\")\n", + "print(f\"Saved skinned mesh to {skinned_zip_path}, file size: {skinned_zip_path.stat().st_size} bytes\")\n", "\n", "# Load the skinned mesh from the zip file\n", "loaded_skinned_mesh = SkinnedMesh.load_from_zip(skinned_zip_path)\n", @@ -522,8 +499,8 @@ "source": [ "# Clean up\n", "for path in [zip_path, skinned_zip_path]:\n", - " if os.path.exists(path):\n", - " os.remove(path)\n", + " if Path(path).exists():\n", + " Path(path).unlink()\n", " print(f\"Removed {path}\")\n", "\n", "print(\"\\nExample completed successfully!\")" @@ -533,85 +510,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 9. Using Cache for Nested Packables\n", + "## 9. Using Callbacks for Nested Packables\n", "\n", - "When working with meshes that contain nested Packables (like our `TexturedMesh` with `PhysicsProperties`), you can use caching to deduplicate shared data and reduce file sizes. The cache uses SHA256 hashes for content-addressable storage." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cache directory: /tmp/tmpkflwgz8v\n", - "\n", - "Cached files (1 items):\n", - " 210dc1059e9d5af349f0dad45dbbdc8797eb82b49e7a3443528337e33ce60854.zip: 1157 bytes\n", - "\n", - "Original zip size: 0 bytes\n", - "Cached zip size: 6505 bytes\n", - "\n", - "--- Loaded from cache ---\n", - "Physics type: PhysicsProperties\n", - "Physics mass: 2.5\n", - "Physics friction: 0.7\n", - "Physics inertia_tensor:\n", - "[[0.1 0. 0. ]\n", - " [0. 0.1 0. ]\n", - " [0. 0. 0.1]]\n", - "\n", - "Removed textured_cube_cached.zip\n" - ] - } - ], - "source": [ - "import tempfile\n", - "from meshly import ReadHandler, WriteHandler\n", + "When working with meshes that contain nested Packables (like our `TexturedMesh` with `PhysicsProperties`), you can use callbacks to implement custom caching, storage, or deduplication strategies.\n", "\n", - "# Create a temporary cache directory\n", - "with tempfile.TemporaryDirectory() as cache_dir:\n", - " print(f\"Cache directory: {cache_dir}\")\n", - " \n", - " # Create cache saver and loader using the handler factory methods\n", - " cache_saver = WriteHandler.create_cache_saver(cache_dir)\n", - " cache_loader = ReadHandler.create_cache_loader(cache_dir)\n", - " \n", - " # Save the mesh with caching - nested PhysicsProperties will be cached separately\n", - " cached_zip_path = \"textured_cube_cached.zip\"\n", - " mesh.save_to_zip(cached_zip_path, cache_saver=cache_saver)\n", - " \n", - " # Check what was cached\n", - " import os\n", - " cache_files = os.listdir(cache_dir)\n", - " print(f\"\\nCached files ({len(cache_files)} items):\")\n", - " for f in cache_files:\n", - " file_path = os.path.join(cache_dir, f)\n", - " print(f\" {f}: {os.path.getsize(file_path)} bytes\")\n", - " \n", - " # Compare file sizes\n", - " original_size = os.path.getsize(zip_path) if os.path.exists(zip_path) else 0\n", - " cached_size = os.path.getsize(cached_zip_path)\n", - " print(f\"\\nOriginal zip size: {original_size} bytes\")\n", - " print(f\"Cached zip size: {cached_size} bytes\")\n", - " \n", - " # Load the mesh back using the cache\n", - " loaded_cached_mesh = TexturedMesh.load_from_zip(cached_zip_path, cache_loader=cache_loader)\n", - " \n", - " # Verify the nested Packable was loaded correctly from cache\n", - " print(f\"\\n--- Loaded from cache ---\")\n", - " print(f\"Physics type: {type(loaded_cached_mesh.physics).__name__}\")\n", - " print(f\"Physics mass: {loaded_cached_mesh.physics.mass}\")\n", - " print(f\"Physics friction: {loaded_cached_mesh.physics.friction}\")\n", - " print(f\"Physics inertia_tensor:\\n{loaded_cached_mesh.physics.inertia_tensor}\")\n", - " \n", - " # Clean up\n", - " if os.path.exists(cached_zip_path):\n", - " os.remove(cached_zip_path)\n", - " print(f\"\\nRemoved {cached_zip_path}\")" + "The callback types are:\n", + "- **`on_packable` (save)**: `Callable[[Packable, str], None]` - called with `(packable, checksum)` when saving\n", + "- **`on_packable` (load)**: `Callable[[Type[Packable], str], Optional[Packable]]` - called with `(packable_type, checksum)` when loading; return `None` to fall back to embedded data" ] }, { @@ -623,78 +528,6 @@ "When multiple meshes share the same nested Packable data, the cache automatically deduplicates them using SHA256 hashes." 
] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Cache directory for deduplication: /workspaces/meshly/cache\n", - "Cache entries: 1 (both meshes share the same physics cache)\n", - "\n", - "Mesh1 material: mesh1, physics mass: 1.0\n", - "Mesh2 material: mesh2, physics mass: 1.0\n" - ] - } - ], - "source": [ - "# Demonstrate cache deduplication - two meshes with identical physics properties\n", - "with tempfile.TemporaryDirectory() as cache_dir:\n", - " print(f\"\\nCache directory for deduplication: {cache_dir}\")\n", - " cache_saver = WriteHandler.create_cache_saver(cache_dir)\n", - " cache_loader = ReadHandler.create_cache_loader(cache_dir)\n", - " \n", - " # Create two meshes with identical physics (will share cache entry)\n", - " shared_physics = PhysicsProperties(\n", - " mass=1.0,\n", - " friction=0.5,\n", - " inertia_tensor=np.eye(3, dtype=np.float32),\n", - " collision_points=np.array([[0, 0, 0]], dtype=np.float32)\n", - " )\n", - " \n", - " mesh1 = TexturedMesh(\n", - " vertices=vertices,\n", - " indices=indices,\n", - " texture_coords=texture_coords,\n", - " normals=normals,\n", - " material_name=\"mesh1\",\n", - " physics=shared_physics\n", - " )\n", - " \n", - " mesh2 = TexturedMesh(\n", - " vertices=vertices * 2, # Different vertices\n", - " indices=indices,\n", - " texture_coords=texture_coords,\n", - " normals=normals,\n", - " material_name=\"mesh2\",\n", - " physics=shared_physics # Same physics - will be deduplicated!\n", - " )\n", - " \n", - " # Save both meshes with the same cache\n", - " mesh1.save_to_zip(\"mesh1.zip\", cache_saver=cache_saver)\n", - " mesh2.save_to_zip(\"mesh2.zip\", cache_saver=cache_saver)\n", - " \n", - " # Check the cache - should only have 1 entry (shared physics)\n", - " cache_files = os.listdir(cache_dir)\n", - " print(f\"Cache entries: {len(cache_files)} (both meshes share the same physics cache)\")\n", - " \n", - " # Load both meshes\n", - " loaded1 = TexturedMesh.load_from_zip(\"mesh1.zip\", cache_loader=cache_loader)\n", - " loaded2 = TexturedMesh.load_from_zip(\"mesh2.zip\", cache_loader=cache_loader)\n", - " \n", - " print(f\"\\nMesh1 material: {loaded1.material_name}, physics mass: {loaded1.physics.mass}\")\n", - " print(f\"Mesh2 material: {loaded2.material_name}, physics mass: {loaded2.physics.mass}\")\n", - " \n", - " # Clean up\n", - " for f in [\"mesh1.zip\", \"mesh2.zip\"]:\n", - " if os.path.exists(f):\n", - " os.remove(f)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -704,14 +537,14 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Converted skinned mesh to JAX arrays, vertex dtype: float32\n" + "JAX not available - skipping conversion example\n" ] } ], @@ -734,16 +567,13 @@ "This notebook demonstrated:\n", "- Creating custom Mesh subclasses with additional numpy arrays\n", "- Working with nested dictionaries containing arrays\n", - "- Using BaseModel instances with arrays inside dictionaries\n", - "- **Nested Packables** - fields that are themselves Packable classes\n", - "- **Cache support** - using `WriteHandler.create_cache_saver()` and `ReadHandler.create_cache_loader()` for content-addressable storage\n", - "- **Deduplication** - identical nested Packables share the same cache entry" + "- Using BaseModel instances with arrays inside dictionaries" ] } ], "metadata": { "kernelspec": { - 
"display_name": "Python 3", + "display_name": "base", "language": "python", "name": "python3" }, @@ -757,7 +587,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.11" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/python/examples/reconstruct_example.ipynb b/python/examples/reconstruct_example.ipynb new file mode 100644 index 0000000..840192a --- /dev/null +++ b/python/examples/reconstruct_example.ipynb @@ -0,0 +1,743 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ba736f7c", + "metadata": {}, + "source": [ + "# Extract and Reconstruct: Scientific Simulation Example\n", + "\n", + "This notebook demonstrates `Packable.extract()` and `reconstruct()` with a realistic scientific computing scenario:\n", + "\n", + "- A CFD simulation with mesh geometry and field data\n", + "- Nested Pydantic classes containing Packables (Mesh)\n", + "- Content-addressable storage for deduplication" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6f850881", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from meshly import Mesh, Packable\n", + "from pydantic import BaseModel, ConfigDict, Field" + ] + }, + { + "cell_type": "markdown", + "id": "d3ae1bf6", + "metadata": {}, + "source": [ + "## 1. Define Scientific Data Structures\n", + "\n", + "We'll model a CFD simulation with:\n", + "- `FieldData`: Scalar/vector field on the mesh (temperature, velocity, etc.)\n", + "- `SimulationSnapshot`: A single timestep with mesh + fields\n", + "- `SimulationCase`: Complete case with metadata and multiple snapshots" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "349483ca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data structures defined\n" + ] + } + ], + "source": [ + "class FieldData(BaseModel):\n", + " \"\"\"A field defined on mesh nodes or cells.\"\"\"\n", + " model_config = ConfigDict(arbitrary_types_allowed=True)\n", + "\n", + " name: str = Field(..., description=\"Field name (e.g., 'temperature', 'velocity')\")\n", + " field_type: str = Field(..., description=\"'scalar', 'vector', or 'tensor'\")\n", + " location: str = Field(\"node\", description=\"'node' or 'cell' centered\")\n", + " data: np.ndarray = Field(..., description=\"Field values\")\n", + " units: str | None = Field(None, description=\"Physical units\")\n", + "\n", + "\n", + "class SimulationSnapshot(BaseModel):\n", + " \"\"\"A single timestep of simulation data.\n", + " \n", + " Note: This is a regular Pydantic BaseModel (not Packable) that contains\n", + " a Mesh (which IS a Packable). 
This tests the nested Packable extraction.\n", + " \"\"\"\n", + " model_config = ConfigDict(arbitrary_types_allowed=True)\n", + "\n", + " time: float = Field(..., description=\"Simulation time\")\n", + " iteration: int = Field(..., description=\"Iteration number\")\n", + " mesh: Mesh = Field(..., description=\"Computational mesh\")\n", + " fields: dict[str, FieldData] = Field(default_factory=dict, description=\"Field data\")\n", + " residuals: np.ndarray | None = Field(None, description=\"Solver residuals\")\n", + "\n", + "\n", + "class SimulationCase(BaseModel):\n", + " \"\"\"Complete simulation case with multiple snapshots.\"\"\"\n", + " model_config = ConfigDict(arbitrary_types_allowed=True)\n", + "\n", + " name: str = Field(..., description=\"Case name\")\n", + " description: str = Field(\"\", description=\"Case description\")\n", + " solver: str = Field(..., description=\"Solver name\")\n", + " parameters: dict[str, float] = Field(default_factory=dict, description=\"Solver parameters\")\n", + " snapshots: list[SimulationSnapshot] = Field(default_factory=list, description=\"Time snapshots\")\n", + "\n", + "print(\"Data structures defined\")" + ] + }, + { + "cell_type": "markdown", + "id": "bcb88dff", + "metadata": {}, + "source": [ + "## 2. Create Sample Simulation Data\n", + "\n", + "Let's create a simple 2D heat transfer simulation on a quad mesh." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "be109c7d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created mesh: 25 vertices, 16 quads\n" + ] + } + ], + "source": [ + "# Create a simple 2D quad mesh (5x5 grid = 25 nodes, 16 quads)\n", + "nx, ny = 5, 5\n", + "x = np.linspace(0, 1, nx)\n", + "y = np.linspace(0, 1, ny)\n", + "xx, yy = np.meshgrid(x, y)\n", + "\n", + "vertices = np.column_stack([xx.ravel(), yy.ravel(), np.zeros(nx * ny)]).astype(np.float32)\n", + "\n", + "# Create quad indices\n", + "quads = []\n", + "for j in range(ny - 1):\n", + " for i in range(nx - 1):\n", + " n0 = j * nx + i\n", + " n1 = n0 + 1\n", + " n2 = n0 + nx + 1\n", + " n3 = n0 + nx\n", + " quads.append([n0, n1, n2, n3])\n", + "\n", + "indices = np.array(quads, dtype=np.uint32)\n", + "\n", + "mesh = Mesh(vertices=vertices, indices=indices)\n", + "print(f\"Created mesh: {mesh.vertex_count} vertices, {len(indices)} quads\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c7588b21", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created 3 snapshots\n", + " t=0.0: ['temperature', 'velocity']\n", + " t=0.1: ['temperature', 'velocity']\n", + " t=0.2: ['temperature', 'velocity']\n" + ] + } + ], + "source": [ + "# Create simulation snapshots at different times\n", + "def create_snapshot(time: float, iteration: int, mesh: Mesh) -> SimulationSnapshot:\n", + " \"\"\"Create a snapshot with temperature and velocity fields.\"\"\"\n", + " n_nodes = mesh.vertex_count\n", + " coords = mesh.vertices[:, :2] # x, y coordinates\n", + "\n", + " # Temperature: diffusing heat from center\n", + " center = np.array([0.5, 0.5])\n", + " r = np.linalg.norm(coords - center, axis=1)\n", + " temperature = 300 + 100 * np.exp(-r**2 / (0.1 + time))\n", + "\n", + " # Velocity: rotating flow\n", + " vx = -(coords[:, 1] - 0.5)\n", + " vy = (coords[:, 0] - 0.5)\n", + " velocity = np.column_stack([vx, vy, np.zeros(n_nodes)]).astype(np.float32)\n", + "\n", + " # Residuals (solver convergence)\n", + " residuals = np.array([1e-3 / (iteration + 1), 1e-4 / 
(iteration + 1)], dtype=np.float32)\n", + "\n", + " return SimulationSnapshot(\n", + " time=time,\n", + " iteration=iteration,\n", + " mesh=mesh,\n", + " fields={\n", + " \"temperature\": FieldData(\n", + " name=\"temperature\",\n", + " field_type=\"scalar\",\n", + " location=\"node\",\n", + " data=temperature.astype(np.float32),\n", + " units=\"K\"\n", + " ),\n", + " \"velocity\": FieldData(\n", + " name=\"velocity\",\n", + " field_type=\"vector\",\n", + " location=\"node\",\n", + " data=velocity,\n", + " units=\"m/s\"\n", + " )\n", + " },\n", + " residuals=residuals\n", + " )\n", + "\n", + "# Create snapshots at t=0, 0.1, 0.2\n", + "snapshots = [\n", + " create_snapshot(0.0, 0, mesh),\n", + " create_snapshot(0.1, 100, mesh),\n", + " create_snapshot(0.2, 200, mesh),\n", + "]\n", + "\n", + "print(f\"Created {len(snapshots)} snapshots\")\n", + "for s in snapshots:\n", + " print(f\" t={s.time}: {list(s.fields.keys())}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "93568d04", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Simulation case: heat_transfer_2d\n", + " Solver: simpleFoam\n", + " Parameters: {'dt': 0.001, 'nu': 1e-05, 'alpha': 0.0001}\n", + " Snapshots: 3\n" + ] + } + ], + "source": [ + "# Create the complete simulation case\n", + "case = SimulationCase(\n", + " name=\"heat_transfer_2d\",\n", + " description=\"2D heat transfer with rotating flow\",\n", + " solver=\"simpleFoam\",\n", + " parameters={\n", + " \"dt\": 0.001,\n", + " \"nu\": 1e-5,\n", + " \"alpha\": 1e-4,\n", + " },\n", + " snapshots=snapshots\n", + ")\n", + "\n", + "print(f\"Simulation case: {case.name}\")\n", + "print(f\" Solver: {case.solver}\")\n", + "print(f\" Parameters: {case.parameters}\")\n", + "print(f\" Snapshots: {len(case.snapshots)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "9c7048da", + "metadata": {}, + "source": [ + "## 3. 
Extract the Simulation Data\n", + "\n", + "`Packable.extract()` recursively processes the nested structure:\n", + "- Arrays → `{\"$ref\": checksum, \"$type\": \"array\"}`\n", + "- Nested Mesh (Packable) → `{\"$ref\": checksum, \"$type\": \"packable\", ...}`" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "95533188", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracted data keys: ['name', 'description', 'solver', 'parameters', 'snapshots']\n", + "\n", + "Total assets: 8\n", + "\n", + "Asset sizes:\n", + " 4e71a79c2d0fa381: 1,467 bytes\n", + " 28dc719a0c8c1387: 200 bytes\n", + " 59ffdd6bfac7876a: 250 bytes\n", + " 0c345962a52e7e2c: 133 bytes\n", + " 292cfc23f6777b02: 200 bytes\n", + " 17b38a2f2cbdd0a7: 133 bytes\n", + " 145838c08771e6ef: 201 bytes\n", + " ea37b2590dba4b31: 132 bytes\n" + ] + } + ], + "source": [ + "# Extract the entire simulation case\n", + "extracted = Packable.extract(case)\n", + "\n", + "print(f\"Extracted data keys: {list(extracted.data.keys())}\")\n", + "print(f\"\\nTotal assets: {len(extracted.assets)}\")\n", + "print(\"\\nAsset sizes:\")\n", + "for checksum, data in extracted.assets.items():\n", + " print(f\" {checksum}: {len(data):,} bytes\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ba82742d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracted data structure:\n", + "{\n", + " \"name\": \"heat_transfer_2d\",\n", + " \"description\": \"2D heat transfer with rotating flow\",\n", + " \"solver\": \"simpleFoam\",\n", + " \"parameters\": {\n", + " \"dt\": 0.001,\n", + " \"nu\": 1e-05,\n", + " \"alpha\": 0.0001\n", + " },\n", + " \"snapshots\": [\n", + " {\n", + " \"time\": 0.0,\n", + " \"iteration\": 0,\n", + " \"mesh\": {\n", + " \"$ref\": \"4e71a79c2d0fa381\"\n", + " },\n", + " \"fields\": {\n", + " \"temperature\": {\n", + " \"name\": \"temperature\",\n", + " \"field_type\": \"scalar\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"28dc719a0c8c1387\"\n", + " },\n", + " \"units\": \"K\"\n", + " },\n", + " \"velocity\": {\n", + " \"name\": \"velocity\",\n", + " \"field_type\": \"vector\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"59ffdd6bfac7876a\"\n", + " },\n", + " \"units\": \"m/s\"\n", + " }\n", + " },\n", + " \"residuals\": {\n", + " \"$ref\": \"0c345962a52e7e2c\"\n", + " }\n", + " },\n", + " {\n", + " \"time\": 0.1,\n", + " \"iteration\": 100,\n", + " \"mesh\": {\n", + " \"$ref\": \"4e71a79c2d0fa381\"\n", + " },\n", + " \"fields\": {\n", + " \"temperature\": {\n", + " \"name\": \"temperature\",\n", + " \"field_type\": \"scalar\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"292cfc23f6777b02\"\n", + " },\n", + " \"units\": \"K\"\n", + " },\n", + " \"velocity\": {\n", + " \"name\": \"velocity\",\n", + " \"field_type\": \"vector\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"59ffdd6bfac7876a\"\n", + " },\n", + " \"units\": \"m/s\"\n", + " }\n", + " },\n", + " \"residuals\": {\n", + " \"$ref\": \"17b38a2f2cbdd0a7\"\n", + " }\n", + " },\n", + " {\n", + " \"time\": 0.2,\n", + " \"iteration\": 200,\n", + " \"mesh\": {\n", + " \"$ref\": \"4e71a79c2d0fa381\"\n", + " },\n", + " \"fields\": {\n", + " \"temperature\": {\n", + " \"name\": \"temperature\",\n", + " \"field_type\": \"scalar\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"145838c08771e6ef\"\n", + " },\n", + " \"units\": 
\"K\"\n", + " },\n", + " \"velocity\": {\n", + " \"name\": \"velocity\",\n", + " \"field_type\": \"vector\",\n", + " \"location\": \"node\",\n", + " \n", + "...\n" + ] + } + ], + "source": [ + "# Examine the extracted data structure\n", + "import json\n", + "\n", + "# Pretty print the extracted data (it's JSON-serializable!)\n", + "print(\"Extracted data structure:\")\n", + "print(json.dumps(extracted.data, indent=2)[:2000] + \"\\n...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6977cb53", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mesh reference: {'$ref': '4e71a79c2d0fa381'}\n" + ] + } + ], + "source": [ + "# Look at the first snapshot's mesh reference\n", + "mesh_ref = extracted.data[\"snapshots\"][0][\"mesh\"]\n", + "print(f\"Mesh reference: {mesh_ref}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "bc82716a", + "metadata": {}, + "source": [ + "## 4. Asset Deduplication\n", + "\n", + "Since all snapshots share the same mesh, it's only stored once!" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a251ef65", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mesh checksums: ['4e71a79c2d0fa381', '4e71a79c2d0fa381', '4e71a79c2d0fa381']\n", + "\n", + "All same? True\n", + "\n", + "The mesh is stored only ONCE in assets, saving 2,934 bytes!\n" + ] + } + ], + "source": [ + "# Check mesh references across snapshots\n", + "mesh_refs = [s[\"mesh\"][\"$ref\"] for s in extracted.data[\"snapshots\"]]\n", + "print(f\"Mesh checksums: {mesh_refs}\")\n", + "print(f\"\\nAll same? {len(set(mesh_refs)) == 1}\")\n", + "print(f\"\\nThe mesh is stored only ONCE in assets, saving {(len(mesh_refs)-1) * len(extracted.assets[mesh_refs[0]]):,} bytes!\")" + ] + }, + { + "cell_type": "markdown", + "id": "b732526c", + "metadata": {}, + "source": [ + "## 5. Reconstruct back to SimulationCase" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5c3761f7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Reconstructed case: heat_transfer_2d with 3 snapshots\n", + "Decoded mesh from reconstructed case: 25 vertices, 64 indices\n" + ] + } + ], + "source": [ + "reconstructed_case = Packable.reconstruct(SimulationCase, extracted.data, extracted.assets)\n", + "print(f\"\\nReconstructed case: {reconstructed_case.name} with {len(reconstructed_case.snapshots)} snapshots\")\n", + "\n", + "decoded_mesh = Mesh.decode(reconstructed_case.snapshots[0].mesh.encode())\n", + "print(f\"Decoded mesh from reconstructed case: {decoded_mesh.vertex_count} vertices, {len(decoded_mesh.indices)} indices\")" + ] + }, + { + "cell_type": "markdown", + "id": "ccaf56b9", + "metadata": {}, + "source": [ + "## 6. Lazy Loading with CachedAssetLoader\n", + "\n", + "When working with large datasets, you may want to:\n", + "- Load assets on-demand (lazy loading)\n", + "- Cache fetched assets to disk for subsequent runs\n", + "\n", + "`Packable.reconstruct()` supports this via `CachedAssetLoader`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ac9c08e1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Lazy loading with callable ===\n", + "\n", + "LazyModel created, no assets fetched yet. 
Fetch count: 0\n", + "Type: \n", + "\n", + "Case name: heat_transfer_2d\n", + "Fetch count after accessing name: 0\n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "\n", + "from meshly.data_handler import DataHandler\n", + "from meshly.packable import CachedAssetLoader\n", + "\n", + "# Simulate fetching assets from remote storage\n", + "fetch_count = [0]\n", + "\n", + "def fetch_from_storage(checksum: str) -> bytes:\n", + " \"\"\"Simulate fetching from cloud/remote storage.\"\"\"\n", + " fetch_count[0] += 1\n", + " print(f\" Fetching asset {checksum[:8]}... (fetch #{fetch_count[0]})\")\n", + " return extracted.assets[checksum]\n", + "\n", + "# Using a plain callable - lazy loading, assets fetched on field access\n", + "print(\"=== Lazy loading with callable ===\")\n", + "lazy_case = Packable.reconstruct(SimulationCase, extracted.data, fetch_from_storage)\n", + "\n", + "print(f\"\\nLazyModel created, no assets fetched yet. Fetch count: {fetch_count[0]}\")\n", + "print(f\"Type: {type(lazy_case)}\")\n", + "\n", + "# Access primitive fields - no fetch needed\n", + "print(f\"\\nCase name: {lazy_case.name}\")\n", + "print(f\"Fetch count after accessing name: {fetch_count[0]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "38bd4003", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Accessing first snapshot ===\n", + " Fetching asset 4e71a79c... (fetch #1)\n", + " Fetching asset 28dc719a... (fetch #2)\n", + " Fetching asset 59ffdd6b... (fetch #3)\n", + " Fetching asset 0c345962... (fetch #4)\n", + " Fetching asset 4e71a79c... (fetch #5)\n", + " Fetching asset 292cfc23... (fetch #6)\n", + " Fetching asset 59ffdd6b... (fetch #7)\n", + " Fetching asset 17b38a2f... (fetch #8)\n", + " Fetching asset 4e71a79c... (fetch #9)\n", + " Fetching asset 145838c0... (fetch #10)\n", + " Fetching asset 59ffdd6b... (fetch #11)\n", + " Fetching asset ea37b259... (fetch #12)\n", + "Fetch count after accessing snapshots: 12\n", + "\n", + "Snapshot time: 0.0\n", + "Mesh vertices shape: (25, 3)\n", + "\n", + "=== Resolving to full model ===\n", + "Final fetch count: 12\n", + "Resolved type: \n" + ] + } + ], + "source": [ + "# Access a snapshot - this triggers fetching of nested assets\n", + "print(\"=== Accessing first snapshot ===\")\n", + "snapshot = lazy_case.snapshots[0]\n", + "print(f\"Fetch count after accessing snapshots: {fetch_count[0]}\")\n", + "\n", + "# The mesh is fetched when we access it\n", + "print(f\"\\nSnapshot time: {snapshot.time}\")\n", + "print(f\"Mesh vertices shape: {snapshot.mesh.vertices.shape}\")\n", + "\n", + "# To fully resolve and get the actual Pydantic model:\n", + "print(\"\\n=== Resolving to full model ===\")\n", + "resolved_case = lazy_case.resolve()\n", + "print(f\"Final fetch count: {fetch_count[0]}\")\n", + "print(f\"Resolved type: {type(resolved_case)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "46d7b7c0", + "metadata": {}, + "source": [ + "### CachedAssetLoader: Persistent Disk Cache\n", + "\n", + "For repeated access, use `CachedAssetLoader` to cache fetched assets to disk:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "88d9c7be", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== First run: fetching and caching ===\n", + " Fetching asset 4e71a79c... (fetch #1)\n", + " Fetching asset 28dc719a... (fetch #2)\n", + " Fetching asset 59ffdd6b... (fetch #3)\n", + " Fetching asset 0c345962... 
(fetch #4)\n", + " Fetching asset 292cfc23... (fetch #5)\n", + " Fetching asset 17b38a2f... (fetch #6)\n", + " Fetching asset 145838c0... (fetch #7)\n", + " Fetching asset ea37b259... (fetch #8)\n", + "Assets fetched: 8\n", + "\n", + "=== Second run: reading from cache ===\n", + "Assets fetched from remote: 0 (all served from cache!)\n", + "Resolved case: heat_transfer_2d with 3 snapshots\n" + ] + } + ], + "source": [ + "import tempfile\n", + "\n", + "# Reset fetch counter\n", + "fetch_count[0] = 0\n", + "\n", + "with tempfile.TemporaryDirectory() as tmpdir:\n", + " cache_path = Path(tmpdir) / \"asset_cache\"\n", + "\n", + " # Create cache handler and loader\n", + " cache_handler = DataHandler.create(cache_path)\n", + " loader = CachedAssetLoader(fetch=fetch_from_storage, cache=cache_handler)\n", + "\n", + " print(\"=== First run: fetching and caching ===\")\n", + " lazy1 = Packable.reconstruct(SimulationCase, extracted.data, loader)\n", + " _ = lazy1.resolve() # Fetch all assets\n", + " print(f\"Assets fetched: {fetch_count[0]}\")\n", + "\n", + " # Finalize to persist cache\n", + " cache_handler.finalize()\n", + "\n", + " # Second run with same cache location\n", + " print(\"\\n=== Second run: reading from cache ===\")\n", + " fetch_count[0] = 0\n", + " cache_handler2 = DataHandler.create(cache_path)\n", + " loader2 = CachedAssetLoader(fetch=fetch_from_storage, cache=cache_handler2)\n", + "\n", + " lazy2 = Packable.reconstruct(SimulationCase, extracted.data, loader2)\n", + " resolved2 = lazy2.resolve()\n", + " print(f\"Assets fetched from remote: {fetch_count[0]} (all served from cache!)\")\n", + " print(f\"Resolved case: {resolved2.name} with {len(resolved2.snapshots)} snapshots\")" + ] + }, + { + "cell_type": "markdown", + "id": "1a54dcde", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "`Packable.extract()` is a **static method** that handles:\n", + "\n", + "| Input | Handling |\n", + "|-------|----------|\n", + "| Top-level Packable | Expands fields, arrays → refs |\n", + "| Nested Packable (in dict/list/BaseModel) | Becomes `{\"$ref\": checksum}` |\n", + "| NumPy arrays | Becomes `{\"$ref\": checksum}` |\n", + "| BaseModel | Recursively extracts fields |\n", + "| Primitives | Passed through unchanged |\n", + "\n", + "`Packable.reconstruct()` supports three modes:\n", + "\n", + "| AssetProvider | Result | Use Case |\n", + "|--------------|--------|----------|\n", + "| `Dict[str, bytes]` | `TModel` | Eager loading, all assets in memory |\n", + "| `AssetFetcher` | `LazyModel[TModel]` | Lazy per-field loading |\n", + "| `CachedAssetLoader` | `LazyModel[TModel]` | Lazy loading with disk cache |\n", + "\n", + "Key benefits for scientific computing:\n", + "- **Deduplication**: Shared meshes/arrays stored once\n", + "- **Lazy loading**: Load only the fields you need with `LazyModel`\n", + "- **Persistent caching**: `CachedAssetLoader` caches fetched assets to disk\n", + "- **JSON metadata**: Easy to query/index simulation cases\n", + "- **Version control friendly**: Small metadata files, large binary assets" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/meshly/__init__.py 
b/python/meshly/__init__.py index ab5ed42..a3c8d8d 100644 --- a/python/meshly/__init__.py +++ b/python/meshly/__init__.py @@ -16,6 +16,7 @@ from .packable import ( Packable, PackableMetadata, + SerializedPackableData, ) from .mesh import ( @@ -42,6 +43,8 @@ ) from .data_handler import ( + AssetProvider, + CachedAssetLoader, DataHandler, ) @@ -50,8 +53,11 @@ # Packable base class "Packable", "PackableMetadata", + "SerializedPackableData", "ArrayType", # Data handlers + "AssetProvider", + "CachedAssetLoader", "DataHandler", # Mesh classes "Mesh", diff --git a/python/meshly/array.py b/python/meshly/array.py index 0aefe2c..54a85d0 100644 --- a/python/meshly/array.py +++ b/python/meshly/array.py @@ -5,15 +5,16 @@ encoding functions and storing/loading them as encoded data. """ import ctypes -from io import BytesIO import json -from typing import Any, Dict, List, Literal, Optional, Union +from io import BytesIO +from typing import Any, Literal, Optional, Union + import numpy as np -from pydantic import BaseModel, Field from meshoptimizer._loader import lib +from pydantic import BaseModel, Field -from .data_handler import DataHandler, ZipBuffer from .common import PathLike +from .data_handler import DataHandler # Optional JAX support try: @@ -37,7 +38,7 @@ class ArrayMetadata(BaseModel): Used in zip files to store array metadata. """ - shape: List[int] = Field(..., description="Shape of the array") + shape: list[int] = Field(..., description="Shape of the array") dtype: str = Field(..., description="Data type of the array as string") itemsize: int = Field(..., description="Size of each item in bytes") array_type: ArrayType = Field( @@ -133,57 +134,59 @@ def convert_recursive(obj, array_type: ArrayType): return obj @staticmethod - def extract_nested_arrays(obj, prefix: str = "") -> Dict[str, Array]: + def extract_nested_arrays( + obj, + prefix: str = "", + skip: Optional[callable] = None, + ) -> dict[str, Array]: """Recursively extract arrays from nested dicts and BaseModel instances. - Note: Packable instances are skipped - they handle their own encoding. + Args: + obj: Object to extract arrays from + prefix: Path prefix for nested keys + skip: Optional predicate - if skip(obj) is True, skip this object """ - from pydantic import BaseModel - from .packable import Packable arrays = {} - if ArrayUtils.is_array(obj): - arrays[prefix] = obj - elif isinstance(obj, Packable): - # Skip Packable instances - they encode themselves + if skip and skip(obj): pass + elif ArrayUtils.is_array(obj): + arrays[prefix] = obj elif isinstance(obj, BaseModel): for name in type(obj).model_fields: value = getattr(obj, name, None) if value is not None: key = f"{prefix}.{name}" if prefix else name - arrays.update(ArrayUtils.extract_nested_arrays(value, key)) + arrays.update(ArrayUtils.extract_nested_arrays(value, key, skip)) elif isinstance(obj, dict): for k, v in obj.items(): key = f"{prefix}.{k}" if prefix else k - arrays.update(ArrayUtils.extract_nested_arrays(v, key)) + arrays.update(ArrayUtils.extract_nested_arrays(v, key, skip)) return arrays @staticmethod - def extract_non_arrays(obj): - """Extract non-array values, preserving BaseModel type info for reconstruction. + def extract_non_arrays(obj, skip: Optional[callable] = None): + """Extract non-array values from nested structures. - Note: Packable instances are skipped - they handle their own encoding. 
+ Args: + obj: Object to extract non-arrays from + skip: Optional predicate - if skip(obj) is True, skip this object """ - from pydantic import BaseModel - from .packable import Packable if ArrayUtils.is_array(obj): return None - if isinstance(obj, Packable): - # Skip Packable instances - they encode themselves + if skip and skip(obj): return None if isinstance(obj, BaseModel): - result = {"__model_class__": obj.__class__.__name__, - "__model_module__": obj.__class__.__module__} + result = {} for name in type(obj).model_fields: val = getattr(obj, name, None) - if not ArrayUtils.is_array(val) and not isinstance(val, Packable): - extracted = ArrayUtils.extract_non_arrays(val) + if not ArrayUtils.is_array(val) and not (skip and skip(val)): + extracted = ArrayUtils.extract_non_arrays(val, skip) if extracted is not None: result[name] = extracted - return result if len(result) > 2 else None + return result or None if isinstance(obj, dict): - result = {k: ArrayUtils.extract_non_arrays(v) for k, v in obj.items() - if not ArrayUtils.is_array(v) and not isinstance(v, Packable)} + result = {k: ArrayUtils.extract_non_arrays(v, skip) for k, v in obj.items() + if not ArrayUtils.is_array(v) and not (skip and skip(v))} result = {k: v for k, v in result.items() if v is not None} return result or None return obj @@ -374,8 +377,8 @@ def save_to_zip( """ encoded = ArrayUtils.encode_array(array) - zip_buffer = ZipBuffer() - handler = WriteHandler.create_handler(zip_buffer) + zip_buffer = BytesIO() + handler = DataHandler.create(zip_buffer) ArrayUtils.save_array(handler, "array", encoded) handler.finalize() @@ -403,9 +406,9 @@ def load_from_zip( """ if isinstance(source, BytesIO): source.seek(0) - handler = DataHandler.create(ZipBuffer(source.read())) + handler = DataHandler.create(BytesIO(source.read())) else: with open(source, "rb") as f: - handler = DataHandler.create(ZipBuffer(f.read())) + handler = DataHandler.create(BytesIO(f.read())) return ArrayUtils.load_array(handler, "array", array_type) diff --git a/python/meshly/data_handler.py b/python/meshly/data_handler.py index cb3c515..ec7d5e2 100644 --- a/python/meshly/data_handler.py +++ b/python/meshly/data_handler.py @@ -1,18 +1,56 @@ import stat -from typing import Callable, List, Optional, Union +from dataclasses import dataclass +from typing import Awaitable, Callable, Dict, List, Optional, Union import zipfile from io import BytesIO from pathlib import Path from abc import abstractmethod from .common import PathLike +HandlerSource = Union[PathLike, BytesIO] -ZipBuffer = BytesIO +# Type for asset provider: either a dict or a callable that fetches by checksum +# Supports both sync and async fetch functions +# The callable can return None to indicate the asset should be read from cache +AssetFetcher = Callable[[str], Union[bytes, None, Awaitable[Optional[bytes]]]] +AssetProvider = Union[Dict[str, bytes], AssetFetcher, "CachedAssetLoader"] -HandlerSource = Union[PathLike, ZipBuffer] +@dataclass +class CachedAssetLoader: + """Asset loader with optional disk cache for persistence. + + Wraps a callable asset fetcher with a DataHandler for caching. + Fetched assets are stored as 'assets/{checksum}.bin' and read + from cache on subsequent access. + + The fetch callable can return None to indicate the asset is not + available from the remote source, in which case the loader will + attempt to read from the cache. If not in cache either, a KeyError + is raised. 
+ + Example: + def fetch_from_cloud(checksum: str) -> bytes | None: + try: + return cloud_storage.download(checksum) + except NotFoundError: + return None # Will fallback to cache + + # Create loader with disk cache + cache = DataHandler.create(Path("./cache")) + loader = CachedAssetLoader(fetch_from_cloud, cache) + + lazy = Packable.reconstruct(SimulationCase, data, loader) + """ + fetch: AssetFetcher + """Callable that fetches asset bytes by checksum (can return None to use cache)""" + cache: "DataHandler" + """DataHandler for caching fetched assets""""" + + +class DataHandler: + """Protocol for reading and writing files to various sources.""" -class BaseDataHandler: rel_path: str def resolved_path(self, subpath: PathLike) -> Path: @@ -21,10 +59,6 @@ def resolved_path(self, subpath: PathLike) -> Path: return Path(str(subpath)) return Path(f"{self.rel_path}/{subpath}") - -class DataHandler(BaseDataHandler): - """Protocol for reading and writing files to various sources.""" - def __init__(self, source: HandlerSource, rel_path=""): self.source = source self.rel_path = rel_path @@ -32,34 +66,39 @@ def __init__(self, source: HandlerSource, rel_path=""): @abstractmethod def read_text(self, subpath: PathLike, encoding: str = "utf-8") -> str: """Read text content from a file.""" - ... + raise NotImplementedError @abstractmethod def read_binary(self, subpath: PathLike) -> bytes: """Read binary content from a file.""" - ... + raise NotImplementedError @abstractmethod def write_text(self, subpath: PathLike, content: str, executable: bool = False) -> None: """Write text content to a file.""" - ... + raise NotImplementedError @abstractmethod def write_binary(self, subpath: PathLike, content: Union[bytes, BytesIO], executable: bool = False) -> None: """Write binary content to a file.""" - ... + raise NotImplementedError @abstractmethod def list_files(self, subpath: PathLike = "", recursive: bool = False) -> List[Path]: """List files in the given subpath.""" - ... + raise NotImplementedError @abstractmethod def exists(self, subpath: PathLike) -> bool: """Check if a file exists.""" - ... 
+ raise NotImplementedError + + @abstractmethod + def remove_file(self, subpath: PathLike) -> None: + """Remove a file.""" + raise NotImplementedError - def to_path(self, rel_path: str): + def to_path(self, rel_path: str) -> "DataHandler": """Get a handler with a nested relative path.""" return DataHandler.create(self.source, f"{self.rel_path}/{rel_path}" if self.rel_path != "" else rel_path, self) @@ -75,7 +114,7 @@ def create(source: HandlerSource, rel_path="", existing_handler: Optional["DataH Returns: Handler implementation """ - if isinstance(source, ZipBuffer): + if isinstance(source, BytesIO): return ZipHandler( source, rel_path, @@ -89,6 +128,15 @@ def finalize(self): """Close any resources if needed.""" pass + def __enter__(self): + """Enter context manager.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Exit context manager, calling finalize().""" + self.finalize() + return False + class FileHandler(DataHandler): """Handler for reading and writing files on the regular file system.""" @@ -141,6 +189,11 @@ def exists(self, subpath: PathLike) -> bool: full_path = self.source / self.resolved_path(subpath) return full_path.exists() + def remove_file(self, subpath: PathLike) -> None: + full_path = self.source / self.resolved_path(subpath) + if full_path.exists(): + full_path.unlink() + class ZipHandler(DataHandler): """Handler for reading and writing files in zip archives.""" @@ -218,10 +271,12 @@ def exists(self, subpath: PathLike) -> bool: except KeyError: return False + def remove_file(self, subpath: PathLike) -> None: + # Note: zipfile doesn't support removing files directly. + # This would require recreating the zip without the file. + raise NotImplementedError("ZipHandler does not support removing files") + def finalize(self): """Close the zip file.""" if hasattr(self, 'zip_file') and self.zip_file: self.zip_file.close() - - -ZipBuffer = BytesIO diff --git a/python/meshly/packable.py b/python/meshly/packable.py index 0d3fe0f..0e34997 100644 --- a/python/meshly/packable.py +++ b/python/meshly/packable.py @@ -6,64 +6,168 @@ Custom data classes can inherit from Packable to store simulation results, time-series data, or any structured data with numpy arrays. + +Packables cannot contain nested Packables. For composite structures, +use the extract() and reconstruct() methods to handle asset management. 
""" -import hashlib import json +from collections.abc import Callable from dataclasses import dataclass from io import BytesIO -from typing import ( - Callable, - Dict, - Generic, - Optional, - Set, - Type, - Any, - TypeVar, - Union, -) +from pathlib import Path +from typing import Any, Generic, TypeVar, Union + from pydantic import BaseModel, Field -from .array import ArrayUtils, ArrayType, Array + +from .array import Array, ArrayType, ArrayUtils from .common import PathLike -from .data_handler import DataHandler, ZipBuffer +from .data_handler import AssetProvider, CachedAssetLoader, DataHandler +from .utils.checksum_utils import ChecksumUtils +from .utils.schema_utils import SchemaUtils +from .utils.serialization_utils import SerializationUtils + +TModel = TypeVar("TModel", bound=BaseModel) class PackableMetadata(BaseModel): """Metadata for a Packable saved to zip.""" - class_name: str = Field(..., description="Name of the data class") - module_name: str = Field(..., - description="Module containing the data class") - field_data: Dict[str, Any] = Field( - default_factory=dict, description="Non-array field values") - packable_refs: Dict[str, str] = Field( - default_factory=dict, - description="SHA256 hashes for cached packable fields (field_name -> hash)" - ) + + field_data: dict[str, Any] = Field(default_factory=dict, description="Non-array field values") TPackableMetadata = TypeVar("TPackableMetadata", bound=PackableMetadata) TPackable = TypeVar("TPackable", bound="Packable") -FieldValue = TypeVar("FieldValue") # Value type for custom fields +FieldValue = TypeVar("FieldValue") + + +@dataclass +class SerializedPackableData: + """Result of extracting a Packable for serialization. + + Contains the serializable data dict with checksum references, + plus the encoded assets (arrays as bytes). + """ + + data: dict[str, Any] + """Serializable dict with primitive fields and checksum refs for arrays""" + assets: dict[str, bytes] + """Map of checksum -> encoded bytes for all arrays""" + + +class LazyModel(Generic[TModel]): + """Lazy proxy for a Pydantic BaseModel that defers asset loading until field access. 
+ + Example: + def fetch_asset(checksum: str) -> bytes: + return cloud_storage.download(checksum) + + lazy = Packable.reconstruct(SimulationCase, data, fetch_asset) + # No assets loaded yet + + temp = lazy.temperature # NOW the temperature asset is fetched + vel = lazy.velocity # NOW the velocity asset is fetched + """ + + __slots__ = ("_model_class", "_data", "_assets", "_array_type", "_cache", "_resolved") + + def __init__( + self, + model_class: type[TModel], + data: dict[str, Any], + assets: AssetProvider, + array_type: ArrayType | None = None, + ): + object.__setattr__(self, "_model_class", model_class) + object.__setattr__(self, "_data", data) + object.__setattr__(self, "_assets", assets) + object.__setattr__(self, "_array_type", array_type) + object.__setattr__(self, "_cache", {}) + object.__setattr__(self, "_resolved", None) + + def _get_cached_asset(self, checksum: str) -> bytes: + """Get asset bytes, using cache if CachedAssetLoader is provided.""" + return SerializationUtils.get_cached_asset( + object.__getattribute__(self, "_assets"), checksum + ) + + def __getattr__(self, name: str) -> Any: + cache = object.__getattribute__(self, "_cache") + if name in cache: + return cache[name] + + model_class = object.__getattribute__(self, "_model_class") + data = object.__getattribute__(self, "_data") + array_type = object.__getattribute__(self, "_array_type") + + if name not in model_class.model_fields: + raise AttributeError(f"'{model_class.__name__}' has no attribute '{name}'") + + if name not in data: + return None + + field_value = data[name] + field_type = model_class.model_fields[name].annotation + + resolved = SchemaUtils.resolve_value_with_type( + field_value, field_type, self._get_cached_asset, array_type + ) + + cache[name] = resolved + return resolved + + def __setattr__(self, name: str, value: Any) -> None: + raise AttributeError("LazyModel is read-only. 
Use resolve() to get a mutable model.") + + def resolve(self) -> TModel: + """Fully resolve all fields and return the actual Pydantic model.""" + resolved = object.__getattribute__(self, "_resolved") + if resolved is not None: + return resolved + + model_class = object.__getattribute__(self, "_model_class") + data = object.__getattribute__(self, "_data") + array_type = object.__getattribute__(self, "_array_type") + cache = object.__getattribute__(self, "_cache") + + resolved_data = {} + for field_name, field_info in model_class.model_fields.items(): + if field_name in cache: + resolved_data[field_name] = cache[field_name] + elif field_name in data: + resolved_data[field_name] = SchemaUtils.resolve_value_with_type( + data[field_name], field_info.annotation, self._get_cached_asset, array_type + ) + + result = model_class(**resolved_data) + object.__setattr__(self, "_resolved", result) + return result + + def __repr__(self) -> str: + model_class = object.__getattribute__(self, "_model_class") + cache = object.__getattribute__(self, "_cache") + data = object.__getattribute__(self, "_data") + loaded = list(cache.keys()) + pending = [k for k in data.keys() if k not in cache] + return f"LazyModel[{model_class.__name__}](loaded={loaded}, pending={pending})" @dataclass class CustomFieldConfig(Generic[FieldValue, TPackableMetadata]): """Configuration for custom field encoding/decoding.""" + file_name: str """File name in zip (without .bin extension)""" encode: Callable[[FieldValue, Any], bytes] """Encoder function: (value, instance) -> bytes""" - decode: Callable[[bytes, TPackableMetadata, - Optional[ArrayType]], FieldValue] + decode: Callable[[bytes, TPackableMetadata, ArrayType | None], FieldValue] """Decoder function: (bytes, metadata, array_type) -> value""" optional: bool = False """Whether the field is optional (won't throw if missing)""" class Packable(BaseModel): - """ - Base class for data containers with automatic array serialization. + """Base class for data containers with automatic array serialization. Subclasses can define numpy array attributes which will be automatically detected, encoded, and saved to zip files. Non-array fields are preserved @@ -87,8 +191,25 @@ class SimulationResult(Packable): class Config: arbitrary_types_allowed = True + def __init__(self, **data): + super().__init__(**data) + self._validate_no_direct_packable_fields() + + def _validate_no_direct_packable_fields(self) -> None: + """Validate that this Packable has no direct Packable fields.""" + for field_name in type(self).model_fields: + if field_name in self.__private_attributes__: + continue + value = getattr(self, field_name, None) + if value is not None and isinstance(value, Packable): + raise TypeError( + f"Direct Packable fields are not allowed. Field '{field_name}' " + f"contains a {type(value).__name__}. Packables can be nested " + "inside dicts or other BaseModels, and extract() will handle them." 
+ ) + @property - def array_fields(self) -> Set[str]: + def array_fields(self) -> set[str]: """Get all array field paths, including nested arrays in dicts/BaseModels.""" result = set() for field_name in type(self).model_fields: @@ -96,12 +217,15 @@ def array_fields(self) -> Set[str]: continue value = getattr(self, field_name, None) if value is not None: - result.update(ArrayUtils.extract_nested_arrays( - value, field_name).keys()) + result.update( + ArrayUtils.extract_nested_arrays( + value, field_name, skip=lambda x: isinstance(x, Packable) + ).keys() + ) return result - def _extract_non_array_fields(self) -> Dict[str, Any]: - """Extract non-array field values for metadata, preserving BaseModel type info.""" + def _extract_non_array_fields(self) -> dict[str, Any]: + """Extract non-array field values for metadata.""" model_data = {} direct_arrays = {f for f in self.array_fields if "." not in f} for name in type(self).model_fields: @@ -109,191 +233,84 @@ def _extract_non_array_fields(self) -> Dict[str, Any]: continue value = getattr(self, name, None) if value is not None and not ArrayUtils.is_array(value): - extracted = ArrayUtils.extract_non_arrays(value) + extracted = ArrayUtils.extract_non_arrays( + value, skip=lambda x: isinstance(x, Packable) + ) if extracted is not None: model_data[name] = extracted return model_data - def _create_metadata(self, field_data: Dict[str, Any]) -> PackableMetadata: - """ - Create metadata for this Packable. - - Subclasses can override this to return custom metadata types. - - Args: - field_data: Non-array field values to include in metadata - - Returns: - PackableMetadata (or subclass) instance - """ - return PackableMetadata( - class_name=self.__class__.__name__, - module_name=self.__class__.__module__, - field_data=field_data, - ) + def _create_metadata(self, field_data: dict[str, Any]) -> PackableMetadata: + """Create metadata for this Packable. Subclasses can override.""" + return PackableMetadata(field_data=field_data) @classmethod def load_metadata( - cls, - handler: DataHandler, - metadata_cls: Type[TPackableMetadata] = PackableMetadata + cls, handler: DataHandler, metadata_cls: type[TPackableMetadata] = PackableMetadata ) -> TPackableMetadata: - """ - Load and validate metadata using a read handler. - - Args: - handler: ReadHandler for reading files - metadata_cls: The metadata class to use for parsing (default: PackableMetadata) - - Returns: - Metadata object of the specified type - - Raises: - ValueError: If class name doesn't match - """ + """Load and validate metadata using a read handler.""" metadata_text = handler.read_text("metadata.json") metadata_dict = json.loads(metadata_text) - metadata = metadata_cls(**metadata_dict) - - if metadata.class_name != cls.__name__ or metadata.module_name != cls.__module__: - raise ValueError( - f"Class mismatch: expected {cls.__name__} but got {metadata.class_name} from {metadata.module_name}" - ) + return metadata_cls(**metadata_dict) - return metadata - - def save_to_zip( - self, - destination: Union[PathLike, BytesIO], - cache_handler: Optional[DataHandler] = None, - ) -> None: - """ - Save this container to a zip file. - - Args: - destination: Path to the output zip file or BytesIO buffer - cache_handler: Optional DataHandler for caching nested Packables. - When provided, nested Packable fields are saved via - cache_handler.write_binary() and only hash - references are stored in the parent zip. This enables - deduplication and smaller parent files. 
- """ - encoded = self.encode(cache_handler=cache_handler) + def save_to_zip(self, destination: Union[PathLike, BytesIO]) -> None: + """Save this container to a zip file.""" + encoded = self.encode() if isinstance(destination, BytesIO): destination.write(encoded) else: - with open(destination, "wb") as f: - f.write(encoded) + Path(destination).write_bytes(encoded) @classmethod def load_from_zip( - cls: Type[TPackable], + cls: type[TPackable], source: Union[PathLike, BytesIO], - array_type: Optional[ArrayType] = None, - cache_handler: Optional[DataHandler] = None, + array_type: ArrayType | None = None, ) -> TPackable: - """ - Load a Packable from a zip file. - - Args: - source: Path to the input zip file or BytesIO object - array_type: Array backend to use ("numpy" or "jax"). If None (default), - uses the array_type stored in each array's metadata, - preserving the original array types that were saved. - cache_handler: Optional Handler to load nested Packables from cache. - When the zip contains hash references (packable_refs), - cache_handler.read_binary() is called to retrieve - cached bytes. - - Returns: - Loaded Packable instance - """ + """Load a Packable from a zip file.""" if isinstance(source, BytesIO): source.seek(0) - return cls.decode(source.read(), array_type, cache_handler) + return cls.decode(source.read(), array_type) else: with open(source, "rb") as f: - return cls.decode(f.read(), array_type, cache_handler) + return cls.decode(f.read(), array_type) @classmethod - def _get_custom_fields(cls) -> Dict[str, CustomFieldConfig]: - """ - Get custom field configurations for this class. - - Subclasses override this to define custom encoders/decoders. - - Returns: - Dict mapping field names to CustomFieldConfig objects - """ + def _get_custom_fields(cls) -> dict[str, CustomFieldConfig]: + """Get custom field configurations. 
Subclasses override this.""" return {} @classmethod - def _get_custom_field_names(cls) -> Set[str]: + def _get_custom_field_names(cls) -> set[str]: """Get set of field names that have custom encoding/decoding.""" return set(cls._get_custom_fields().keys()) - def _get_packable_fields(self) -> Dict[str, "Packable"]: - """Get fields that are Packable instances (excluding self).""" - packable_fields = {} - for field_name in type(self).model_fields: - if field_name in self.__private_attributes__: - continue - value = getattr(self, field_name, None) - if value is not None and isinstance(value, Packable): - packable_fields[field_name] = value - return packable_fields - - def _get_packable_field_names(self) -> Set[str]: - """Get set of field names that are Packable instances.""" - return set(self._get_packable_fields().keys()) - - @classmethod - def _get_packable_field_types(cls) -> Set[str]: - """Get field names that are Packable types from type hints (for decoding).""" - import typing - hints = typing.get_type_hints(cls) - packable_fields = set() - - for field_name, field_type in hints.items(): - # Handle Optional[PackableSubclass] - origin = typing.get_origin(field_type) - if origin is Union: - args = typing.get_args(field_type) - for arg in args: - if isinstance(arg, type) and issubclass(arg, Packable): - packable_fields.add(field_name) - break - elif isinstance(field_type, type) and issubclass(field_type, Packable): - packable_fields.add(field_name) - - return packable_fields - @classmethod def _decode_custom_fields( cls, handler: DataHandler, metadata: PackableMetadata, - data: Dict[str, Any], - array_type: Optional[ArrayType] = None + data: dict[str, Any], + array_type: ArrayType | None = None, ) -> None: """Decode fields with custom decoders.""" for field_name, config in cls._get_custom_fields().items(): try: encoded_bytes = handler.read_binary(f"{config.file_name}.bin") - data[field_name] = config.decode( - encoded_bytes, metadata, array_type) + data[field_name] = config.decode(encoded_bytes, metadata, array_type) except (KeyError, FileNotFoundError): if not config.optional: raise ValueError( - f"Required custom field '{field_name}' ({config.file_name}.bin) not found in zip") + f"Required custom field '{field_name}' ({config.file_name}.bin) not found" + ) @classmethod def _load_standard_arrays( cls, handler: DataHandler, - data: Dict[str, Any], - skip_fields: Set[str], - array_type: Optional[ArrayType] = None + data: dict[str, Any], + skip_fields: set[str], + array_type: ArrayType | None = None, ) -> None: """Load standard arrays from arrays/ folder, skipping custom fields.""" try: @@ -306,11 +323,9 @@ def _load_standard_arrays( if not file_str.endswith("/array.bin"): continue - # Extract array name: "arrays/markerIndices/boundary/array.bin" -> "markerIndices.boundary" array_path = file_str[7:-10] # Remove "arrays/" and "/array.bin" name = array_path.replace("/", ".") - # Skip custom fields base_field = name.split(".")[0] if base_field in skip_fields: continue @@ -318,7 +333,6 @@ def _load_standard_arrays( decoded = ArrayUtils.load_array(handler, name, array_type) if "." 
in name: - # Nested array - build nested structure parts = name.split(".") current = data for part in parts[:-1]: @@ -327,42 +341,29 @@ def _load_standard_arrays( current = current[part] current[parts[-1]] = decoded else: - # Flat array data[name] = decoded - def _encode_standard_arrays(self, skip_fields: Set[str]) -> Dict[str, bytes]: + def _encode_standard_arrays(self, skip_fields: set[str]) -> dict[str, bytes]: """Encode standard arrays, skipping custom fields.""" encoded_arrays = {} for field_name in self.array_fields: - # Skip fields with custom encoding if field_name in skip_fields: continue - # Handle nested array paths (e.g., "textures.diffuse") if "." in field_name: parts = field_name.split(".") obj = self for part in parts[:-1]: - if isinstance(obj, dict): - obj = obj[part] - else: - obj = getattr(obj, part) - - if isinstance(obj, dict): - array = obj[parts[-1]] - else: - array = getattr(obj, parts[-1]) - + obj = obj[part] if isinstance(obj, dict) else getattr(obj, part) + array = obj[parts[-1]] if isinstance(obj, dict) else getattr(obj, parts[-1]) if ArrayUtils.is_array(array): encoded_arrays[field_name] = ArrayUtils.encode_array(array) else: - # Handle direct array fields try: array = getattr(self, field_name) if ArrayUtils.is_array(array): - encoded_arrays[field_name] = ArrayUtils.encode_array( - array) + encoded_arrays[field_name] = ArrayUtils.encode_array(array) except AttributeError: pass @@ -376,86 +377,21 @@ def _encode_custom_fields(self, handler: DataHandler) -> None: encoded_bytes = config.encode(value, self) handler.write_binary(f"{config.file_name}.bin", encoded_bytes) - def _encode_packable_fields( - self, - handler: DataHandler, - cache_handler: Optional[DataHandler] = None - ) -> Dict[str, str]: - """Encode fields that are Packable instances. - - Args: - handler: DataHandler for the parent zip (used when no cache) - cache_handler: Optional DataHandler to save to cache. When provided, - packables are saved via cache_handler.write_binary() and - only hash refs are returned. - - Returns: - Dict mapping field names to SHA256 hashes (only when cache_handler provided) - """ - packable_refs: Dict[str, str] = {} - - for field_name, packable in self._get_packable_fields().items(): - # Recursively use cache for nested packables too - encoded_bytes = packable.encode(cache_handler=cache_handler) - - if cache_handler is not None: - # Compute SHA256 hash of the encoded bytes - hash_digest = hashlib.sha256(encoded_bytes).hexdigest()[:16] - packable_refs[field_name] = hash_digest - - # Save to cache with deduplication via exists check - hash_path = f"{hash_digest}.zip" - if not cache_handler.exists(hash_path): - cache_handler.write_binary(hash_path, encoded_bytes) - else: - # Embed in parent zip as before - handler.write_binary(f"packables/{field_name}.zip", encoded_bytes) - - return packable_refs - - def encode(self, cache_handler: Optional[DataHandler] = None) -> bytes: - """ - Serialize this Packable to bytes. - - Args: - cache_handler: Optional DataHandler to save nested Packables to cache. - When provided, nested Packable fields are saved via - cache_handler.write_binary() instead of - embedding in the zip. 
- - Returns: - Bytes containing the zip-encoded data - """ + def encode(self) -> bytes: + """Serialize this Packable to bytes (zip format).""" custom_field_names = self._get_custom_field_names() - packable_field_names = self._get_packable_field_names() - skip_fields = custom_field_names | packable_field_names - - # Encode standard arrays - encoded_arrays = self._encode_standard_arrays(skip_fields) - - # Create metadata + encoded_arrays = self._encode_standard_arrays(custom_field_names) field_data = self._extract_non_array_fields() metadata = self._create_metadata(field_data) - # Write to zip - destination = ZipBuffer() + destination = BytesIO() handler = DataHandler.create(destination) - # Save standard arrays for name in sorted(encoded_arrays.keys()): ArrayUtils.save_array(handler, name, encoded_arrays[name]) - # Save custom encoded fields self._encode_custom_fields(handler) - # Save packable fields (with optional caching) - packable_refs = self._encode_packable_fields(handler, cache_handler) - - # Store packable refs in metadata if using cache - if packable_refs: - metadata.packable_refs = packable_refs - - # Save metadata handler.write_text( "metadata.json", json.dumps(metadata.model_dump(), indent=2, sort_keys=True), @@ -465,183 +401,122 @@ def encode(self, cache_handler: Optional[DataHandler] = None) -> bytes: return destination.getvalue() @classmethod - def _decode_packable_fields( - cls, - handler: DataHandler, - metadata: PackableMetadata, - data: Dict[str, Any], - array_type: Optional[ArrayType] = None, - cache_handler: Optional[DataHandler] = None - ) -> None: - """Decode fields that are Packable instances. + def decode( + cls: type[TPackable], + buf: bytes, + array_type: ArrayType | None = None, + ) -> TPackable: + """Deserialize a Packable from bytes.""" + if cls is Packable: + raise TypeError( + "Cannot decode on base Packable class. " + "Use the specific subclass: MyClass.decode(...)" + ) - Supports both embedded packables (in packables/ folder) and cached - packables (referenced by SHA256 hash in metadata.packable_refs). 
+ handler = DataHandler.create(BytesIO(buf)) + metadata = cls.load_metadata(handler) + skip_fields = cls._get_custom_field_names() - Args: - handler: DataHandler for the parent zip - metadata: Loaded metadata containing packable_refs - data: Dict to populate with decoded packables - array_type: Optional array backend to use - cache_handler: Optional DataHandler to load cached packables by hash - """ - # Get field type hints to know the Packable subclass for each field - import typing - hints = typing.get_type_hints(cls) - - # Helper to decode a packable field given its bytes - def decode_field(field_name: str, encoded_bytes: bytes) -> None: - field_type = hints.get(field_name) - if field_type is None: - return - - # Handle Optional[PackableSubclass] - origin = typing.get_origin(field_type) - if origin is Union: - args = typing.get_args(field_type) - for arg in args: - if isinstance(arg, type) and issubclass(arg, Packable): - field_type = arg - break - - if not isinstance(field_type, type) or not issubclass(field_type, Packable): - return - - data[field_name] = field_type.decode(encoded_bytes, array_type, cache_handler) - - # First, try to load from cache using hash refs - if cache_handler and metadata.packable_refs: - for field_name, hash_digest in metadata.packable_refs.items(): - try: - cached_bytes = cache_handler.read_binary(f"{hash_digest}.zip") - decode_field(field_name, cached_bytes) - except (FileNotFoundError, KeyError): - pass # Not in cache, will try embedded + data: dict[str, Any] = {} + cls._decode_custom_fields(handler, metadata, data, array_type) + cls._load_standard_arrays(handler, data, skip_fields, array_type) - # Then load any embedded packables (for backward compatibility or no-cache case) - try: - packable_files = handler.list_files("packables", recursive=True) - except (KeyError, FileNotFoundError): - return + if metadata.field_data: + SchemaUtils.merge_field_data_with_schema(cls, data, metadata.field_data) - for file_path in packable_files: - file_str = str(file_path) - if not file_str.endswith(".zip"): - continue + return cls(**data) - # Extract field name: "packables/inner_mesh.zip" -> "inner_mesh" - field_name = file_str[10:-4] # Remove "packables/" and ".zip" + @staticmethod + def extract(obj: BaseModel) -> SerializedPackableData: + """Extract arrays and Packables from a BaseModel into serializable data and assets. - # Skip if already loaded from cache - if field_name in data: + Returns: + SerializedPackableData with data dict (refs for arrays) and assets dict + """ + if not isinstance(obj, BaseModel): + raise TypeError(f"extract() requires a Pydantic BaseModel, got {type(obj).__name__}.") + + assets: dict[str, bytes] = {} + data: dict[str, Any] = {} + + for field_name in type(obj).model_fields: + if hasattr(obj, "__private_attributes__") and field_name in obj.__private_attributes__: continue + value = getattr(obj, field_name, None) + if value is not None: + data[field_name] = SerializationUtils.extract_value(value, assets) - encoded_bytes = handler.read_binary(file_str) - decode_field(field_name, encoded_bytes) + # Include computed fields (Pydantic v2) + for field_name in type(obj).model_computed_fields: + value = getattr(obj, field_name, None) + if value is not None: + data[field_name] = SerializationUtils.extract_value(value, assets) - @classmethod - def decode( - cls: Type[TPackable], - buf: bytes, - array_type: Optional[ArrayType] = None, - cache_handler: Optional[DataHandler] = None - ) -> TPackable: - """ - Deserialize a Packable from bytes. 
+ return SerializedPackableData(data=data, assets=assets) - Args: - buf: Bytes containing the zip-encoded data - array_type: Array backend to use. If None (default), uses the - array_type stored in each array's metadata. - cache_handler: Optional DataHandler to load nested Packables from cache. - When metadata contains hash references, - cache_handler.read_binary() is called to retrieve - cached bytes. + @staticmethod + def compute_checksum( + obj: Union[bytes, "SerializedPackableData", "Packable", BaseModel], + ) -> str: + """Compute SHA256 checksum for various types of data. Returns: - Loaded Packable instance + 16-character hex string (first 64 bits of SHA256) """ - handler = DataHandler.create(ZipBuffer(buf)) - metadata = cls.load_metadata(handler) + if isinstance(obj, bytes): + return ChecksumUtils.compute_bytes_checksum(obj) - # Fields to skip when loading standard arrays - skip_fields = cls._get_custom_field_names() | cls._get_packable_field_types() + if isinstance(obj, SerializedPackableData): + return ChecksumUtils.compute_dict_checksum(obj.data, obj.assets) - data: Dict[str, Any] = {} + if isinstance(obj, Packable): + return ChecksumUtils.compute_bytes_checksum(obj.encode()) - # Decode custom fields first - cls._decode_custom_fields(handler, metadata, data, array_type) - - # Load standard arrays - cls._load_standard_arrays(handler, data, skip_fields, array_type) + if isinstance(obj, BaseModel): + extracted = Packable.extract(obj) + return ChecksumUtils.compute_dict_checksum(extracted.data, extracted.assets) - # Decode packable fields - cls._decode_packable_fields(handler, metadata, data, array_type, cache_handler) + raise TypeError( + f"compute_checksum() requires bytes, SerializedPackableData, Packable, or BaseModel, " + f"got {type(obj).__name__}" + ) - # Merge non-array fields from metadata - if metadata.field_data: - Packable._merge_field_data(data, metadata.field_data) + @staticmethod + def reconstruct( + model_class: type[TModel], + data: dict[str, Any], + assets: AssetProvider, + array_type: ArrayType | None = None, + ) -> Union[TModel, LazyModel[TModel]]: + """Reconstruct a Pydantic BaseModel from extracted data and assets. + + If assets is a dict, returns the actual model (eager loading). + If assets is a callable or CachedAssetLoader, returns a LazyModel proxy. + """ + if callable(assets) or isinstance(assets, CachedAssetLoader): + return LazyModel(model_class, data, assets, array_type) - return cls(**data) + resolved_data = SchemaUtils.resolve_refs_with_schema(model_class, data, assets, array_type) + return model_class(**resolved_data) def __reduce__(self): - """ - Support for pickle serialization. - - Array types are preserved automatically via the per-array metadata. - """ - return ( - self.__class__.decode, - (self.encode(),), - ) + """Support for pickle serialization.""" + return (self.__class__.decode, (self.encode(),)) @staticmethod def load_array( - source: Union[PathLike, BytesIO], - name: str, - array_type: Optional[ArrayType] = None + source: Union[PathLike, BytesIO], name: str, array_type: ArrayType | None = None ) -> Array: - """ - Load a single array from a zip file without loading the entire object. - - Useful for large files where you only need one array. - - Args: - source: Path to the zip file or BytesIO buffer - name: Array name (e.g., "normals" or "markerIndices.boundary") - array_type: Array backend to use ("numpy" or "jax"). If None (default), - uses the array_type stored in the array's metadata. 
- - Returns: - Decoded array (numpy or JAX) - - Raises: - KeyError: If array not found in zip - - Example: - normals = Mesh.load_array("mesh.zip", "normals") - """ + """Load a single array from a zip file without loading the entire object.""" if isinstance(source, BytesIO): source.seek(0) - handler = DataHandler.create(ZipBuffer(source.read())) + handler = DataHandler.create(BytesIO(source.read())) else: - with open(source, "rb") as f: - handler = DataHandler.create(ZipBuffer(f.read())) + handler = DataHandler.create(BytesIO(Path(source).read_bytes())) return ArrayUtils.load_array(handler, name, array_type) def convert_to(self: TPackable, array_type: ArrayType) -> TPackable: - """ - Create a new Packable with all arrays converted to the specified type. - - Args: - array_type: Target array backend ("numpy" or "jax") - - Returns: - A new Packable with all arrays converted - - Raises: - AssertionError: If JAX is requested but not available - """ + """Create a new Packable with all arrays converted to the specified type.""" data_copy = self.model_copy(deep=True) for field_name in data_copy.model_fields_set: @@ -654,49 +529,3 @@ def convert_to(self: TPackable, array_type: ArrayType) -> TPackable: pass return data_copy - - @staticmethod - def _reconstruct_model(data: Dict[str, Any]) -> Any: - """Reconstruct BaseModel from serialized dict with __model_class__/__model_module__.""" - if not isinstance(data, dict): - return data - - # Recursively process nested dicts first - processed = {k: Packable._reconstruct_model(v) if isinstance(v, dict) else v - for k, v in data.items() if k not in ("__model_class__", "__model_module__")} - - if "__model_class__" not in data: - return processed - - try: - import importlib - module = importlib.import_module(data["__model_module__"]) - model_class = getattr(module, data["__model_class__"]) - return model_class(**processed) - except (ImportError, AttributeError): - return processed - - @staticmethod - def _merge_field_data(data: Dict[str, Any], field_data: Dict[str, Any]) -> None: - """Merge metadata fields into data, reconstructing BaseModel instances.""" - for key, value in field_data.items(): - existing = data.get(key) - if not isinstance(value, dict): - data[key] = value - elif "__model_class__" in value: - # Single BaseModel: merge arrays then reconstruct - merged = {**value, ** - (existing if isinstance(existing, dict) else {})} - data[key] = Packable._reconstruct_model(merged) - elif isinstance(existing, dict): - # Check if dict of BaseModels - for subkey, subval in value.items(): - if isinstance(subval, dict) and "__model_class__" in subval: - merged = {**subval, **existing.get(subkey, {})} - existing[subkey] = Packable._reconstruct_model(merged) - elif isinstance(subval, dict) and isinstance(existing.get(subkey), dict): - Packable._merge_field_data(existing[subkey], subval) - else: - existing[subkey] = subval - else: - data[key] = Packable._reconstruct_model(value) diff --git a/python/meshly/utils/__init__.py b/python/meshly/utils/__init__.py index 8c5d72a..7674b30 100644 --- a/python/meshly/utils/__init__.py +++ b/python/meshly/utils/__init__.py @@ -2,14 +2,20 @@ Utility modules for meshly. This package contains utility functions for mesh operations, element handling, -and triangulation. +triangulation, checksums, serialization, and schema operations. 
""" +from .checksum_utils import ChecksumUtils from .element_utils import ElementUtils, TriangulationUtils from .mesh_utils import MeshUtils +from .schema_utils import SchemaUtils +from .serialization_utils import SerializationUtils __all__ = [ + "ChecksumUtils", "ElementUtils", - "TriangulationUtils", "MeshUtils", + "SchemaUtils", + "SerializationUtils", + "TriangulationUtils", ] diff --git a/python/meshly/utils/checksum_utils.py b/python/meshly/utils/checksum_utils.py new file mode 100644 index 0000000..e1af476 --- /dev/null +++ b/python/meshly/utils/checksum_utils.py @@ -0,0 +1,152 @@ +"""Checksum utilities for hashing data, files, and directories.""" + +import hashlib +import json +from pathlib import Path +from typing import Any, Optional + + +class ChecksumUtils: + """Utility class for computing checksums.""" + + # Thresholds for switching to fast checksum strategy + LARGE_FILE_THRESHOLD = 10 * 1024 * 1024 # 10MB + LARGE_DIR_FILE_COUNT_THRESHOLD = 100 + + @staticmethod + def compute_bytes_checksum(data: bytes) -> str: + """Compute SHA256 checksum for bytes. + + Args: + data: Bytes to hash + + Returns: + 16-character hex string (first 64 bits of SHA256) + """ + return hashlib.sha256(data).hexdigest()[:16] + + @staticmethod + def compute_dict_checksum(data: dict[str, Any], assets: dict[str, bytes]) -> str: + """Compute checksum for a data dict with assets. + + Combines data JSON + all asset bytes for deterministic hashing. + + Args: + data: JSON-serializable dict + assets: Map of checksum -> bytes + + Returns: + 16-character hex string + """ + data_json = json.dumps(data, sort_keys=True).encode("utf-8") + hasher = hashlib.sha256() + hasher.update(data_json) + hasher.update(b"\x00") + for checksum in sorted(assets.keys()): + hasher.update(assets[checksum]) + return hasher.hexdigest()[:16] + + @staticmethod + def compute_file_checksum(file_path: Path, fast: bool = False) -> str: + """Compute checksum of a file. + + Args: + file_path: Path to the file + fast: If True, use file metadata (size, mtime) instead of content hash + for large files. This is much faster but less accurate. + + Returns: + Full SHA256 checksum string + """ + file_path = Path(file_path) + if not file_path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + if not file_path.is_file(): + raise ValueError(f"Path is not a file: {file_path}") + + file_size = file_path.stat().st_size + + if fast and file_size > ChecksumUtils.LARGE_FILE_THRESHOLD: + return ChecksumUtils._compute_file_metadata_checksum(file_path) + + return ChecksumUtils._compute_file_content_checksum(file_path) + + @staticmethod + def _compute_file_content_checksum(file_path: Path) -> str: + """Compute SHA256 checksum of file contents.""" + hasher = hashlib.sha256() + with open(file_path, "rb") as f: + for chunk in iter(lambda: f.read(8192), b""): + hasher.update(chunk) + return hasher.hexdigest() + + @staticmethod + def _compute_file_metadata_checksum(file_path: Path) -> str: + """Compute checksum based on file metadata (path, size, mtime).""" + stat = file_path.stat() + metadata = f"{file_path.resolve()}|{stat.st_size}|{stat.st_mtime}" + return hashlib.sha256(metadata.encode()).hexdigest() + + @staticmethod + def compute_directory_checksum(dir_path: Path, fast: Optional[bool] = None) -> str: + """Compute checksum of a directory. + + Args: + dir_path: Path to the directory + fast: If True, use file metadata instead of content hashes. + If None (default), automatically use fast strategy for large directories. 
+ + Returns: + Full SHA256 checksum string combining all file checksums + """ + dir_path = Path(dir_path) + if not dir_path.exists(): + raise FileNotFoundError(f"Directory not found: {dir_path}") + if not dir_path.is_dir(): + raise ValueError(f"Path is not a directory: {dir_path}") + + all_files = sorted(dir_path.rglob("*")) + file_paths = [f for f in all_files if f.is_file()] + + if fast is None: + fast = len(file_paths) > ChecksumUtils.LARGE_DIR_FILE_COUNT_THRESHOLD + + hasher = hashlib.sha256() + + for file_path in file_paths: + rel_path = file_path.relative_to(dir_path) + hasher.update(str(rel_path).encode()) + + if fast: + file_hash = ChecksumUtils._compute_file_metadata_checksum(file_path) + else: + file_hash = ChecksumUtils._compute_file_content_checksum(file_path) + + hasher.update(file_hash.encode()) + + return hasher.hexdigest() + + @staticmethod + def compute_path_checksum(path: Path, fast: Optional[bool] = None) -> str: + """Compute checksum of a file or directory. + + Args: + path: Path to file or directory + fast: If True, use metadata-based checksums for speed. + If None, automatically use fast strategy for large files/directories. + + Returns: + Full SHA256 checksum string + """ + path = Path(path) + if not path.exists(): + raise FileNotFoundError(f"Path not found: {path}") + + if path.is_file(): + return ChecksumUtils.compute_file_checksum( + path, fast=fast if fast is not None else False + ) + elif path.is_dir(): + return ChecksumUtils.compute_directory_checksum(path, fast=fast) + else: + raise ValueError(f"Path is neither a file nor directory: {path}") diff --git a/python/meshly/utils/schema_utils.py b/python/meshly/utils/schema_utils.py new file mode 100644 index 0000000..5826036 --- /dev/null +++ b/python/meshly/utils/schema_utils.py @@ -0,0 +1,224 @@ +"""Schema utilities for resolving Pydantic types and merging field data.""" + +from typing import Any, Union, get_args, get_origin + +from pydantic import BaseModel + +from ..array import ArrayType +from ..data_handler import AssetProvider +from .serialization_utils import SerializationUtils + + +class SchemaUtils: + """Utility class for Pydantic schema operations.""" + + @staticmethod + def unwrap_optional(expected_type: Any) -> Any: + """Unwrap Optional[X] to X. + + Args: + expected_type: Type annotation, possibly Optional + + Returns: + Inner type if Optional, otherwise unchanged + """ + origin = get_origin(expected_type) + if origin is Union: + args = get_args(expected_type) + non_none = [a for a in args if a is not type(None)] + if len(non_none) == 1: + return non_none[0] + return expected_type + + @staticmethod + def resolve_refs_with_schema( + model_class: type[BaseModel], + data: dict[str, Any], + assets: AssetProvider, + array_type: ArrayType | None, + ) -> dict[str, Any]: + """Resolve $ref references using Pydantic schema for type information. 
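+
+        A value of the form {"$ref": "<checksum>"} is replaced by the decoded
+        asset; the field's annotation decides whether the bytes are decoded as
+        an array or as a nested Packable. Minimal sketch, mirroring how
+        Packable.reconstruct uses this helper (SimulationResult stands in for
+        any model class, extracted for the output of Packable.extract):
+
+            resolved = SchemaUtils.resolve_refs_with_schema(
+                SimulationResult, extracted.data, extracted.assets, None
+            )
+            model = SimulationResult(**resolved)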
+ + Args: + model_class: Pydantic model class with field definitions + data: Data dict with potential $ref values + assets: Asset provider + array_type: Target array type + + Returns: + Resolved data dict + """ + result = {} + + for field_name, field_info in model_class.model_fields.items(): + if field_name not in data: + continue + + result[field_name] = SchemaUtils.resolve_value_with_type( + data[field_name], field_info.annotation, assets, array_type + ) + + return result + + @staticmethod + def resolve_value_with_type( + value: Any, + expected_type: Any, + assets: AssetProvider, + array_type: ArrayType | None, + ) -> Any: + """Resolve a value using the expected type from Pydantic schema. + + Args: + value: Value to resolve + expected_type: Expected type from schema + assets: Asset provider + array_type: Target array type + + Returns: + Resolved value + """ + # Import here to avoid circular imports + from ..packable import Packable + + if value is None: + return None + + # Handle $ref + if isinstance(value, dict) and "$ref" in value: + checksum = value["$ref"] + asset_bytes = SerializationUtils.get_asset(assets, checksum) + + expected_type = SchemaUtils.unwrap_optional(expected_type) + origin = get_origin(expected_type) + + if isinstance(expected_type, type) and issubclass(expected_type, Packable): + return expected_type.decode(asset_bytes, array_type) + + return SerializationUtils.unpack_array(asset_bytes, array_type) + + # Handle nested dict + if isinstance(value, dict): + expected_type = SchemaUtils.unwrap_optional(expected_type) + origin = get_origin(expected_type) + + if origin is dict: + _, value_type = get_args(expected_type) + return { + k: SchemaUtils.resolve_value_with_type(v, value_type, assets, array_type) + for k, v in value.items() + } + + if isinstance(expected_type, type) and issubclass(expected_type, BaseModel): + resolved = SchemaUtils.resolve_refs_with_schema( + expected_type, value, assets, array_type + ) + return expected_type(**resolved) + + return value + + # Handle lists/tuples + if isinstance(value, (list, tuple)): + expected_type = SchemaUtils.unwrap_optional(expected_type) + origin = get_origin(expected_type) + + if origin in (list, tuple): + args = get_args(expected_type) + elem_type = args[0] if args else Any + else: + elem_type = Any + + result = [ + SchemaUtils.resolve_value_with_type(v, elem_type, assets, array_type) + for v in value + ] + return result if isinstance(value, list) else tuple(result) + + return value + + @staticmethod + def merge_field_data_with_schema( + model_class: type[BaseModel], + data: dict[str, Any], + field_data: dict[str, Any], + ) -> None: + """Merge metadata field_data into data using Pydantic schema. + + Args: + model_class: Pydantic model class + data: Target data dict (modified in place) + field_data: Source field data from metadata + """ + for key, value in field_data.items(): + if key not in model_class.model_fields: + data[key] = value + continue + + field_type = model_class.model_fields[key].annotation + data[key] = SchemaUtils.merge_value_with_schema(value, field_type, data.get(key)) + + @staticmethod + def merge_value_with_schema( + metadata_value: Any, + expected_type: Any, + existing_value: Any, + ) -> Any: + """Merge a metadata value with existing data using the schema type. 
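+
+        For example, for a Dict[str, SomeModel] field whose values hold both
+        strings and arrays, the string fields stored in metadata (say
+        {"name": "velocity", "units": "m/s"}) are merged with the
+        already-decoded array entries in existing_value before
+        SomeModel(**merged) is constructed (SomeModel is a placeholder name).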
+ + Args: + metadata_value: Value from metadata + expected_type: Expected type from schema + existing_value: Existing value in data dict + + Returns: + Merged value + """ + if metadata_value is None: + return existing_value + + expected_type = SchemaUtils.unwrap_optional(expected_type) + origin = get_origin(expected_type) + + # Handle dict type + if origin is dict: + _, value_type = get_args(expected_type) + if isinstance(metadata_value, dict) and isinstance(existing_value, dict): + result = dict(existing_value) + for k, v in metadata_value.items(): + result[k] = SchemaUtils.merge_value_with_schema( + v, value_type, existing_value.get(k) + ) + return result + elif isinstance(metadata_value, dict): + return { + k: SchemaUtils.merge_value_with_schema(v, value_type, None) + for k, v in metadata_value.items() + } + return metadata_value + + # Handle BaseModel type + if isinstance(expected_type, type) and issubclass(expected_type, BaseModel): + if isinstance(metadata_value, dict): + if isinstance(existing_value, dict): + merged = dict(existing_value) + SchemaUtils.merge_field_data_with_schema(expected_type, merged, metadata_value) + return expected_type(**merged) + else: + data = {} + SchemaUtils.merge_field_data_with_schema(expected_type, data, metadata_value) + return expected_type(**data) + return metadata_value + + # Handle list type + if origin in (list, tuple): + if isinstance(metadata_value, (list, tuple)): + args = get_args(expected_type) + elem_type = args[0] if args else Any + result = [ + SchemaUtils.merge_value_with_schema(v, elem_type, None) + for v in metadata_value + ] + return result if origin is list else tuple(result) + return metadata_value + + return metadata_value diff --git a/python/meshly/utils/serialization_utils.py b/python/meshly/utils/serialization_utils.py new file mode 100644 index 0000000..a078ee5 --- /dev/null +++ b/python/meshly/utils/serialization_utils.py @@ -0,0 +1,177 @@ +"""Serialization utilities for packing/unpacking arrays and assets.""" + +import asyncio +import inspect +import json +from typing import Any + +from pydantic import BaseModel + +from ..array import ArrayMetadata, ArrayType, ArrayUtils, EncodedArray +from ..data_handler import AssetProvider, CachedAssetLoader +from .checksum_utils import ChecksumUtils + + +class SerializationUtils: + """Utility class for serialization operations.""" + + @staticmethod + def pack_array(encoded: EncodedArray) -> bytes: + """Pack an encoded array into bytes with metadata. + + Format: [4 bytes metadata length][metadata json][array data] + + Args: + encoded: EncodedArray with metadata and data + + Returns: + Packed bytes + """ + metadata_json = json.dumps(encoded.metadata.model_dump()).encode("utf-8") + return len(metadata_json).to_bytes(4, "little") + metadata_json + encoded.data + + @staticmethod + def unpack_array(packed: bytes, array_type: ArrayType | None = None) -> Any: + """Unpack bytes back to an array. 
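+
+        Round-trip sketch (ArrayUtils.encode_array is the same encoder used by
+        extract_value below; arr is any numpy array):
+
+            packed = SerializationUtils.pack_array(ArrayUtils.encode_array(arr))
+            arr2 = SerializationUtils.unpack_array(packed)  # decodes back to arr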
+ + Args: + packed: Packed bytes from pack_array + array_type: Target array type, or None to use stored type + + Returns: + Decoded array (numpy or JAX) + """ + metadata_len = int.from_bytes(packed[:4], "little") + metadata_json = packed[4 : 4 + metadata_len].decode("utf-8") + array_data = packed[4 + metadata_len :] + + metadata_dict = json.loads(metadata_json) + metadata = ArrayMetadata(**metadata_dict) + encoded = EncodedArray(data=array_data, metadata=metadata) + + decoded = ArrayUtils.decode_array(encoded) + + if array_type is not None: + return ArrayUtils.convert_array(decoded, array_type) + elif metadata.array_type != "numpy": + return ArrayUtils.convert_array(decoded, metadata.array_type) + return decoded + + @staticmethod + def get_asset(assets: AssetProvider, checksum: str) -> bytes: + """Get asset bytes from a provider (dict, callable, or CachedAssetLoader). + + Supports both sync and async callables. + + Args: + assets: Asset provider (dict, callable, or CachedAssetLoader) + checksum: Asset checksum to fetch + + Returns: + Asset bytes + + Raises: + KeyError: If asset not found + """ + if callable(assets): + result = assets(checksum) + if inspect.isawaitable(result): + result = asyncio.get_event_loop().run_until_complete(result) + return result + if checksum not in assets: + raise KeyError(f"Missing asset with checksum '{checksum}'") + return assets[checksum] + + @staticmethod + def get_cached_asset( + assets: AssetProvider, + checksum: str, + ) -> bytes: + """Get asset bytes with caching support for CachedAssetLoader. + + Args: + assets: Asset provider + checksum: Asset checksum + + Returns: + Asset bytes + + Raises: + KeyError: If asset not found + """ + if isinstance(assets, CachedAssetLoader): + cache_path = f"assets/{checksum}.bin" + + try: + return assets.cache.read_binary(cache_path) + except (KeyError, FileNotFoundError): + pass + + result = assets.fetch(checksum) + if inspect.isawaitable(result): + result = asyncio.get_event_loop().run_until_complete(result) + + if result is None: + try: + return assets.cache.read_binary(cache_path) + except (KeyError, FileNotFoundError): + raise KeyError(f"Asset '{checksum}' not found in remote or cache") + + assets.cache.write_binary(cache_path, result) + return result + + if callable(assets): + result = assets(checksum) + if inspect.isawaitable(result): + result = asyncio.get_event_loop().run_until_complete(result) + if result is None: + raise KeyError(f"Asset fetcher returned None for checksum '{checksum}'") + return result + + if checksum not in assets: + raise KeyError(f"Missing asset with checksum '{checksum}'") + return assets[checksum] + + @staticmethod + def extract_value(value: Any, assets: dict[str, bytes]) -> Any: + """Recursively extract a value, replacing arrays and Packables with refs. 
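+
+        Illustrative mapping for an array value:
+
+            ref = SerializationUtils.extract_value(arr, assets)
+            # ref == {"$ref": "<checksum>"} and assets["<checksum>"] now holds
+            # the packed bytes produced by pack_array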
+ + Args: + value: Value to extract + assets: Dict to populate with encoded assets + + Returns: + Extracted value with $ref for arrays/Packables + """ + # Import here to avoid circular imports + from ..packable import Packable + + if ArrayUtils.is_array(value): + encoded = ArrayUtils.encode_array(value) + packed = SerializationUtils.pack_array(encoded) + checksum = ChecksumUtils.compute_bytes_checksum(packed) + assets[checksum] = packed + return {"$ref": checksum} + + if isinstance(value, Packable): + encoded = value.encode() + checksum = ChecksumUtils.compute_bytes_checksum(encoded) + assets[checksum] = encoded + return {"$ref": checksum} + + if isinstance(value, dict): + return {k: SerializationUtils.extract_value(v, assets) for k, v in value.items()} + + if isinstance(value, (list, tuple)): + result = [SerializationUtils.extract_value(v, assets) for v in value] + return result if isinstance(value, list) else tuple(result) + + if isinstance(value, BaseModel): + extracted = {} + for name in value.model_fields: + field_value = getattr(value, name, None) + if field_value is not None: + extracted[name] = SerializationUtils.extract_value(field_value, assets) + return extracted + + return value diff --git a/python/pyproject.toml b/python/pyproject.toml index ba84cf1..b2367ec 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "meshly" -version = "2.4.0-alpha" +version = "2.5.0-alpha" description = "High-level abstractions and utilities for working with meshoptimizer" readme = "README.md" license = {text = "MIT"} diff --git a/python/tests/test_checksum_utils.py b/python/tests/test_checksum_utils.py new file mode 100644 index 0000000..f968f72 --- /dev/null +++ b/python/tests/test_checksum_utils.py @@ -0,0 +1,166 @@ +"""Tests for ChecksumUtils.""" + +import shutil +import tempfile +from pathlib import Path + +import pytest +from meshly.utils.checksum_utils import ChecksumUtils + + +@pytest.fixture +def temp_dir(): + """Create and clean up a temporary directory.""" + d = tempfile.mkdtemp() + yield Path(d) + shutil.rmtree(d) + + +@pytest.fixture +def test_file(temp_dir): + """Create a simple test file.""" + f = temp_dir / "test_file.txt" + f.write_text("Hello, World!") + return f + + +@pytest.fixture +def test_subdir(temp_dir): + """Create a test directory with multiple files.""" + subdir = temp_dir / "subdir" + subdir.mkdir() + (subdir / "file1.txt").write_text("Content 1") + (subdir / "file2.txt").write_text("Content 2") + + nested = subdir / "nested" + nested.mkdir() + (nested / "file3.txt").write_text("Content 3") + + return subdir + + +class TestFileChecksum: + """Tests for file checksum computation.""" + + def test_returns_string(self, test_file): + """Test that file checksum returns a hex string.""" + result = ChecksumUtils.compute_file_checksum(test_file) + assert isinstance(result, str) + assert len(result) == 64 # SHA256 produces 64 hex chars + + def test_is_deterministic(self, test_file): + """Test that same file produces same checksum.""" + result1 = ChecksumUtils.compute_file_checksum(test_file) + result2 = ChecksumUtils.compute_file_checksum(test_file) + assert result1 == result2 + + def test_differs_for_different_content(self, temp_dir): + """Test that different content produces different checksum.""" + file1 = temp_dir / "a.txt" + file2 = temp_dir / "b.txt" + file1.write_text("Content A") + file2.write_text("Content B") + + assert ChecksumUtils.compute_file_checksum(file1) != 
ChecksumUtils.compute_file_checksum(file2) + + def test_not_found_raises(self, temp_dir): + """Test that missing file raises FileNotFoundError.""" + with pytest.raises(FileNotFoundError): + ChecksumUtils.compute_file_checksum(temp_dir / "nonexistent.txt") + + def test_on_directory_raises_error(self, test_subdir): + """Test that passing a directory raises ValueError.""" + with pytest.raises(ValueError): + ChecksumUtils.compute_file_checksum(test_subdir) + + def test_fast_mode_uses_metadata(self, temp_dir): + """Test that fast mode produces valid checksum.""" + large_file = temp_dir / "large.txt" + large_file.write_text("Some content") + + result_fast = ChecksumUtils.compute_file_checksum(large_file, fast=True) + result_normal = ChecksumUtils.compute_file_checksum(large_file, fast=False) + + assert isinstance(result_fast, str) + assert len(result_fast) == 64 + # For small files, fast=True still uses content hash + assert result_fast == result_normal + + +class TestDirectoryChecksum: + """Tests for directory checksum computation.""" + + def test_returns_string(self, test_subdir): + """Test that directory checksum returns a hex string.""" + result = ChecksumUtils.compute_directory_checksum(test_subdir) + assert isinstance(result, str) + assert len(result) == 64 + + def test_is_deterministic(self, test_subdir): + """Test that same directory produces same checksum.""" + result1 = ChecksumUtils.compute_directory_checksum(test_subdir) + result2 = ChecksumUtils.compute_directory_checksum(test_subdir) + assert result1 == result2 + + def test_changes_with_content(self, test_subdir): + """Test that modifying a file changes directory checksum.""" + checksum_before = ChecksumUtils.compute_directory_checksum(test_subdir) + (test_subdir / "file1.txt").write_text("Modified content") + checksum_after = ChecksumUtils.compute_directory_checksum(test_subdir) + assert checksum_before != checksum_after + + def test_not_found_raises(self, temp_dir): + """Test that missing directory raises FileNotFoundError.""" + with pytest.raises(FileNotFoundError): + ChecksumUtils.compute_directory_checksum(temp_dir / "nonexistent_dir") + + def test_on_file_raises_error(self, test_file): + """Test that passing a file raises ValueError.""" + with pytest.raises(ValueError): + ChecksumUtils.compute_directory_checksum(test_file) + + def test_fast_mode(self, test_subdir): + """Test that fast mode works for directories.""" + result_fast = ChecksumUtils.compute_directory_checksum(test_subdir, fast=True) + assert isinstance(result_fast, str) + assert len(result_fast) == 64 + + def test_empty_directory(self, temp_dir): + """Test checksum of an empty directory.""" + empty_dir = temp_dir / "empty" + empty_dir.mkdir() + + result = ChecksumUtils.compute_directory_checksum(empty_dir) + assert isinstance(result, str) + assert len(result) == 64 + + def test_includes_structure(self, temp_dir): + """Test that directory structure affects checksum.""" + dir1 = temp_dir / "dir1" + dir2 = temp_dir / "dir2" + dir1.mkdir() + dir2.mkdir() + + (dir1 / "a.txt").write_text("content") + (dir2 / "b.txt").write_text("content") # Same content, different name + + assert ChecksumUtils.compute_directory_checksum(dir1) != ChecksumUtils.compute_directory_checksum(dir2) + + +class TestPathChecksum: + """Tests for unified path checksum.""" + + def test_file(self, test_file): + """Test that compute_path_checksum works for files.""" + result = ChecksumUtils.compute_path_checksum(test_file) + assert result == ChecksumUtils.compute_file_checksum(test_file) + + def 
test_directory(self, test_subdir): + """Test that compute_path_checksum works for directories.""" + result = ChecksumUtils.compute_path_checksum(test_subdir) + assert result == ChecksumUtils.compute_directory_checksum(test_subdir) + + def test_not_found_raises(self, temp_dir): + """Test that missing path raises FileNotFoundError.""" + with pytest.raises(FileNotFoundError): + ChecksumUtils.compute_path_checksum(temp_dir / "nonexistent") diff --git a/python/tests/test_packable.py b/python/tests/test_packable.py index 0fd8a84..20b0ab2 100644 --- a/python/tests/test_packable.py +++ b/python/tests/test_packable.py @@ -3,8 +3,10 @@ import pytest import tempfile import os +import json from io import BytesIO -from typing import Optional +from pathlib import Path +from typing import Optional, Dict, Any import numpy as np from pydantic import BaseModel, Field, ConfigDict @@ -43,7 +45,8 @@ class FieldData(BaseModel): class Snapshot(Packable): """Snapshot with dict of BaseModel containing arrays.""" time: float = Field(..., description="Time value") - fields: dict[str, FieldData] = Field(default_factory=dict, description="Field data") + fields: dict[str, FieldData] = Field( + default_factory=dict, description="Field data") class TestPackable: @@ -100,8 +103,10 @@ def test_save_load_zip_file(self): loaded = SimulationResult.load_from_zip(path) assert loaded.time == pytest.approx(original.time) - np.testing.assert_array_almost_equal(loaded.temperature, original.temperature) - np.testing.assert_array_almost_equal(loaded.velocity, original.velocity) + np.testing.assert_array_almost_equal( + loaded.temperature, original.temperature) + np.testing.assert_array_almost_equal( + loaded.velocity, original.velocity) def test_save_load_bytesio(self): """Test saving and loading from BytesIO.""" @@ -140,8 +145,10 @@ def test_nested_dict_arrays(self): loaded = NestedData.load_from_zip(buffer) assert loaded.label == data.label - np.testing.assert_array_almost_equal(loaded.fields["pressure"], data.fields["pressure"]) - np.testing.assert_array_almost_equal(loaded.fields["density"], data.fields["density"]) + np.testing.assert_array_almost_equal( + loaded.fields["pressure"], data.fields["pressure"]) + np.testing.assert_array_almost_equal( + loaded.fields["density"], data.fields["density"]) def test_deterministic_encode(self): """Test that encode produces consistent output.""" @@ -166,7 +173,9 @@ def test_class_mismatch_error(self): data.save_to_zip(buffer) buffer.seek(0) - with pytest.raises(ValueError, match="Class mismatch"): + # Loading wrong class should fail with Pydantic validation error + # (missing required fields for SimulationResult) + with pytest.raises(Exception): # ValidationError from Pydantic SimulationResult.load_from_zip(buffer) def test_dict_of_basemodel_with_arrays(self): @@ -183,7 +192,8 @@ def test_dict_of_basemodel_with_arrays(self): "velocity": FieldData( name="velocity", type="vector", - data=np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=np.float32), + data=np.array( + [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=np.float32), units="m/s" ) } @@ -242,123 +252,512 @@ def test_dict_of_basemodel_with_optional_none_field(self): np.testing.assert_array_almost_equal( loaded.fields["pressure"].data, snapshot.fields["pressure"].data) - -class InnerPackable(Packable): - """Inner packable for testing nested support.""" - label: str = Field(..., description="Label") - data: np.ndarray = Field(..., description="Data array") - - -class OuterPackable(Packable): - """Outer packable containing a nested 
packable.""" - name: str = Field(..., description="Name") - inner: Optional[InnerPackable] = Field(None, description="Nested packable") - - -class TestNestedPackableCache: - """Test nested Packable with cache support.""" - - def test_nested_packable_without_cache(self): - """Test nested packable save/load without cache.""" - inner = InnerPackable( - label="inner", - data=np.array([1.0, 2.0, 3.0], dtype=np.float32) + def test_decode_without_class_raises_error(self): + """Test that Packable.decode() raises TypeError - must use specific class.""" + # Create and encode a SimpleData instance + original = SimpleData( + name="dynamic_test", + values=np.array([1.0, 2.0, 3.0], dtype=np.float32) ) - outer = OuterPackable(name="outer", inner=inner) - - buffer = BytesIO() - outer.save_to_zip(buffer) - - buffer.seek(0) - loaded = OuterPackable.load_from_zip(buffer) - - assert loaded.name == "outer" - assert loaded.inner is not None - assert loaded.inner.label == "inner" - np.testing.assert_array_almost_equal(loaded.inner.data, inner.data) + encoded = original.encode() - def test_nested_packable_with_cache(self): - """Test nested packable save/load with cache.""" - from meshly.data_handler import DataHandler + # Decode using base Packable class - should raise TypeError + with pytest.raises(TypeError, match="Cannot decode on base Packable class"): + Packable.decode(encoded) + + # Should work with the specific class + decoded = SimpleData.decode(encoded) + assert decoded.name == original.name + np.testing.assert_array_almost_equal(decoded.values, original.values) - inner = InnerPackable( - label="cached_inner", - data=np.array([4.0, 5.0, 6.0], dtype=np.float32) + def test_load_from_zip_without_class_raises_error(self): + """Test that Packable.load_from_zip() raises TypeError - must use specific class.""" + original = SimulationResult( + time=0.5, + temperature=np.array([300.0, 301.0, 302.0], dtype=np.float32), + velocity=np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32) ) - outer = OuterPackable(name="cached_outer", inner=inner) with tempfile.TemporaryDirectory() as tmpdir: - cache_path = os.path.join(tmpdir, "cache") - zip_path = os.path.join(tmpdir, "outer.zip") + path = os.path.join(tmpdir, "result.zip") + original.save_to_zip(path) - cache_handler = DataHandler.create(cache_path) - outer.save_to_zip(zip_path, cache_handler=cache_handler) + # Load using base Packable - should raise TypeError + with pytest.raises(TypeError, match="Cannot decode on base Packable class"): + Packable.load_from_zip(path) + + # Should work with the specific class + loaded = SimulationResult.load_from_zip(path) + assert loaded.time == pytest.approx(original.time) + np.testing.assert_array_almost_equal( + loaded.temperature, original.temperature) - cache_files = os.listdir(cache_path) - assert len(cache_files) == 1 - assert cache_files[0].endswith(".zip") - read_cache_handler = DataHandler.create(cache_path) - loaded = OuterPackable.load_from_zip(zip_path, cache_handler=read_cache_handler) +class TestExtractReconstruct: + """Test extract() and reconstruct() functionality.""" - assert loaded.name == "cached_outer" - assert loaded.inner is not None - assert loaded.inner.label == "cached_inner" - np.testing.assert_array_almost_equal(loaded.inner.data, inner.data) + def test_extract_simple(self): + """Test extract() returns data dict with refs and assets.""" + original = SimpleData( + name="test", + values=np.array([1.0, 2.0, 3.0], dtype=np.float32) + ) + + extracted = Packable.extract(original) + + # Data should have the 
primitive field + assert extracted.data["name"] == "test" + + # Array should be replaced with ref (no $type - we use schema) + assert "$ref" in extracted.data["values"] + checksum = extracted.data["values"]["$ref"] + + # Assets should contain the encoded array + assert checksum in extracted.assets + assert isinstance(extracted.assets[checksum], bytes) + + def test_reconstruct_simple(self): + """Test reconstruct() rebuilds the Packable from data and assets.""" + original = SimpleData( + name="roundtrip", + values=np.array([4.0, 5.0, 6.0], dtype=np.float32) + ) + + extracted = Packable.extract(original) + reconstructed = Packable.reconstruct(SimpleData, extracted.data, extracted.assets) + + assert reconstructed.name == original.name + np.testing.assert_array_almost_equal(reconstructed.values, original.values) + + def test_extract_reconstruct_simulation_result(self): + """Test extract/reconstruct with multiple arrays.""" + original = SimulationResult( + time=0.5, + temperature=np.array([300.0, 301.0], dtype=np.float32), + velocity=np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32) + ) + + extracted = Packable.extract(original) + + # Should have 2 assets (2 arrays) + assert len(extracted.assets) == 2 + + # Primitive field should be preserved + assert extracted.data["time"] == 0.5 + + # Arrays should be refs + assert "$ref" in extracted.data["temperature"] + assert "$ref" in extracted.data["velocity"] + + # Reconstruct + reconstructed = Packable.reconstruct(SimulationResult, extracted.data, extracted.assets) + + assert reconstructed.time == pytest.approx(original.time) + np.testing.assert_array_almost_equal(reconstructed.temperature, original.temperature) + np.testing.assert_array_almost_equal(reconstructed.velocity, original.velocity) + + def test_extract_data_is_json_serializable(self): + """Test that extracted data can be JSON serialized.""" + original = SimulationResult( + time=1.0, + temperature=np.array([100.0], dtype=np.float32), + velocity=np.array([[0.0]], dtype=np.float32) + ) + + extracted = Packable.extract(original) + + # Should be able to serialize to JSON + json_str = json.dumps(extracted.data) + assert isinstance(json_str, str) + + # And deserialize back + loaded_data = json.loads(json_str) + assert loaded_data["time"] == 1.0 + + def test_reconstruct_missing_asset_raises(self): + """Test that reconstruct raises KeyError when asset is missing.""" + data = {"name": "test", "values": {"$ref": "nonexistent_checksum"}} + + with pytest.raises(KeyError, match="Missing asset"): + Packable.reconstruct(SimpleData, data, {}) + + def test_extract_requires_basemodel(self): + """Test extract() requires a Pydantic BaseModel, not plain dict.""" + data = { + "name": "test", + "positions": np.array([[0, 0, 0], [1, 1, 1]], dtype=np.float32), + } + + with pytest.raises(TypeError, match="requires a Pydantic BaseModel"): + Packable.extract(data) + + def test_reconstruct_with_callable_returns_lazy_model(self): + """Test that reconstruct() with callable returns LazyModel for lazy loading.""" + from meshly.packable import LazyModel + + original = SimulationResult( + time=0.5, + temperature=np.array([300.0, 301.0], dtype=np.float32), + velocity=np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32) + ) + + extracted = Packable.extract(original) + + # Track which assets were requested + requested_checksums = [] + + def lazy_loader(checksum: str) -> bytes: + """Simulate lazy loading from external storage.""" + requested_checksums.append(checksum) + if checksum not in extracted.assets: + raise 
KeyError(f"Missing asset with checksum '{checksum}'") + return extracted.assets[checksum] + + # Reconstruct using callable - returns LazyModel + lazy = Packable.reconstruct( + SimulationResult, extracted.data, lazy_loader + ) + + # Should be a LazyModel, not loaded yet + assert isinstance(lazy, LazyModel) + assert len(requested_checksums) == 0 + + # Access fields to trigger loading + assert lazy.time == pytest.approx(original.time) + np.testing.assert_array_almost_equal(lazy.temperature, original.temperature) + np.testing.assert_array_almost_equal(lazy.velocity, original.velocity) + + # Now assets should be loaded + assert len(requested_checksums) == 2 + + def test_reconstruct_callable_missing_asset_raises_on_access(self): + """Test that callable asset provider raises KeyError on field access.""" + data = {"name": "test", "values": {"$ref": "nonexistent"}} + + def failing_loader(checksum: str) -> bytes: + raise KeyError(f"Missing asset with checksum '{checksum}'") + + # With callable, returns LazyModel immediately (no error) + lazy = Packable.reconstruct(SimpleData, data, failing_loader) + + # Error raised when accessing the field + with pytest.raises(KeyError, match="Missing asset"): + _ = lazy.values + + def test_lazy_reconstruct_defers_loading(self): + """Test that reconstruct() with callable doesn't load assets until accessed.""" + original = SimulationResult( + time=0.5, + temperature=np.array([300.0, 301.0], dtype=np.float32), + velocity=np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32) + ) + + extracted = Packable.extract(original) + requested_checksums = [] + + def tracking_loader(checksum: str) -> bytes: + requested_checksums.append(checksum) + return extracted.assets[checksum] + + # Create lazy model with callable - NO assets should be loaded yet + lazy = Packable.reconstruct( + SimulationResult, extracted.data, tracking_loader + ) + assert len(requested_checksums) == 0, "No assets should be loaded on creation" + + # Access primitive field - still no asset loading + assert lazy.time == pytest.approx(0.5) + assert len(requested_checksums) == 0, "Primitive access shouldn't load assets" + + # Access temperature - should load only temperature asset + temp = lazy.temperature + assert len(requested_checksums) == 1, "Should load exactly one asset" + np.testing.assert_array_almost_equal(temp, original.temperature) + + # Access temperature again - should use cache, not reload + temp2 = lazy.temperature + assert len(requested_checksums) == 1, "Cached access shouldn't reload" + + # Access velocity - should load velocity asset + vel = lazy.velocity + assert len(requested_checksums) == 2, "Should now have loaded both assets" + np.testing.assert_array_almost_equal(vel, original.velocity) + + def test_lazy_reconstruct_resolve(self): + """Test that resolve() returns the full Pydantic model.""" + original = SimulationResult( + time=1.0, + temperature=np.array([100.0], dtype=np.float32), + velocity=np.array([[0.0]], dtype=np.float32) + ) + + extracted = Packable.extract(original) + + # Use callable to get LazyModel + lazy = Packable.reconstruct( + SimulationResult, extracted.data, lambda c: extracted.assets[c] + ) + + # Resolve to get actual model + resolved = lazy.resolve() + + # Should be actual SimulationResult instance + assert isinstance(resolved, SimulationResult) + assert resolved.time == pytest.approx(1.0) + np.testing.assert_array_almost_equal(resolved.temperature, original.temperature) + + # Resolve again should return same instance + resolved2 = lazy.resolve() + assert resolved is 
resolved2 + + def test_lazy_model_repr(self): + """Test LazyModel has informative repr.""" + original = SimulationResult( + time=0.5, + temperature=np.array([300.0], dtype=np.float32), + velocity=np.array([[1.0]], dtype=np.float32) + ) + + extracted = Packable.extract(original) + lazy = Packable.reconstruct( + SimulationResult, extracted.data, lambda c: extracted.assets[c] + ) + + repr_str = repr(lazy) + assert "LazyModel" in repr_str + assert "SimulationResult" in repr_str + + # After accessing one field, repr should reflect that + _ = lazy.temperature + repr_str = repr(lazy) + assert "temperature" in repr_str + + def test_lazy_model_is_readonly(self): + """Test that LazyModel doesn't allow attribute setting.""" + original = SimpleData( + name="test", + values=np.array([1.0], dtype=np.float32) + ) + + extracted = Packable.extract(original) + lazy = Packable.reconstruct( + SimpleData, extracted.data, lambda c: extracted.assets[c] + ) + + with pytest.raises(AttributeError, match="read-only"): + lazy.name = "modified" - def test_cache_deduplication(self): - """Test that identical nested packables share the same cache file.""" + def test_reconstruct_with_cache_handler(self): + """Test that CachedAssetLoader persists fetched assets to disk.""" from meshly.data_handler import DataHandler - - inner1 = InnerPackable( - label="same", - data=np.array([1.0, 2.0], dtype=np.float32) + from meshly.packable import CachedAssetLoader + + original = SimulationResult( + time=0.5, + temperature=np.array([300.0, 301.0], dtype=np.float32), + velocity=np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32) ) - inner2 = InnerPackable( - label="same", + + extracted = Packable.extract(original) + fetch_count = [0] # Use list to track calls in closure + + def counting_loader(checksum: str) -> bytes: + fetch_count[0] += 1 + return extracted.assets[checksum] + + with tempfile.TemporaryDirectory() as tmpdir: + cache_path = Path(tmpdir) / "cache" + cache_handler = DataHandler.create(cache_path) + + # First lazy model with CachedAssetLoader - should fetch from loader + loader1 = CachedAssetLoader(counting_loader, cache_handler) + lazy1 = Packable.reconstruct( + SimulationResult, extracted.data, loader1 + ) + + # Access temperature - should fetch and cache + _ = lazy1.temperature + assert fetch_count[0] == 1 + + # Access velocity - should fetch and cache + _ = lazy1.velocity + assert fetch_count[0] == 2 + + # Finalize to write cache + cache_handler.finalize() + + # Create new cache handler pointing to same location + cache_handler2 = DataHandler.create(cache_path) + + # Second lazy model with same cache - should read from cache + loader2 = CachedAssetLoader(counting_loader, cache_handler2) + lazy2 = Packable.reconstruct( + SimulationResult, extracted.data, loader2 + ) + + # Access both fields - should NOT call loader (reads from cache) + temp2 = lazy2.temperature + vel2 = lazy2.velocity + assert fetch_count[0] == 2, "Should read from cache, not call loader" + + # Verify data integrity + np.testing.assert_array_almost_equal(temp2, original.temperature) + np.testing.assert_array_almost_equal(vel2, original.velocity) + + +class TestNestedPackableRejection: + """Test that direct Packable fields are rejected, but nested in dicts is allowed.""" + + def test_direct_nested_packable_rejected(self): + """Test that a Packable field containing another Packable is rejected.""" + + class InnerPackable(Packable): + label: str + data: np.ndarray + + class OuterPackable(Packable): + name: str + inner: Optional[InnerPackable] = None + + 
inner = InnerPackable( + label="inner", data=np.array([1.0, 2.0], dtype=np.float32) ) - outer1 = OuterPackable(name="outer1", inner=inner1) - outer2 = OuterPackable(name="outer2", inner=inner2) + + with pytest.raises(TypeError, match="Direct Packable fields are not allowed"): + OuterPackable(name="outer", inner=inner) + + def test_dict_of_packables_allowed(self): + """Test that Dict[str, Packable] is allowed (Packable inside typed dict).""" + + class ContainerPackable(Packable): + name: str + items: Dict[str, SimpleData] = Field(default_factory=dict) + + inner = SimpleData( + name="inner", + values=np.array([1.0, 2.0, 3.0], dtype=np.float32) + ) + + # Should be allowed with typed dict + container = ContainerPackable(name="container", items={"nested": inner}) + assert container.name == "container" + assert isinstance(container.items["nested"], SimpleData) + + def test_extract_typed_dict_with_nested_packables(self): + """Test that extract() handles typed dicts with nested Packables.""" + + class ContainerPackable(Packable): + name: str + items: Dict[str, SimpleData] = Field(default_factory=dict) + + inner = SimpleData( + name="inner", + values=np.array([1.0, 2.0, 3.0], dtype=np.float32) + ) + + container = ContainerPackable(name="container", items={"nested": inner}) + + # Extract should create refs for the nested Packable + extracted = Packable.extract(container) + + # The nested packable should be a ref (no $type - schema provides type info) + assert "$ref" in extracted.data["items"]["nested"] + + # Should have asset for the nested packable + assert len(extracted.assets) >= 1 + + def test_reconstruct_typed_dict_with_nested_packables(self): + """Test that reconstruct() handles typed dicts with nested Packables.""" + + class ContainerPackable(Packable): + name: str + items: Dict[str, SimpleData] = Field(default_factory=dict) + + inner = SimpleData( + name="inner", + values=np.array([1.0, 2.0, 3.0], dtype=np.float32) + ) + + container = ContainerPackable(name="container", items={"nested": inner}) + + # Extract and reconstruct + extracted = Packable.extract(container) + reconstructed = Packable.reconstruct(ContainerPackable, extracted.data, extracted.assets) + + assert reconstructed.name == "container" + assert isinstance(reconstructed.items["nested"], SimpleData) + assert reconstructed.items["nested"].name == "inner" + np.testing.assert_array_almost_equal( + reconstructed.items["nested"].values, inner.values + ) + + def test_none_nested_packable_allowed(self): + """Test that Optional[Packable] = None is allowed.""" + + class InnerPackable(Packable): + label: str + data: np.ndarray + + class OuterPackable(Packable): + name: str + inner: Optional[InnerPackable] = None + + # Should work with None + outer = OuterPackable(name="outer", inner=None) + assert outer.name == "outer" + assert outer.inner is None + + +class TestDataHandler: + """Test DataHandler functionality.""" + + def test_context_manager_file_handler(self): + """Test DataHandler can be used as context manager with FileHandler.""" + from meshly.data_handler import DataHandler with tempfile.TemporaryDirectory() as tmpdir: - cache_path = os.path.join(tmpdir, "cache") - zip1_path = os.path.join(tmpdir, "outer1.zip") - zip2_path = os.path.join(tmpdir, "outer2.zip") + with DataHandler.create(tmpdir) as handler: + handler.write_text("test.txt", "hello world") + assert handler.exists("test.txt") - cache_handler = DataHandler.create(cache_path) - outer1.save_to_zip(zip1_path, cache_handler=cache_handler) - outer2.save_to_zip(zip2_path, 
cache_handler=cache_handler) + # File should still exist after context exit + assert os.path.exists(os.path.join(tmpdir, "test.txt")) - cache_files = os.listdir(cache_path) - assert len(cache_files) == 1 + def test_context_manager_zip_handler(self): + """Test DataHandler can be used as context manager with ZipHandler.""" + from meshly.data_handler import DataHandler - read_cache_handler = DataHandler.create(cache_path) - loaded1 = OuterPackable.load_from_zip(zip1_path, cache_handler=read_cache_handler) - loaded2 = OuterPackable.load_from_zip(zip2_path, cache_handler=read_cache_handler) + buffer = BytesIO() + with DataHandler.create(buffer) as handler: + handler.write_text("metadata.json", '{"test": true}') + handler.write_binary("data.bin", b"binary content") - assert loaded1.inner.label == "same" - assert loaded2.inner.label == "same" + # After context exit, zip should be finalized and readable + buffer.seek(0) + with DataHandler.create(BytesIO(buffer.read())) as reader: + content = reader.read_text("metadata.json") + assert content == '{"test": true}' + assert reader.read_binary("data.bin") == b"binary content" - def test_cache_missing_falls_back_to_embedded(self): - """Test loading works when cache file is missing but data is embedded.""" + def test_remove_file(self): + """Test remove_file functionality for FileHandler.""" from meshly.data_handler import DataHandler - inner = InnerPackable( - label="fallback", - data=np.array([7.0, 8.0], dtype=np.float32) - ) - outer = OuterPackable(name="fallback_outer", inner=inner) + with tempfile.TemporaryDirectory() as tmpdir: + handler = DataHandler.create(tmpdir) + handler.write_text("to_delete.txt", "temporary") + assert handler.exists("to_delete.txt") - buffer = BytesIO() - outer.save_to_zip(buffer) + handler.remove_file("to_delete.txt") + assert not handler.exists("to_delete.txt") - with tempfile.TemporaryDirectory() as tmpdir: - cache_path = os.path.join(tmpdir, "cache") - os.makedirs(cache_path) - read_cache_handler = DataHandler.create(cache_path) - buffer.seek(0) - loaded = OuterPackable.load_from_zip(buffer, cache_handler=read_cache_handler) - - assert loaded.name == "fallback_outer" - assert loaded.inner.label == "fallback" + def test_remove_file_zip_raises(self): + """Test remove_file raises NotImplementedError for ZipHandler.""" + from meshly.data_handler import DataHandler + + buffer = BytesIO() + with DataHandler.create(buffer) as handler: + handler.write_text("test.txt", "content") + with pytest.raises(NotImplementedError): + handler.remove_file("test.txt") diff --git a/typescript/README.md b/typescript/README.md index ab72f02..f1f3249 100644 --- a/typescript/README.md +++ b/typescript/README.md @@ -18,6 +18,8 @@ pnpm add meshly - Support for polygon meshes with automatic triangulation - Marker extraction for boundary conditions and regions - Custom field decoding via `getCustomFields()` override +- **Reconstruct API** for resolving `$ref` asset references +- **CachedAssetLoader** for disk-cached asset loading - Full TypeScript type definitions ## Quick Start @@ -92,8 +94,6 @@ protected static override getCustomFields(): Record { ```typescript // Base metadata (matches Python PackableMetadata) interface PackableMetadata { - class_name: string - module_name: string field_data?: Record } @@ -198,14 +198,32 @@ const inletIndices = await Mesh.loadArray(zipData, 'markerIndices.inlet') interface DataHandler { // Read binary content from a file readBinary(path: string): Promise + // Write binary content to a file (optional) + 
writeBinary?(path: string, content: Uint8Array | ArrayBuffer): Promise // Check if a file exists (optional) exists?(path: string): Promise } -// Create a DataHandler from a hash loader function -function createDataHandler( - loader: (hash: string) => Promise -): DataHandler +// Asset fetch function type +type AssetFetcher = (checksum: string) => Promise + +// Asset provider: either a dict of assets or a fetcher function +type AssetProvider = Record | AssetFetcher +``` + +### CachedAssetLoader + +```typescript +// Asset loader with optional disk cache for persistence +class CachedAssetLoader { + constructor( + fetch: AssetFetcher, // Function that fetches asset bytes by checksum + cache: DataHandler // DataHandler for caching fetched assets + ) + + // Get asset bytes, checking cache first then fetching if needed + async getAsset(checksum: string): Promise +} ``` ### CustomFieldConfig @@ -228,11 +246,18 @@ interface CustomFieldConfig { constructor(data: TData) - // Decode from zip data (with optional cache handler for nested packables) - static async decode( - zipData: ArrayBuffer | Uint8Array, - cacheHandler?: DataHandler - ): Promise> + // Decode from zip data + static async decode(zipData: ArrayBuffer | Uint8Array): Promise> + + // Reconstruct from extracted data and assets + static async reconstruct( + data: Record, + assets: AssetProvider | CachedAssetLoader, + schema?: ReconstructSchema + ): Promise + + // Decode packed array format (metadata + data bytes) + static _decodePackedArray(packed: Uint8Array | ArrayBuffer): TypedArray // Load single array static async loadArray(zipData: ArrayBuffer | Uint8Array, name: string): Promise @@ -242,9 +267,6 @@ class Packable { // Custom field configuration (override in subclasses) protected static getCustomFields(): Record - - // Packable field types for nested packable decoding (override in subclasses) - protected static getPackableFieldTypes(): Record } ``` @@ -267,8 +289,8 @@ class Mesh extends Packable { isUniformPolygons(): boolean getPolygonIndices(): Uint32Array[] | Uint32Array - // Decoding (with optional cache handler for nested packables) - static async decode(zipData: ArrayBuffer | Uint8Array, cacheHandler?: DataHandler): Promise + // Decoding + static async decode(zipData: ArrayBuffer | Uint8Array): Promise // Marker extraction extractByMarker(markerName: string): Mesh @@ -280,6 +302,7 @@ class Mesh extends Packable { // Custom field configuration for meshoptimizer decoding protected static override getCustomFields(): Record> } +} ``` ### MeshData Interface @@ -303,10 +326,7 @@ interface MeshData { ```typescript // Base metadata for all Packable types interface PackableMetadata { - class_name: string - module_name: string field_data?: Record - packable_refs?: Record // SHA256 hash refs for cached packables } // Mesh-specific metadata extending base @@ -323,49 +343,53 @@ interface MeshSize { } ``` -### Cache Support - -When loading meshes with nested Packables that were saved with caching (using Python's `cache_handler`), provide a `DataHandler`: +### Reconstruct Schema Types ```typescript -import { Mesh, DataHandler, createDataHandler } from 'meshly' - -// Example: Fetch from server cache using createDataHandler helper -const cacheHandler = createDataHandler(async (hash) => { - const response = await fetch(`/cache/${hash}.zip`) - return response.ok ? 
response.arrayBuffer() : undefined -}) - -// Decode with cache support -const mesh = await Mesh.decode(zipData, cacheHandler) +// Decoder function for Packable types +type PackableDecoder = (data: Uint8Array | ArrayBuffer) => Promise | T + +// Schema for a single field +type FieldSchema = + | { type: 'array'; element?: FieldSchema } // TypedArray or Array of items + | { type: 'packable'; decode: PackableDecoder } // Nested Packable + | { type: 'dict'; value?: FieldSchema } // Dict with uniform value type + | { type: 'object'; fields?: ReconstructSchema } // Object with known field types + +// Schema mapping field names to their types +type ReconstructSchema = Record + +// Result of Python's Packable.extract() +interface SerializedPackableData { + data: Record // Serializable dict with $ref references + assets: Record // Map of checksum -> encoded bytes +} ``` -**DataHandler examples:** +### Reconstruct Example ```typescript -// From IndexedDB using createDataHandler -const idbHandler = createDataHandler(async (hash) => { - const db = await openDB('meshly-cache') - return db.get('packables', hash) -}) - -// From Map (in-memory) -const memoryCache = new Map() -const memoryHandler = createDataHandler(async (hash) => memoryCache.get(hash)) - -// Custom class implementing DataHandler interface -class ServerCacheHandler implements DataHandler { - constructor(private baseUrl: string) {} - - async readBinary(path: string): Promise { - const response = await fetch(`${this.baseUrl}/${path}`) - return response.ok ? response.arrayBuffer() : undefined +import { Packable, CachedAssetLoader, ReconstructSchema } from 'meshly' + +// Simple case - all $refs are arrays +const result = await Packable.reconstruct(data, assets) + +// With nested Packables - define schema for type hints +const schema: ReconstructSchema = { + mesh: { type: 'packable', decode: (bytes) => Mesh.decode(bytes) }, + snapshots: { + type: 'array', + element: { type: 'packable', decode: (bytes) => Mesh.decode(bytes) } } } - -const serverHandler = new ServerCacheHandler('/api/cache') -const mesh = await Mesh.decode(zipData, serverHandler) -``` +const result = await Packable.reconstruct(data, assets, schema) + +// With CachedAssetLoader for disk caching +const loader = new CachedAssetLoader( + async (checksum) => fetch(`/api/assets/${checksum}`).then(r => r.arrayBuffer()), + myDataHandler +) +const result = await Packable.reconstruct(data, loader, schema) ``` ### Utility Classes diff --git a/typescript/package.json b/typescript/package.json index 7d606d8..85d27d3 100644 --- a/typescript/package.json +++ b/typescript/package.json @@ -1,6 +1,6 @@ { "name": "meshly", - "version": "2.4.0-alpha", + "version": "2.5.0-alpha", "type": "commonjs", "description": "TypeScript library to decode Python meshoptimizer zip files into THREE.js geometries", "main": "dist/index.js", diff --git a/typescript/src/__tests__/loadArray.test.ts b/typescript/src/__tests__/loadArray.test.ts index 851a046..073acda 100644 --- a/typescript/src/__tests__/loadArray.test.ts +++ b/typescript/src/__tests__/loadArray.test.ts @@ -21,8 +21,6 @@ async function createTestMeshZip(): Promise { // Add metadata zip.file('metadata.json', JSON.stringify({ - class_name: 'Mesh', - module_name: 'meshly.mesh', mesh_size: { vertex_count: 3, vertex_size: 12, @@ -67,8 +65,6 @@ async function createTestMeshWithMarkersZip(): Promise { // Add metadata zip.file('metadata.json', JSON.stringify({ - class_name: 'Mesh', - module_name: 'meshly.mesh', mesh_size: { vertex_count: 4, vertex_size: 12, diff 
diff --git a/typescript/src/__tests__/reconstruct.test.ts b/typescript/src/__tests__/reconstruct.test.ts new file mode 100644 index 0000000..2bad2fb --- /dev/null +++ b/typescript/src/__tests__/reconstruct.test.ts @@ -0,0 +1,322 @@ +import { MeshoptEncoder } from 'meshoptimizer' +import { describe, expect, it } from 'vitest' +import { ArrayMetadata } from '../array' +import { AssetProvider, CachedAssetLoader, DataHandler } from '../data-handler' +import { Packable, ReconstructSchema, SerializedPackableData } from '../packable' + +/** + * Helper to encode an array using meshoptimizer and pack with metadata. + * Matches Python's packed array format: [4 bytes metadata length][metadata json][array data] + */ +async function packArray( + values: Float32Array | Uint32Array | Int32Array, + dtype: string +): Promise<Uint8Array> { + await MeshoptEncoder.ready + + const itemsize = values.BYTES_PER_ELEMENT + const count = values.length + const shape = [count] + + // Encode with meshoptimizer + const encoded = MeshoptEncoder.encodeVertexBuffer( + new Uint8Array(values.buffer, values.byteOffset, values.byteLength), + count, + itemsize + ) + + // Create metadata + const metadata: ArrayMetadata = { shape, dtype, itemsize } + const metadataJson = JSON.stringify(metadata) + const metadataBytes = new TextEncoder().encode(metadataJson) + + // Pack: [4 bytes len][metadata][data] + const packed = new Uint8Array(4 + metadataBytes.length + encoded.length) + const view = new DataView(packed.buffer) + view.setUint32(0, metadataBytes.length, true) // little-endian + packed.set(metadataBytes, 4) + packed.set(encoded, 4 + metadataBytes.length) + + return packed +} + +/** + * Simple SHA256 hash (first 16 chars) for deterministic checksums + */ +async function sha256(data: Uint8Array): Promise<string> { + const hashBuffer = await crypto.subtle.digest('SHA-256', data) + const hashArray = Array.from(new Uint8Array(hashBuffer)) + return hashArray.map((b) => b.toString(16).padStart(2, '0')).join('').slice(0, 16) +} + +/** + * Helper to create extracted data format (simulating Python's Packable.extract output) + */ +async function createExtractedData( + fields: Record<string, unknown>, + arrays: Record<string, Float32Array | Uint32Array> +): Promise<SerializedPackableData> { + const data: Record<string, unknown> = { ...fields } + const assets: Record<string, Uint8Array> = {} + + for (const [name, values] of Object.entries(arrays)) { + const dtype = values instanceof Float32Array ?
'float32' : 'uint32' + const packed = await packArray(values, dtype) + const checksum = await sha256(packed) + data[name] = { $ref: checksum } + assets[checksum] = packed + } + + return { data, assets } +} + +describe('Packable.reconstruct', () => { + describe('with dict assets (eager loading)', () => { + it('reconstructs simple data with arrays', async () => { + const extracted = await createExtractedData( + { name: 'test', time: 0.5 }, + { + temperature: new Float32Array([300.0, 301.0, 302.0]), + velocity: new Float32Array([1.0, 0.0, 0.0, 1.0]) + } + ) + + const result = await Packable.reconstruct<{ + name: string + time: number + temperature: Float32Array + velocity: Float32Array + }>(extracted.data, extracted.assets) + + expect(result.name).toBe('test') + expect(result.time).toBe(0.5) + expect(result.temperature).toBeInstanceOf(Float32Array) + expect(Array.from(result.temperature)).toEqual([300.0, 301.0, 302.0]) + expect(Array.from(result.velocity)).toEqual([1.0, 0.0, 0.0, 1.0]) + }) + + it('throws KeyError for missing asset', async () => { + const data = { name: 'test', values: { $ref: 'nonexistent_checksum' } } + + await expect(Packable.reconstruct(data, {})).rejects.toThrow( + /Missing asset.*nonexistent_checksum/ + ) + }) + + it('preserves primitive fields unchanged', async () => { + const data = { + name: 'simulation_001', + time: 1.5, + active: true, + config: { iterations: 100, tolerance: 1e-6 } + } + + const result = await Packable.reconstruct(data, {}) + + expect(result).toEqual(data) + }) + + it('handles nested objects with refs', async () => { + const tempArray = new Float32Array([100.0, 200.0]) + const tempPacked = await packArray(tempArray, 'float32') + const tempChecksum = await sha256(tempPacked) + + const data = { + name: 'nested', + fields: { + temperature: { $ref: tempChecksum } + } + } + + const result = await Packable.reconstruct(data, { [tempChecksum]: tempPacked }) + + expect(result.name).toBe('nested') + expect(result.fields.temperature).toBeInstanceOf(Float32Array) + expect(Array.from(result.fields.temperature as Float32Array)).toEqual([100.0, 200.0]) + }) + + it('handles arrays of objects with refs', async () => { + const temp1 = new Float32Array([100.0]) + const temp2 = new Float32Array([200.0]) + const packed1 = await packArray(temp1, 'float32') + const packed2 = await packArray(temp2, 'float32') + const checksum1 = await sha256(packed1) + const checksum2 = await sha256(packed2) + + const data = { + snapshots: [ + { time: 0.0, temperature: { $ref: checksum1 } }, + { time: 1.0, temperature: { $ref: checksum2 } } + ] + } + + type Snapshot = { time: number; temperature: Float32Array } + const result = await Packable.reconstruct<{ snapshots: Snapshot[] }>(data, { + [checksum1]: packed1, + [checksum2]: packed2 + }) + + expect(result.snapshots).toHaveLength(2) + expect(result.snapshots[0].time).toBe(0.0) + expect(Array.from(result.snapshots[0].temperature)).toEqual([100.0]) + expect(result.snapshots[1].time).toBe(1.0) + expect(Array.from(result.snapshots[1].temperature)).toEqual([200.0]) + }) + }) + + describe('with callable assets (lazy loading)', () => { + it('defers loading until field access', async () => { + const extracted = await createExtractedData( + { name: 'lazy_test', time: 0.5 }, + { temperature: new Float32Array([300.0, 301.0]) } + ) + + const requestedChecksums: string[] = [] + const loader: AssetProvider = async (checksum: string) => { + requestedChecksums.push(checksum) + return extracted.assets[checksum] + } + + // Access primitive field only + 
const result = await Packable.reconstruct(extracted.data, loader) + + // Primitive field should be available + expect(result.name).toBe('lazy_test') + expect(result.time).toBe(0.5) + + // Array should have been fetched (TypeScript version is eager with callable too) + // Note: Unlike Python's LazyModel, TS reconstruct resolves all refs + expect(requestedChecksums.length).toBe(1) + }) + + it('throws when callable returns missing asset', async () => { + const data = { values: { $ref: 'missing' } } + const failingLoader: AssetProvider = async () => { + throw new Error("Missing asset with checksum 'missing'") + } + + await expect(Packable.reconstruct(data, failingLoader)).rejects.toThrow(/Missing asset/) + }) + }) + + describe('with schema for Packables', () => { + it('decodes nested Packable using schema decoder', async () => { + // Create a mock "packable" that's just raw bytes representing a simple object + const mockPackableBytes = new TextEncoder().encode(JSON.stringify({ type: 'mock', value: 42 })) + const checksum = await sha256(mockPackableBytes) + + // Custom decoder that parses the JSON + const mockDecoder = (data: Uint8Array | ArrayBuffer) => { + const bytes = data instanceof Uint8Array ? data : new Uint8Array(data) + return JSON.parse(new TextDecoder().decode(bytes)) + } + + const data = { + name: 'with_nested', + nested: { $ref: checksum } + } + + const schema: ReconstructSchema = { + nested: { type: 'packable', decode: mockDecoder } + } + + const result = await Packable.reconstruct(data, { [checksum]: mockPackableBytes }, schema) + + expect(result.name).toBe('with_nested') + expect(result.nested).toEqual({ type: 'mock', value: 42 }) + }) + + it('handles array of Packables with element schema', async () => { + const item1Bytes = new TextEncoder().encode(JSON.stringify({ id: 1 })) + const item2Bytes = new TextEncoder().encode(JSON.stringify({ id: 2 })) + const checksum1 = await sha256(item1Bytes) + const checksum2 = await sha256(item2Bytes) + + const mockDecoder = (data: Uint8Array | ArrayBuffer) => { + const bytes = data instanceof Uint8Array ? 
data : new Uint8Array(data) + return JSON.parse(new TextDecoder().decode(bytes)) + } + + const data = { + items: [{ $ref: checksum1 }, { $ref: checksum2 }] + } + + const schema: ReconstructSchema = { + items: { + type: 'array', + element: { type: 'packable', decode: mockDecoder } + } + } + + const result = await Packable.reconstruct(data, { + [checksum1]: item1Bytes, + [checksum2]: item2Bytes + }, schema) + + expect(result.items).toHaveLength(2) + expect(result.items[0]).toEqual({ id: 1 }) + expect(result.items[1]).toEqual({ id: 2 }) + }) + }) + + describe('_decodePackedArray', () => { + it('decodes packed array format correctly', async () => { + const original = new Float32Array([1.0, 2.0, 3.0, 4.0]) + const packed = await packArray(original, 'float32') + + const decoded = Packable._decodePackedArray(packed) + + expect(decoded).toBeInstanceOf(Float32Array) + expect(Array.from(decoded as Float32Array)).toEqual([1.0, 2.0, 3.0, 4.0]) + }) + + it('handles uint32 arrays', async () => { + const original = new Uint32Array([10, 20, 30]) + const packed = await packArray(original, 'uint32') + + const decoded = Packable._decodePackedArray(packed) + + expect(decoded).toBeInstanceOf(Uint32Array) + expect(Array.from(decoded as Uint32Array)).toEqual([10, 20, 30]) + }) + }) +}) + +describe('CachedAssetLoader', () => { + it('caches fetched assets', async () => { + const extracted = await createExtractedData({}, { values: new Float32Array([1.0, 2.0]) }) + const checksum = Object.keys(extracted.assets)[0] + + let fetchCount = 0 + const fetcher = async (c: string) => { + fetchCount++ + return extracted.assets[c] + } + + // Create a simple in-memory cache + const cache: Record<string, Uint8Array> = {} + const mockHandler: DataHandler = { + async readBinary(path: string) { + return cache[path] + }, + async writeBinary(path: string, content: Uint8Array | ArrayBuffer) { + cache[path] = content instanceof Uint8Array ? content : new Uint8Array(content) + }, + async exists(path: string) { + return path in cache + } + } + + const loader = new CachedAssetLoader(fetcher, mockHandler) + + // First fetch - should call fetcher + const result1 = await loader.getAsset(checksum) + expect(fetchCount).toBe(1) + expect(result1).toBeDefined() + + // Second fetch - should use cache + const result2 = await loader.getAsset(checksum) + expect(fetchCount).toBe(1) // Still 1, cached + expect(result2).toBeDefined() + }) +})
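The tests above hand `CachedAssetLoader` an in-memory fetcher; against a real backend the fetcher is just a checksum-to-bytes function. A hedged sketch (the `/api/assets/...` route is the placeholder used in the docs, not a real endpoint), using the `AssetFetcher` type added to data-handler.ts below:

```typescript
import { AssetFetcher } from 'meshly'

// Fetch asset bytes by checksum over HTTP, failing loudly so reconstruct() surfaces missing assets.
const httpFetcher: AssetFetcher = async (checksum) => {
  const response = await fetch(`/api/assets/${checksum}`)
  if (!response.ok) {
    throw new Error(`Missing asset with checksum '${checksum}'`)
  }
  return response.arrayBuffer()
}
```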
diff --git a/typescript/src/data-handler.ts index f63427b..bcf164b 100644 --- a/typescript/src/data-handler.ts +++ b/typescript/src/data-handler.ts @@ -15,6 +15,13 @@ export interface DataHandler { */ readBinary(path: string): Promise<Uint8Array | ArrayBuffer | undefined> + /** + * Write binary content to a file. + * @param path - File path + * @param content - Content to write + */ + writeBinary?(path: string, content: Uint8Array | ArrayBuffer): Promise<void> + /** * Check if a file exists. * @param path - File path @@ -24,17 +31,91 @@ } /** - * Create a DataHandler from a simple hash loader function. - * Provides backward compatibility for function-based loaders.
+ * Asset fetch function type - takes a checksum and returns asset bytes + */ +export type AssetFetcher = (checksum: string) => Promise<Uint8Array | ArrayBuffer> + +/** + * Asset provider: either a dict of assets or a fetcher function */ -export function createDataHandler( - loader: (hash: string) => Promise<ArrayBuffer | undefined> -): DataHandler { - return { - readBinary: async (path: string) => { - // Extract hash from path (e.g., "abc123.zip" -> "abc123") - const hash = path.replace(/\.zip$/, '') - return loader(hash) +export type AssetProvider = Record<string, Uint8Array | ArrayBuffer> | AssetFetcher + +/** + * Asset loader with optional disk cache for persistence. + * + * Wraps a fetch function with a DataHandler for caching. + * Fetched assets are stored as 'assets/{checksum}.bin' and read + * from cache on subsequent access. + * + * @example + * ```ts + * const loader = new CachedAssetLoader( + * async (checksum) => await fetch(`/api/assets/${checksum}`).then(r => r.arrayBuffer()), + * myDataHandler + * ) + * const model = await Packable.reconstruct(data, loader) + * ``` + */ +export class CachedAssetLoader { + constructor( + /** Function that fetches asset bytes by checksum */ + public readonly fetch: AssetFetcher, + /** DataHandler for caching fetched assets */ + public readonly cache: DataHandler + ) { } + + /** + * Get asset bytes, checking cache first then fetching if needed. + */ + async getAsset(checksum: string): Promise<Uint8Array | ArrayBuffer> { + const cachePath = `assets/${checksum}.bin` + + // Try cache first + if (this.cache.exists) { + const exists = await this.cache.exists(cachePath) + if (exists) { + const cached = await this.cache.readBinary(cachePath) + if (cached) return cached + } + } else { + // No exists method, try read directly + const cached = await this.cache.readBinary(cachePath) + if (cached) return cached } + + // Fetch from source + const fetched = await this.fetch(checksum) + + // Cache for next time + if (this.cache.writeBinary) { + const data = fetched instanceof Uint8Array ? fetched : new Uint8Array(fetched) + await this.cache.writeBinary(cachePath, data) + } + + return fetched + } +} + + +/** + * Helper to get asset bytes from an AssetProvider + */ +export async function getAsset( assets: AssetProvider | CachedAssetLoader, checksum: string ): Promise<Uint8Array | ArrayBuffer> { + if (assets instanceof CachedAssetLoader) { + return assets.getAsset(checksum) + } + + if (typeof assets === 'function') { + return assets(checksum) + } + + // Dict lookup + const asset = assets[checksum] + if (!asset) { + throw new Error(`Missing asset with checksum '${checksum}'`) } + return asset }
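All three `AssetProvider` forms (plain dict, fetcher function, `CachedAssetLoader`) funnel through the `getAsset` helper added above; a small usage sketch with purely illustrative checksum, bytes, and cache:

```typescript
import { CachedAssetLoader, DataHandler, getAsset } from 'meshly'

const checksum = 'abc123'                 // illustrative checksum
const bytes = new Uint8Array([1, 2, 3])   // illustrative asset bytes

// 1. Plain dict of assets
const fromDict = await getAsset({ [checksum]: bytes }, checksum)

// 2. Fetcher function
const fromFetcher = await getAsset(async () => bytes, checksum)

// 3. CachedAssetLoader wrapping a fetcher and a DataHandler cache
const store: Record<string, Uint8Array> = {}
const cache: DataHandler = {
  readBinary: async (path) => store[path],
  writeBinary: async (path, content) => {
    store[path] = content instanceof Uint8Array ? content : new Uint8Array(content)
  },
  exists: async (path) => path in store,
}
const fromLoader = await getAsset(new CachedAssetLoader(async () => bytes, cache), checksum)
```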
diff --git a/typescript/src/index.ts b/typescript/src/index.ts index c387376..9e61841 100644 --- a/typescript/src/index.ts +++ b/typescript/src/index.ts @@ -9,12 +9,22 @@ export { CustomDecoder, CustomFieldConfig, + FieldSchema, Packable, - PackableMetadata + PackableDecoder, + PackableMetadata, + ReconstructSchema, + SerializedPackableData } from './packable' // Export from data-handler module -export { DataHandler, createDataHandler } from './data-handler' +export { + AssetFetcher, + AssetProvider, + CachedAssetLoader, + DataHandler, + getAsset +} from './data-handler' // Export from array module export { ArrayMetadata, ArrayType, ArrayUtils, EncodedArray } from './array'
diff --git a/typescript/src/packable.ts b/typescript/src/packable.ts index e429005..526929a 100644 --- a/typescript/src/packable.ts +++ b/typescript/src/packable.ts @@ -7,8 +7,8 @@ */ import JSZip from "jszip" -import { ArrayUtils, TypedArray } from "./array" -import { DataHandler } from "./data-handler" +import { ArrayMetadata, ArrayUtils, EncodedArray, TypedArray } from "./array" +import { AssetProvider, CachedAssetLoader, getAsset } from "./data-handler" /** @@ -16,14 +16,8 @@ import { DataHandler } from "./data-handler" * Uses snake_case to match Python serialization format. */ export interface PackableMetadata { - /** Name of the class that created this data */ - class_name: string - /** Module where the class is defined */ - module_name: string /** Non-array field values */ field_data?: Record<string, unknown> - /** SHA256 hash references for cached packable fields (field_name -> hash) */ - packable_refs?: Record<string, string> } /** @@ -85,7 +79,8 @@ export class Packable { * Get custom field configurations for this class. * Subclasses override this to define custom decoders. */ - protected static getCustomFields(): Record<string, CustomFieldConfig> { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + protected static getCustomFields(): Record<string, CustomFieldConfig<any>> { return {} } @@ -117,89 +112,6 @@ export class Packable { } } - // ============================================================ - // Packable field handling - // ============================================================ - - /** - * Get packable field types for this class. - * Subclasses override this to declare nested Packable fields. - * Returns a map of field names to their Packable subclass constructors. - */ - protected static getPackableFieldTypes(): Record<string, typeof Packable> { - return {} - } - - /** - * Get the set of packable field names - */ - protected static getPackableFieldNames(): Set<string> { - return new Set(Object.keys(this.getPackableFieldTypes())) - } - - /** - * Decode packable fields from the zip or cache. - * - * Supports both embedded packables (in packables/ folder) and cached - * packables (referenced by SHA256 hash in metadata.packable_refs).
- */ - protected static async decodePackableFields( - zip: JSZip, - metadata: PackableMetadata, - data: Record, - cacheHandler?: DataHandler - ): Promise { - const packableFieldTypes = this.getPackableFieldTypes() - const loadedFields = new Set() - - // First, try to load from cache using hash refs - if (cacheHandler && metadata.packable_refs) { - for (const [fieldName, hash] of Object.entries(metadata.packable_refs)) { - const PackableClass = packableFieldTypes[fieldName] - if (!PackableClass) continue - - try { - const cachedData = await cacheHandler.readBinary(`${hash}.zip`) - if (cachedData) { - // Use the specific subclass's decode method with cache support - data[fieldName] = await PackableClass.decode(cachedData, cacheHandler) - loadedFields.add(fieldName) - } - } catch { - // Not in cache, will try embedded - } - } - } - - // Then load any embedded packables (for backward compatibility or no-cache case) - const packablesFolder = zip.folder("packables") - if (!packablesFolder) return - - const packableFiles: string[] = [] - packablesFolder.forEach((relativePath, file) => { - if (relativePath.endsWith(".zip") && !file.dir) { - packableFiles.push(relativePath) - } - }) - - for (const relativePath of packableFiles) { - // Extract field name: "inner_mesh.zip" -> "inner_mesh" - const fieldName = relativePath.slice(0, -4) - - // Skip if already loaded from cache - if (loadedFields.has(fieldName)) continue - - const PackableClass = packableFieldTypes[fieldName] - if (!PackableClass) continue - - const file = packablesFolder.file(relativePath) - if (file) { - const encodedBytes = await file.async('arraybuffer') - data[fieldName] = await PackableClass.decode(encodedBytes, cacheHandler) - } - } - } - // ============================================================ // Standard array loading // ============================================================ @@ -260,21 +172,15 @@ export class Packable { * Decode a Packable from zip data. * * @param zipData - Zip file bytes - * @param cacheHandler - Optional DataHandler to load cached packables by SHA256 hash. - * When provided and metadata contains packable_refs, - * nested packables are loaded from cache. * * Subclasses can override this to handle custom field decoding. 
*/ static async decode( - zipData: ArrayBuffer | Uint8Array, - cacheHandler?: DataHandler + zipData: ArrayBuffer | Uint8Array ): Promise<Record<string, unknown>> { const zip = await JSZip.loadAsync(zipData) const metadata = await Packable.loadMetadata(zip) const customFieldNames = this.getCustomFieldNames() - const packableFieldNames = this.getPackableFieldNames() - const skipFields = new Set([...customFieldNames, ...packableFieldNames]) const data: Record<string, unknown> = {} @@ -282,10 +188,7 @@ await this.decodeCustomFields(zip, metadata, data) // Load standard arrays - await this.loadStandardArrays(zip, data, skipFields) - - // Decode packable fields - await this.decodePackableFields(zip, metadata, data, cacheHandler) + await this.loadStandardArrays(zip, data, customFieldNames) // Merge non-array fields from metadata if (metadata.field_data) { @@ -307,11 +210,6 @@ fieldData: Record<string, unknown> ): void { for (const [key, value] of Object.entries(fieldData)) { - // Skip Python BaseModel reconstruction metadata - if (key === "__model_class__" || key === "__model_module__") { - continue - } - const existing = data[key] if ( @@ -326,33 +224,12 @@ existing as Record<string, unknown>, value as Record<string, unknown> ) - } else if (typeof value === "object" && value !== null && !ArrayBuffer.isView(value)) { - // Value is an object that might contain Python metadata - clean it - data[key] = Packable._stripModelMetadata(value as Record<string, unknown>) } else { data[key] = value } } } - /** - * Recursively strip Python BaseModel metadata keys from an object. - */ - private static _stripModelMetadata(obj: Record<string, unknown>): Record<string, unknown> { - const result: Record<string, unknown> = {} - for (const [key, value] of Object.entries(obj)) { - if (key === "__model_class__" || key === "__model_module__") { - continue - } - if (typeof value === "object" && value !== null && !ArrayBuffer.isView(value)) { - result[key] = Packable._stripModelMetadata(value as Record<string, unknown>) - } else { - result[key] = value - } - } - return result - } - /** * Load a single array from a zip file without loading the entire object. */ @@ -363,4 +240,184 @@ const zip = await JSZip.loadAsync(zipData) return ArrayUtils.loadArray(zip, name) } + + // ============================================================ + // Extract / Reconstruct for content-addressable storage + // ============================================================ + + /** + * Decode a packed array asset (metadata + data bytes) to a TypedArray. + * + * Format: [4 bytes metadata length][metadata json][array data] + */ + static _decodePackedArray(packed: Uint8Array | ArrayBuffer): TypedArray { + const bytes = packed instanceof Uint8Array ? packed : new Uint8Array(packed) + + // Read metadata length (4 bytes little-endian) + const metadataLen = bytes[0] | (bytes[1] << 8) | (bytes[2] << 16) | (bytes[3] << 24) + + // Parse metadata JSON + const metadataJson = new TextDecoder().decode(bytes.slice(4, 4 + metadataLen)) + const metadata: ArrayMetadata = JSON.parse(metadataJson) + + // Get array data + const arrayData = bytes.slice(4 + metadataLen) + + const encoded: EncodedArray = { data: arrayData, metadata } + return ArrayUtils.decodeArray(encoded) + }
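+  // Worked example (illustrative only, not part of the shipped code): a Float32Array([1, 2, 3])
+  // packed on the Python side might carry the metadata JSON {"shape":[3],"dtype":"float32","itemsize":4}
+  // (44 bytes), so the asset consumed above would begin with 2C 00 00 00 (metadataLen = 44,
+  // little-endian), the JSON would occupy bytes 4..47, and the meshoptimizer-encoded payload
+  // would start at byte offset 48.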
+ + /** + * Reconstruct a data object from extracted data and assets. + * + * Since TypeScript doesn't have runtime type information like Python's Pydantic, + * this provides a simpler approach: + * - Resolves $ref references to arrays or nested Packables + * - Uses the optional `schema` to determine which refs are Packables vs arrays + * + * @param data - The data dict from extract(), with $ref references + * @param assets - Asset provider (dict, function, or CachedAssetLoader) + * @param schema - Optional schema defining which fields are Packables + * @returns Reconstructed data object with resolved references + * + * @example + * ```ts + * // Simple case - all $refs are arrays + * const rebuilt = await Packable.reconstruct(data, assets) + * + * // With nested Packables - define schema + * const schema: ReconstructSchema = { + *   mesh: { type: 'packable', decode: Mesh.decode }, + *   snapshots: { + *     type: 'array', + *     element: { + *       type: 'object', + *       fields: { mesh: { type: 'packable', decode: Mesh.decode } } + *     } + *   } + * } + * const rebuilt = await Packable.reconstruct(data, assets, schema) + * ``` + */ + static async reconstruct<T = Record<string, any>>( + data: Record<string, unknown>, + assets: AssetProvider | CachedAssetLoader, + schema?: ReconstructSchema + ): Promise<T> { + const result: Record<string, unknown> = {} + + for (const [key, value] of Object.entries(data)) { + const fieldSchema = schema?.[key] + result[key] = await Packable._resolveValue(value, assets, fieldSchema) + } + + return result as T + } + + /** + * Resolve a single value, handling $ref, nested objects, and arrays. + */ + private static async _resolveValue( + value: unknown, + assets: AssetProvider | CachedAssetLoader, + schema?: FieldSchema + ): Promise<unknown> { + if (value === null || value === undefined) { + return value + } + + // Handle $ref references + if (isRefObject(value)) { + const checksum = value.$ref + const assetBytes = await getAsset(assets, checksum) + const bytes = assetBytes instanceof Uint8Array ? assetBytes : new Uint8Array(assetBytes) + + // Use schema to determine type, default to array + if (schema?.type === 'packable' && schema.decode) { + return schema.decode(bytes) + } + + // Default: decode as array + return Packable._decodePackedArray(bytes) + } + + // Handle arrays (JS arrays, not TypedArrays) + if (Array.isArray(value)) { + const elementSchema = schema?.type === 'array' ? schema.element : undefined + return Promise.all( + value.map(v => Packable._resolveValue(v, assets, elementSchema)) + ) + } + + // Handle nested objects + if (typeof value === 'object' && !ArrayBuffer.isView(value)) { + const obj = value as Record<string, unknown> + const result: Record<string, unknown> = {} + + for (const [k, v] of Object.entries(obj)) { + // Skip Python model metadata + if (k === '__model_class__' || k === '__model_module__') continue + + // Get nested schema if this is a dict schema + const nestedSchema = schema?.type === 'dict' ? schema.value : schema?.type === 'object' ? schema.fields?.[k] : undefined + result[k] = await Packable._resolveValue(v, assets, nestedSchema) + } + + return result + } + + // Primitive - return as-is + return value + } +}
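+// Illustrative only - a schema combining the field kinds declared below might look like:
+//   const schema: ReconstructSchema = {
+//     mesh:      { type: 'packable', decode: (b) => Mesh.decode(b) },
+//     meshes:    { type: 'dict', value: { type: 'packable', decode: (b) => Mesh.decode(b) } },
+//     snapshots: { type: 'array', element: { type: 'object', fields: { temperature: { type: 'array' } } } },
+//   }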
+ + +// ============================================================ +// Reconstruct Schema Types +// ============================================================ + +/** + * Reference object with $ref checksum + */ +interface RefObject { + $ref: string +} + +function isRefObject(value: unknown): value is RefObject { + return typeof value === 'object' && value !== null && '$ref' in value +} + +/** + * Decoder function for Packable types + */ +export type PackableDecoder<T = unknown> = (data: Uint8Array | ArrayBuffer) => Promise<T> | T + +/** + * Schema for a single field in reconstruct + */ +export type FieldSchema = + | { type: 'array'; element?: FieldSchema } // TypedArray or Array of items + | { type: 'packable'; decode: PackableDecoder } // Nested Packable + | { type: 'dict'; value?: FieldSchema } // Dict with uniform value type + | { type: 'object'; fields?: ReconstructSchema } // Object with known field types + +/** + * Schema mapping field names to their types for reconstruction. + * + * Without runtime type information, TypeScript needs hints to know + * which $ref values are Packables vs arrays. + */ +export type ReconstructSchema = Record<string, FieldSchema> + +/** + * Result of extracting a Packable for serialization. + * + * Contains the serializable data dict with checksum references, + * plus the encoded assets (arrays as bytes). + */ +export interface SerializedPackableData { + /** Serializable dict with primitive fields and checksum refs for arrays */ + data: Record<string, unknown> + /** Map of checksum -> encoded bytes for all arrays */ + assets: Record<string, Uint8Array> }
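Taken together, a consumer-side flow for this release might look like the sketch below (the field names, endpoint, and disk-backed handler are assumptions for illustration; `data` and `assets` are whatever Python's `Packable.extract()` produced):

```typescript
import { CachedAssetLoader, DataHandler, Packable, ReconstructSchema } from 'meshly'

// Shapes produced by the Python side; declared loosely here since they arrive over the wire.
declare const data: Record<string, unknown>
declare const assets: Record<string, Uint8Array>
declare const diskCache: DataHandler  // e.g. an IndexedDB- or filesystem-backed handler

// Eager reconstruction straight from the dict of assets.
const eager = await Packable.reconstruct<{ time: number; temperature: Float32Array }>(data, assets)

// Or wrap a remote fetcher with a disk cache and pass a schema hint for array fields
// (and any nested Packables) before reconstructing.
const loader = new CachedAssetLoader(
  async (checksum) => fetch(`/api/assets/${checksum}`).then(r => r.arrayBuffer()),
  diskCache
)
const schema: ReconstructSchema = { temperature: { type: 'array' } }
const cached = await Packable.reconstruct(data, loader, schema)
```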