Session serialization to zipped yaml/tsv #149

Merged
merged 7 commits on Feb 25, 2025
21 changes: 16 additions & 5 deletions doc/usage.rst
@@ -207,11 +207,22 @@ computer:
1. Graphical User Interface (GUI)
=================================

C-COMPASS allows you to save and load your sessions via the main toolbar.

A session can be saved as a NumPy (``.npy``) file, which includes all datasets,
marker lists, settings, analyses, trainings, and statistics. These will be
fully restored upon loading.
C-COMPASS allows you to save and load your sessions via the main menu
(:menuselection:`File --> Save As`).
Saving after each significant step is recommended to avoid data loss.
The session file includes all datasets, marker lists, settings,
analyses, trainings, and statistics. These will be fully restored upon loading
(:menuselection:`File --> Open`).

There are currently two options for saving your session:

* A **NumPy/pickle** (``.npy``) file. This is the fastest option.
However, those files will not necessarily work across different versions
of Python, C-COMPASS, numpy, or pandas.
* A **zip** (``.ccompass``) file. This is significantly slower but more
reliable across different versions.

The format can be chosen in the save dialog.

2. Before training
==================
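For readers who script against C-COMPASS rather than drive the GUI, the two formats described in the documentation diff above correspond to methods on the session model added in this PR. A minimal round-trip sketch, assuming `SessionModel` is importable from `ccompass.main_gui` as in the tests below, and using file names chosen purely for illustration:

    from pathlib import Path

    from ccompass.main_gui import SessionModel

    session = SessionModel()                          # empty session, for illustration only
    session.to_zip(Path("session.ccompass"))          # portable zip of YAML/TSV/.npy files
    restored = SessionModel.from_zip(Path("session.ccompass"))

    session.to_numpy(Path("session.npy"))             # faster, but pickle-based and version-sensitive
    restored = SessionModel.from_numpy(Path("session.npy"))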
133 changes: 133 additions & 0 deletions src/ccompass/core.py
@@ -4,12 +4,16 @@

import copy
import logging
import tempfile
import uuid
import zipfile
from collections.abc import Iterable
from pathlib import Path
from typing import Any, Literal

import numpy as np
import pandas as pd
import yaml
from pydantic import (
BaseModel,
ConfigDict,
@@ -725,6 +729,135 @@ def from_numpy(cls, filepath: Path | str):
data = np.load(f, allow_pickle=True).item()
return cls(**data)

def to_zip(self, filepath: Path | str):
"""Serialize the model to a zip file with YAML, TSV, and numpy files."""
filepath = Path(filepath)
filepath.parent.mkdir(parents=True, exist_ok=True)

with tempfile.TemporaryDirectory() as temp_dir:
temp_dir = Path(temp_dir)

def dataframe_representer(dumper, data: pd.DataFrame):
"""Custom YAML representer for pandas DataFrames."""
if data.empty:
return dumper.represent_scalar("!pandas.DataFrame", "")

file_id = str(uuid.uuid4())
file_path = temp_dir / f"{file_id}.tsv"
data.to_csv(file_path, sep="\t", index=True)
return dumper.represent_scalar(
"!pandas.DataFrame", file_path.name
)

def ndarray_representer(dumper, data):
"""Custom YAML representer for numpy arrays."""
file_id = str(uuid.uuid4())
file_path = temp_dir / f"{file_id}.npy"
np.save(file_path, data, allow_pickle=False)
return dumper.represent_scalar(
"!numpy.ndarray", file_path.name
)

def series_representer(dumper, data):
"""Custom YAML representer for pandas Series."""
file_id = str(uuid.uuid4())
file_path = temp_dir / f"{file_id}.tsv"
data.to_csv(file_path, sep="\t", index=True)
return dumper.represent_scalar(
"!pandas.Series", file_path.name
)

def float64_representer(dumper, data):
return dumper.represent_float(float(data))

def tuple_representer(dumper, data):
return dumper.represent_sequence("!tuple", data)

yaml.add_representer(
np.float64, float64_representer, Dumper=yaml.SafeDumper
)
yaml.add_representer(
pd.DataFrame, dataframe_representer, Dumper=yaml.SafeDumper
)
yaml.add_representer(
np.ndarray, ndarray_representer, Dumper=yaml.SafeDumper
)
yaml.add_representer(
pd.Series, series_representer, Dumper=yaml.SafeDumper
)
yaml.add_representer(
tuple, tuple_representer, Dumper=yaml.SafeDumper
)

with open(temp_dir / "session.yaml", "w") as f:
yaml.safe_dump(self.model_dump(), f)

with zipfile.ZipFile(
filepath, "w", compression=zipfile.ZIP_DEFLATED
) as zipf:
for item in temp_dir.iterdir():
zipf.write(item, item.name)

@classmethod
def from_zip(cls, filepath: Path | str):
"""Deserialize the model from a zip file with YAML and TSV files."""
with tempfile.TemporaryDirectory() as temp_dir:
temp_dir = Path(temp_dir)

with zipfile.ZipFile(filepath, "r") as zipf:
zipf.extractall(temp_dir)

def dataframe_constructor(loader, node):
"""Custom YAML constructor for pandas DataFrames."""
if not (filename := loader.construct_scalar(node)):
return pd.DataFrame()

file_path = temp_dir / filename
try:
return pd.read_csv(file_path, sep="\t", index_col=0)
except pd.errors.EmptyDataError:
return pd.DataFrame()

def ndarray_constructor(loader, node):
"""Custom YAML constructor for numpy arrays."""
file_path = temp_dir / loader.construct_scalar(node)
return np.load(file_path, allow_pickle=False)

def series_constructor(loader, node):
"""Custom YAML constructor for pandas Series."""
file_path = temp_dir / loader.construct_scalar(node)
df = pd.read_csv(
file_path,
sep="\t",
index_col=0,
header=0,
float_precision="round_trip",
)
assert df.shape[1] == 1
return df.iloc[:, 0]

def tuple_constructor(loader, node):
return tuple(loader.construct_sequence(node))

yaml.add_constructor(
"!pandas.DataFrame",
dataframe_constructor,
Loader=yaml.SafeLoader,
)
yaml.add_constructor(
"!numpy.ndarray", ndarray_constructor, Loader=yaml.SafeLoader
)
yaml.add_constructor(
"!pandas.Series", series_constructor, Loader=yaml.SafeLoader
)
yaml.add_constructor(
"!tuple", tuple_constructor, Loader=yaml.SafeLoader
)

with open(temp_dir / "session.yaml") as f:
data = yaml.safe_load(f)
return cls(**data)


def write_global_changes_reports(
comparison: dict[tuple[ConditionId, ConditionId], ComparisonModel],
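To make the resulting on-disk layout concrete: `to_zip` writes a single `session.yaml` in which DataFrames, Series, and arrays are replaced by custom tags (`!pandas.DataFrame`, `!pandas.Series`, `!numpy.ndarray`) that name sibling TSV/`.npy` members of the same archive, plus a `!tuple` tag for tuples. A quick way to peek inside such an archive with only the standard library (archive name assumed for illustration):

    import zipfile

    with zipfile.ZipFile("session.ccompass") as zf:
        print(zf.namelist())                           # ['session.yaml', '<uuid>.tsv', '<uuid>.npy', ...]
        print(zf.read("session.yaml").decode()[:500])  # the YAML index referencing the other members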
20 changes: 16 additions & 4 deletions src/ccompass/main_gui.py
@@ -1583,7 +1583,10 @@ def _handle_session_open(self):
"Open Session",
initial_folder=str(self.app_settings.last_session_dir),
no_window=True,
file_types=(("Numpy", "*.npy"),),
file_types=(
("Numpy", "*.npy"),
("C-COMPASS zip", "*.ccompass"),
),
)
if not filename:
return
@@ -1621,7 +1624,10 @@ def _handle_session_save(self):
filename = sg.popup_get_file(
"Save Session",
no_window=True,
file_types=(("Numpy", "*.npy"),),
file_types=(
("Numpy", "*.npy"),
("C-COMPASS zip", "*.ccompass"),
),
save_as=True,
initial_folder=str(self.app_settings.last_session_dir),
)
@@ -1633,7 +1639,10 @@
self.app_settings.save()

with wait_cursor(self.main_window):
self.model.to_numpy(filename)
if str(filename).endswith(".ccompass"):
self.model.to_zip(filename)
else:
self.model.to_numpy(filename)

self._update_recent_files()

@@ -2250,7 +2259,10 @@ def marker_setclass(values, marker_sets):
def session_open(window: sg.Window, filename: str, model: SessionModel):
"""Read session data from file and update the window."""
# Update session data
tmp_session = SessionModel.from_numpy(filename)
if filename.endswith(".ccompass"):
tmp_session = SessionModel.from_zip(filename)
else:
tmp_session = SessionModel.from_numpy(filename)
model.reset(tmp_session)

# update GUI
11 changes: 11 additions & 0 deletions tests/test_full_analysis.py
@@ -2,6 +2,7 @@
from pathlib import Path

import numpy as np
from test_session_model import assert_session_equal

from ccompass._testing.synthetic_data import (
SyntheticDataConfig,
@@ -201,3 +202,13 @@ def test_full():

...
sess.to_numpy(Path(__file__).parent / "session_test_full.npy")
sess.to_zip(Path(__file__).parent / "session_test_full.ccompass")

sess2 = SessionModel.from_numpy(
Path(__file__).parent / "session_test_full.npy"
)
assert_session_equal(sess, sess2)
sess2 = SessionModel.from_zip(
Path(__file__).parent / "session_test_full.ccompass"
)
assert_session_equal(sess, sess2)
37 changes: 35 additions & 2 deletions tests/test_session_model.py
@@ -3,7 +3,9 @@
from pathlib import Path
from tempfile import TemporaryDirectory

import numpy as np
import pandas as pd
import pydantic

from ccompass.main_gui import SessionModel

@@ -20,8 +22,13 @@ def test_serialization():
assert_session_equal(session, session2)


def assert_equal(obj1, obj2):
def assert_equal(obj1, obj2) -> None:
"""Check if two objects are equal."""
if isinstance(obj1, pydantic.BaseModel):
assert isinstance(obj2, pydantic.BaseModel)
assert_equal(obj1.model_dump(), obj2.model_dump())
return

if isinstance(obj1, dict):
for key in obj1:
assert key in obj2
@@ -30,7 +37,20 @@ def assert_equal(obj1, obj2):
for i in range(len(obj1)):
assert_equal(obj1[i], obj2[i])
elif isinstance(obj1, pd.DataFrame):
pd.testing.assert_frame_equal(obj1, obj2)
assert isinstance(obj2, pd.DataFrame)
if obj1.empty and obj2.empty:
# if both are empty, we don't compare columns
return
pd.testing.assert_frame_equal(obj1, obj2, check_dtype=False)
elif isinstance(obj1, pd.Series):
assert isinstance(obj2, pd.Series), f"{obj1} != {obj2}"
pd.testing.assert_series_equal(
obj1, obj2, atol=1e-14, rtol=1e-14, check_dtype=False
)
elif isinstance(obj1, np.ndarray):
np.testing.assert_almost_equal(obj1, obj2)
elif isinstance(obj1, float) and pd.isna(obj1):
assert pd.isna(obj2)
else:
assert obj1 == obj2

@@ -42,3 +62,16 @@ def assert_session_equal(session, session2):
assert_equal(getattr(session, attr), getattr(session2, attr))
for attr in session2.__dict__:
assert attr in session.__dict__


def test_serialize_zip():
"""Test serialization of SessionModel to zip."""
session = SessionModel()

# round trip
with TemporaryDirectory() as tempdir:
fpath = Path(tempdir, "session.zip")
session.to_zip(fpath)
session2 = SessionModel.from_zip(fpath)

assert_session_equal(session, session2)