diff --git a/doc/usage.rst b/doc/usage.rst index 681cb93..3be9a97 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -207,11 +207,22 @@ computer: 1. Graphical User Interface (GUI) ================================= -C-COMPASS allows you to save and load your sessions via the main toolbar. - -A session can be saved as a NumPy (``.npy``) file, which includes all datasets, -marker lists, settings, analyses, trainings, and statistics. These will be -fully restored upon loading. +C-COMPASS allows you to save and load your sessions via the main menu +(:menuselection:`File --> Save As`). +Saving after each significant step is recommended to avoid data loss. +The session file includes all datasets, marker lists, settings, +analyses, trainings, and statistics. These will be fully restored upon loading +(:menuselection:`File --> Open`). + +There are currently two options for saving your session: + +* A **NumPy/pickle** (``.npy``) file. This is the fastest option. + However, those files will not necessarily work across different versions + of Python, C-COMPASS, numpy, or pandas. +* A **zip** (``.ccompass``) file. This is significantly slower but more + reliable across different versions. + +The format can be chosen in the save dialog. 2. 
Before training ================== diff --git a/src/ccompass/core.py b/src/ccompass/core.py index e1d1d3e..91b8695 100644 --- a/src/ccompass/core.py +++ b/src/ccompass/core.py @@ -4,12 +4,16 @@ import copy import logging +import tempfile +import uuid +import zipfile from collections.abc import Iterable from pathlib import Path from typing import Any, Literal import numpy as np import pandas as pd +import yaml from pydantic import ( BaseModel, ConfigDict, @@ -725,6 +729,135 @@ def from_numpy(cls, filepath: Path | str): data = np.load(f, allow_pickle=True).item() return cls(**data) + def to_zip(self, filepath: Path | str): + """Serialize the model to a zip file with YAML, TSV, and numpy files.""" + filepath = Path(filepath) + filepath.parent.mkdir(parents=True, exist_ok=True) + + with tempfile.TemporaryDirectory() as temp_dir: + temp_dir = Path(temp_dir) + + def dataframe_representer(dumper, data: pd.DataFrame): + """Custom YAML representer for pandas DataFrames.""" + if data.empty: + return dumper.represent_scalar("!pandas.DataFrame", "") + + file_id = str(uuid.uuid4()) + file_path = temp_dir / f"{file_id}.tsv" + data.to_csv(file_path, sep="\t", index=True) + return dumper.represent_scalar( + "!pandas.DataFrame", file_path.name + ) + + def ndarray_representer(dumper, data): + """Custom YAML representer for numpy arrays.""" + file_id = str(uuid.uuid4()) + file_path = temp_dir / f"{file_id}.npy" + np.save(file_path, data, allow_pickle=False) + return dumper.represent_scalar( + "!numpy.ndarray", file_path.name + ) + + def series_representer(dumper, data): + """Custom YAML representer for pandas Series.""" + file_id = str(uuid.uuid4()) + file_path = temp_dir / f"{file_id}.tsv" + data.to_csv(file_path, sep="\t", index=True) + return dumper.represent_scalar( + "!pandas.Series", file_path.name + ) + + def float64_representer(dumper, data): + return dumper.represent_float(float(data)) + + def tuple_representer(dumper, data): + return dumper.represent_sequence("!tuple", 
data) + + yaml.add_representer( + np.float64, float64_representer, Dumper=yaml.SafeDumper + ) + yaml.add_representer( + pd.DataFrame, dataframe_representer, Dumper=yaml.SafeDumper + ) + yaml.add_representer( + np.ndarray, ndarray_representer, Dumper=yaml.SafeDumper + ) + yaml.add_representer( + pd.Series, series_representer, Dumper=yaml.SafeDumper + ) + yaml.add_representer( + tuple, tuple_representer, Dumper=yaml.SafeDumper + ) + + with open(temp_dir / "session.yaml", "w") as f: + yaml.safe_dump(self.model_dump(), f) + + with zipfile.ZipFile( + filepath, "w", compression=zipfile.ZIP_DEFLATED + ) as zipf: + for item in temp_dir.iterdir(): + zipf.write(item, item.name) + + @classmethod + def from_zip(cls, filepath: Path | str): + """Deserialize the model from a zip file with YAML and TSV files.""" + with tempfile.TemporaryDirectory() as temp_dir: + temp_dir = Path(temp_dir) + + with zipfile.ZipFile(filepath, "r") as zipf: + zipf.extractall(temp_dir) + + def dataframe_constructor(loader, node): + """Custom YAML constructor for pandas DataFrames.""" + if not (filename := loader.construct_scalar(node)): + return pd.DataFrame() + + file_path = temp_dir / filename + try: + return pd.read_csv(file_path, sep="\t", index_col=0) + except pd.errors.EmptyDataError: + return pd.DataFrame() + + def ndarray_constructor(loader, node): + """Custom YAML constructor for numpy arrays.""" + file_path = temp_dir / loader.construct_scalar(node) + return np.load(file_path, allow_pickle=False) + + def series_constructor(loader, node): + """Custom YAML constructor for pandas Series.""" + file_path = temp_dir / loader.construct_scalar(node) + df = pd.read_csv( + file_path, + sep="\t", + index_col=0, + header=0, + float_precision="round_trip", + ) + assert df.shape[1] == 1 + return df.iloc[:, 0] + + def tuple_constructor(loader, node): + return tuple(loader.construct_sequence(node)) + + yaml.add_constructor( + "!pandas.DataFrame", + dataframe_constructor, + Loader=yaml.SafeLoader, + ) + 
yaml.add_constructor( + "!numpy.ndarray", ndarray_constructor, Loader=yaml.SafeLoader + ) + yaml.add_constructor( + "!pandas.Series", series_constructor, Loader=yaml.SafeLoader + ) + yaml.add_constructor( + "!tuple", tuple_constructor, Loader=yaml.SafeLoader + ) + + with open(temp_dir / "session.yaml") as f: + data = yaml.safe_load(f) + return cls(**data) + def write_global_changes_reports( comparison: dict[tuple[ConditionId, ConditionId], ComparisonModel], diff --git a/src/ccompass/main_gui.py b/src/ccompass/main_gui.py index 80eedc7..ae8a9f4 100644 --- a/src/ccompass/main_gui.py +++ b/src/ccompass/main_gui.py @@ -1583,7 +1583,10 @@ def _handle_session_open(self): "Open Session", initial_folder=str(self.app_settings.last_session_dir), no_window=True, - file_types=(("Numpy", "*.npy"),), + file_types=( + ("Numpy", "*.npy"), + ("C-COMPASS zip", "*.ccompass"), + ), ) if not filename: return @@ -1621,7 +1624,10 @@ def _handle_session_save(self): filename = sg.popup_get_file( "Save Session", no_window=True, - file_types=(("Numpy", "*.npy"),), + file_types=( + ("Numpy", "*.npy"), + ("C-COMPASS zip", "*.ccompass"), + ), save_as=True, initial_folder=str(self.app_settings.last_session_dir), ) @@ -1633,7 +1639,10 @@ def _handle_session_save(self): self.app_settings.save() with wait_cursor(self.main_window): - self.model.to_numpy(filename) + if str(filename).endswith(".ccompass"): + self.model.to_zip(filename) + else: + self.model.to_numpy(filename) self._update_recent_files() @@ -2250,7 +2259,10 @@ def marker_setclass(values, marker_sets): def session_open(window: sg.Window, filename: str, model: SessionModel): """Read session data from file and update the window.""" # Update session data - tmp_session = SessionModel.from_numpy(filename) + if filename.endswith(".ccompass"): + tmp_session = SessionModel.from_zip(filename) + else: + tmp_session = SessionModel.from_numpy(filename) model.reset(tmp_session) # update GUI diff --git a/tests/test_full_analysis.py 
b/tests/test_full_analysis.py index b30e78d..6c882c2 100644 --- a/tests/test_full_analysis.py +++ b/tests/test_full_analysis.py @@ -2,6 +2,7 @@ from pathlib import Path import numpy as np +from test_session_model import assert_session_equal from ccompass._testing.synthetic_data import ( SyntheticDataConfig, @@ -201,3 +202,13 @@ def test_full(): ... sess.to_numpy(Path(__file__).parent / "session_test_full.npy") + sess.to_zip(Path(__file__).parent / "session_test_full.ccompass") + + sess2 = SessionModel.from_numpy( + Path(__file__).parent / "session_test_full.npy" + ) + assert_session_equal(sess, sess2) + sess2 = SessionModel.from_zip( + Path(__file__).parent / "session_test_full.ccompass" + ) + assert_session_equal(sess, sess2) diff --git a/tests/test_session_model.py b/tests/test_session_model.py index a81bc4a..ec7ea08 100644 --- a/tests/test_session_model.py +++ b/tests/test_session_model.py @@ -3,7 +3,9 @@ from pathlib import Path from tempfile import TemporaryDirectory +import numpy as np import pandas as pd +import pydantic from ccompass.main_gui import SessionModel @@ -20,8 +22,13 @@ def test_serialization(): assert_session_equal(session, session2) -def assert_equal(obj1, obj2): +def assert_equal(obj1, obj2) -> None: """Check if two objects are equal.""" + if isinstance(obj1, pydantic.BaseModel): + assert isinstance(obj2, pydantic.BaseModel) + assert_equal(obj1.model_dump(), obj2.model_dump()) + return + if isinstance(obj1, dict): for key in obj1: assert key in obj2 @@ -30,7 +37,20 @@ def assert_equal(obj1, obj2): for i in range(len(obj1)): assert_equal(obj1[i], obj2[i]) elif isinstance(obj1, pd.DataFrame): - pd.testing.assert_frame_equal(obj1, obj2) + assert isinstance(obj2, pd.DataFrame) + if obj1.empty and obj2.empty: + # if both are empty, we don't compare columns + return + pd.testing.assert_frame_equal(obj1, obj2, check_dtype=False) + elif isinstance(obj1, pd.Series): + assert isinstance(obj2, pd.Series), f"{obj1} != {obj2}" + 
pd.testing.assert_series_equal( + obj1, obj2, atol=1e-14, rtol=1e-14, check_dtype=False + ) + elif isinstance(obj1, np.ndarray): + np.testing.assert_almost_equal(obj1, obj2) + elif isinstance(obj1, float) and pd.isna(obj1): + assert pd.isna(obj2) else: assert obj1 == obj2 @@ -42,3 +62,16 @@ def assert_session_equal(session, session2): assert_equal(getattr(session, attr), getattr(session2, attr)) for attr in session2.__dict__: assert attr in session.__dict__ + + +def test_serialize_zip(): + """Test serialization of SessionModel to zip.""" + session = SessionModel() + + # round trip + with TemporaryDirectory() as tempdir: + fpath = Path(tempdir, "session.zip") + session.to_zip(fpath) + session2 = SessionModel.from_zip(fpath) + + assert_session_equal(session, session2)