Add fit, fit_transform, transform, predict, and score methods #119

Merged Jun 23, 2022 (31 commits)
3342ef4  Add fit, fit_transform, transform, predict, and score methods to AFQD… (richford, Jun 18, 2022)
da8202a  DEP: Update groupyr dependency (richford, Jun 19, 2022)
41b1537  MAINT: Update .zenodo.json to add Jason and John (richford, Jun 19, 2022)
7c39696  BF: Add model_ prefix to the fit, transform, predict, etc. methods (richford, Jun 19, 2022)
a0a1d31  Use allclose in doctest instead of exact values (richford, Jun 19, 2022)
7dd8099  Update cross_validate doctest expected result (richford, Jun 19, 2022)
d567fc6  BF: Okay really correct the doctest values (richford, Jun 19, 2022)
0a96670  Use np.allclose in cross_validate doctest (richford, Jun 19, 2022)
2c6f1b7  Add tests of model fit, transform, predict, etc. (richford, Jun 19, 2022)
068226f  Add AFQDataset to doc/api.rst (richford, Jun 19, 2022)
ebf7c6d  Remove y param from model.transform (richford, Jun 19, 2022)
a6cacd1  Add a doc example to show manipulation of AFQDataset (richford, Jun 20, 2022)
110fa5d  STY: Fix flake error in demo_afq_dataset.py (richford, Jun 20, 2022)
3da02c9  Add plot_hbn_site_profiles example (richford, Jun 21, 2022)
2378560  Add s3fs to dev dependencies (richford, Jun 21, 2022)
f54c4c8  BF: Fix input checking for plot_bundle_profiles function (richford, Jun 21, 2022)
5de5c45  Undo the redundant commits to plot_bundle_profiles (richford, Jun 21, 2022)
d87fc35  Add test for AFQDataset.copy() (richford, Jun 21, 2022)
ef4e9a2  BF: Use equal_nan=True in unit tests for AFQDataset.copy() (richford, Jun 21, 2022)
2df83e0  Update afqinsight/datasets.py (richford, Jun 21, 2022)
644e6d0  Update afqinsight/datasets.py (richford, Jun 21, 2022)
877a5a0  Update afqinsight/datasets.py (richford, Jun 21, 2022)
31d1bcc  Update afqinsight/datasets.py (richford, Jun 21, 2022)
8b0511b  Update afqinsight/datasets.py (richford, Jun 21, 2022)
e0a3520  Use np.allclose to verify deep copy in AFQDataset.copy unit test (richford, Jun 21, 2022)
3354d87  Use ellipses and normalize whitespace in doctest for cross_validate (richford, Jun 21, 2022)
edce6a5  Merge branch 'enh/fit-on-dataset' of github.com:richford/AFQ-Insight … (richford, Jun 21, 2022)
7bcb14c  DOC: Incorporate @arokem's suggestions into autodoc examples (richford, Jun 22, 2022)
9e22072  STY: Fix flake8 trailing whitespace error (richford, Jun 22, 2022)
4356673  Update examples/plot_hbn_site_profiles.py (richford, Jun 22, 2022)
8c6c626  STY: change 'for' to 'to' (richford, Jun 22, 2022)
10 changes: 10 additions & 0 deletions .zenodo.json
@@ -15,6 +15,16 @@
"affiliation": "The University of Washington",
"name": "Rokem, Ariel",
"orcid": "0000-0003-0679-1985"
},
{
"affiliation": "University of Washington",
"name": "Kruper, John",
"orcid": "0000-0003-0081-391X"
},
{
"affiliation": "Stanford University",
"name": "Yeatman, Jason",
"orcid": "0000-0002-2686-1293"
}
],
"description": "<p>AFQ-Insight is a Python library for statistical learning with tractometry data.</p>",
5 changes: 3 additions & 2 deletions afqinsight/cross_validate.py
@@ -348,6 +348,7 @@ def cross_validate_checkpoint(

Examples
--------
>>> import numpy as np
>>> import shutil
>>> import tempfile
>>> from sklearn import datasets, linear_model
@@ -364,8 +365,8 @@
>>> cv_results = cross_validate_checkpoint(lasso, X, y, cv=3, checkpoint=False)
>>> sorted(cv_results.keys())
['fit_time', 'score_time', 'test_score']
>>> cv_results['test_score']
array([0.33150734, 0.08022311, 0.03531764])
>>> cv_results['test_score'] # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
array([0.33150..., 0.08022..., 0.03531...])

Multiple metric evaluation using ``cross_validate``, an estimator
pipeline, and checkpointing (please refer the ``scoring`` parameter doc
130 changes: 130 additions & 0 deletions afqinsight/datasets.py
@@ -635,6 +635,26 @@ def shape(self):
else:
return self.X.shape

def copy(self):
"""Return a deep copy of this dataset.

Returns
-------
AFQDataset
A deep copy of this dataset
"""
return AFQDataset(
X=self.X,
y=self.y,
groups=self.groups,
feature_names=self.feature_names,
target_cols=self.target_cols,
group_names=self.group_names,
subjects=self.subjects,
sessions=self.sessions,
classes=self.classes,
)

def bundle_means(self):
"""Return diffusion metrics averaged along the length of each bundle.

@@ -749,6 +769,116 @@ def as_tensorflow_dataset(self, bundles_as_channels=True, channels_last=True):
else:
return tf.data.Dataset.from_tensor_slices((X, self.y.astype(float)))

def model_fit(self, model, **fit_params):
"""Fit the dataset with a provided model object.

Parameters
----------
model : sklearn model
The estimator or transformer to fit

**fit_params : dict
Additional parameters to pass to the fit method

Returns
-------
model : object
The fitted model
"""
return model.fit(X=self.X, y=self.y, **fit_params)

def model_fit_transform(self, model, **fit_params):
"""Fit and transform the dataset with a provided model object.

Parameters
----------
model : sklearn model
The estimator or transformer to fit

**fit_params : dict
Additional parameters to pass to the fit_transform method

Returns
-------
dataset_new : AFQDataset
New AFQDataset with transformed features
"""
return AFQDataset(
X=model.fit_transform(X=self.X, y=self.y, **fit_params),
y=self.y,
groups=self.groups,
feature_names=self.feature_names,
target_cols=self.target_cols,
group_names=self.group_names,
subjects=self.subjects,
sessions=self.sessions,
classes=self.classes,
)

def model_transform(self, model, **transform_params):
"""Transform the dataset with a provided model object.

Parameters
----------
model : sklearn model
The estimator or transformer to use to transform the features

**transform_params : dict
Additional parameters to pass to the transform method

Returns
-------
dataset_new : AFQDataset
New AFQDataset with transformed features
"""
return AFQDataset(
X=model.transform(X=self.X, **transform_params),
y=self.y,
groups=self.groups,
feature_names=self.feature_names,
target_cols=self.target_cols,
group_names=self.group_names,
subjects=self.subjects,
sessions=self.sessions,
classes=self.classes,
)

def model_predict(self, model, **predict_params):
"""Predict the targets with a provided model object.

Parameters
----------
model : sklearn model
The estimator or transformer to use to predict the targets

**predict_params : dict
Additional parameters to pass to the predict method

Returns
-------
y_pred : ndarray
Predicted targets
"""
return model.predict(X=self.X, **predict_params)

def model_score(self, model, **score_params):
"""Score a model on this dataset.

Parameters
----------
model : sklearn model
The estimator or transformer to use to score the model

**score_params : dict
Additional parameters to pass to the `score` method, e.g., `sample_weight`

Returns
-------
score : float
The score of the model (e.g. R2, accuracy, etc.)
"""
return model.score(X=self.X, y=self.y, **score_params)


def _download_url_to_file(url, output_fn, encoding="utf-8", verbose=True):
fn_abs = op.abspath(output_fn)
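All five `model_*` methods added above follow one delegation pattern: forward `self.X` (and, where relevant, `self.y`) to the wrapped estimator, and wrap any transformed features in a new dataset. A minimal sketch of that pattern, using a stand-in `MiniDataset` class and a tiny hand-rolled `MeanImputer` instead of the real `AFQDataset` and scikit-learn (all names here are illustrative, not part of the library):

```python
import numpy as np


class MiniDataset:
    """Stand-in illustrating AFQDataset's delegation pattern."""

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def model_fit(self, model, **fit_params):
        # Fit the model on this dataset's features and targets.
        return model.fit(X=self.X, y=self.y, **fit_params)

    def model_transform(self, model, **transform_params):
        # Return a NEW dataset with transformed features; y is untouched.
        return MiniDataset(
            X=model.transform(X=self.X, **transform_params), y=self.y
        )


class MeanImputer:
    """Tiny column-mean imputer mimicking a fit/transform API."""

    def fit(self, X, y=None):
        self.statistics_ = np.nanmean(X, axis=0)  # per-column means, NaNs ignored
        return self

    def transform(self, X):
        X = np.asarray(X, dtype=float).copy()
        rows, cols = np.where(np.isnan(X))
        X[rows, cols] = self.statistics_[cols]  # fill NaNs with column means
        return X


X = np.array([[1.0, np.nan], [3.0, 4.0]])
dataset = MiniDataset(X, y=np.array([0.0, 1.0]))
imputer = dataset.model_fit(MeanImputer())  # returns the fitted imputer
imputed = dataset.model_transform(imputer)  # returns a new MiniDataset
```

The design choice worth noting is that `model_transform` does not mutate the dataset in place; like the PR's `model_fit_transform`, it returns a fresh dataset carrying the original metadata alongside the transformed feature matrix.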
65 changes: 65 additions & 0 deletions afqinsight/tests/test_datasets.py
@@ -14,6 +14,8 @@
AFQDataset,
standardize_subject_id,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso

data_path = op.join(afqi.__path__[0], "data")
test_data_path = op.join(data_path, "test_data")
@@ -163,6 +165,69 @@ def test_AFQDataset_shape_len_index():
assert repr(dataset) == "AFQDataset(n_samples=10, n_features=4)" # nosec


def test_AFQDataset_fit_transform():
sarica_dir = download_sarica()
dataset = AFQDataset.from_files(
fn_nodes=op.join(sarica_dir, "nodes.csv"),
fn_subjects=op.join(sarica_dir, "subjects.csv"),
dwi_metrics=["md", "fa"],
target_cols=["class"],
label_encode_cols=["class"],
)

# Test that model_fit fits the imputer
imputer = dataset.model_fit(SimpleImputer())
assert np.allclose(imputer.statistics_, np.nanmean(dataset.X, axis=0))

# Test that model_transform imputes the data
dataset_imputed = dataset.model_transform(imputer)
assert np.allclose(dataset_imputed.X, imputer.transform(dataset.X))

# Test that fit_transform does the same as fit and then transform
dataset_transformed = dataset.model_fit_transform(SimpleImputer())
assert np.allclose(dataset_transformed.X, dataset_imputed.X)


def test_AFQDataset_copy():
wh_dir = download_weston_havens()
dataset_1 = AFQDataset.from_files(
fn_nodes=op.join(wh_dir, "nodes.csv"),
fn_subjects=op.join(wh_dir, "subjects.csv"),
dwi_metrics=["md", "fa"],
target_cols=["Age"],
)
dataset_2 = dataset_1.copy()

# Test that it copied
assert np.allclose(dataset_1.X, dataset_2.X, equal_nan=True)
assert dataset_1.groups == dataset_2.groups
assert dataset_1.group_names == dataset_2.group_names
assert dataset_1.subjects == dataset_2.subjects

# Test that it's a deep copy
dataset_1.X = np.zeros_like(dataset_2.X)
dataset_1.y = np.zeros_like(dataset_2.y)
assert not np.allclose(dataset_2.X, dataset_1.X, equal_nan=True)
assert not np.allclose(dataset_1.y, dataset_2.y, equal_nan=True)


def test_AFQDataset_predict_score():
wh_dir = download_weston_havens()
dataset = AFQDataset.from_files(
fn_nodes=op.join(wh_dir, "nodes.csv"),
fn_subjects=op.join(wh_dir, "subjects.csv"),
dwi_metrics=["md", "fa"],
target_cols=["Age"],
)
dataset = dataset.model_fit_transform(SimpleImputer(strategy="median"))
estimator = dataset.model_fit(Lasso())
y_pred = dataset.model_predict(estimator)
assert np.allclose(estimator.predict(dataset.X), y_pred)
assert np.allclose(
estimator.score(dataset.X, dataset.y), dataset.model_score(estimator)
)


def test_drop_target_na():
dataset = AFQDataset(X=np.random.rand(10, 4), y=np.random.rand(10))
dataset.y[:5] = np.nan
32 changes: 16 additions & 16 deletions doc/api.rst
@@ -4,6 +4,14 @@ API Reference

.. currentmodule:: afqinsight

Datasets
========

This class encapsulates an AFQ dataset and provides static methods to read data
from CSV files conforming to the AFQ data standard.

.. autoclass:: AFQDataset

Pipelines
=========

@@ -13,22 +21,6 @@

.. autofunction:: make_afq_classifier_pipeline

Cross Validation
================

This function validates model performance using cross-validation, while
checkpointing the estimators and scores.

.. autofunction:: cross_validate_checkpoint

Dataset Loader
==============

This function reads data from csv files conforming to the AFQ data standard
and return feature and target matrices, grouping arrays, and subject IDs.

.. autofunction:: load_afq_data

Transformers
============

@@ -37,3 +29,11 @@
sklearn-compatible pipelines.

.. autoclass:: AFQDataFrameMapper

Cross Validation
================

This function validates model performance using cross-validation, while
checkpointing the estimators and scores.

.. autofunction:: cross_validate_checkpoint
5 changes: 4 additions & 1 deletion doc/conf.py
@@ -70,7 +70,10 @@
# source_encoding = 'utf-8-sig'

# Generate the plots for the gallery
plot_gallery = "True"
plot_gallery = True
gallery_conf = {
"filename_pattern": ["/plot", "/demo"],
}

# The master toctree document.
master_doc = "index"