Merge pull request #123 from arokem/use_from_dataset

Use the new from_dataset method to initialize objects in examples.
yeatmanlab · Jun 28, 2022 · ec514b0 · ec514b0
2 parents 17db479 + 18df3f9
commit ec514b0
Show file tree

Hide file tree

Showing 3 changed files with 72 additions and 40 deletions.
diff --git a/doc/index.rst b/doc/index.rst
@@ -3,6 +3,13 @@ AFQ-Insight: Statistical learning for tractometry data
 
 AFQ-Insight is a Python library for statistical learning of tractometry data.
 
+Tractometry assesses the tissue properties of the major white matter connections
+between different brain regions. AFQ-Insight inter-operates with the results of
+tractometry produced by the `pyAFQ <https://yeatmanlab.github.io/pyAFQ>`_
+software library. However, you can also use the output of other tractometry
+pipelines if you convert them into the format produced by pyAFQ.
+
+
 .. toctree::
    :maxdepth: 3
    :hidden:
@@ -46,10 +53,12 @@ reference and a bibtex entry.
 Acknowledgements
 ----------------
 
-*AFQ-Insight* development is supported through a grant from the `Gordon and Betty
+*AFQ-Insight* development was supported through a grant from the `Gordon and Betty
 Moore Foundation <https://www.moore.org/>`_ and from the `Alfred P. Sloan
-Foundation <https://sloan.org/>`_ to the `University of Washington eScience
-Institute <http://escience.washington.edu/>`_, as well as NIH Collaborative
-Research in Computational Neuroscience grant R01EB027585-01 through the National
-Institute of Biomedical Imaging and Bioengineering to Eleftherios Garyfallidis
-(Indiana University) and Ariel Rokem (University of Washington).
+Foundation <https://sloan.org/>`_ to the
+`University of Washington eScience Institute <http://escience.washington.edu/>`_,
+NIH Collaborative Research in Computational Neuroscience grant R01EB027585-01
+through the National Institute of Biomedical Imaging and Bioengineering to
+Eleftherios Garyfallidis (Indiana University) and Ariel Rokem (University of
+Washington) and NIH grant RF1MH121868 (PIs: Ariel Rokem, Jason Yeatman and Noah
+Simon) from the National Institute of Mental Health and the BRAIN Initiative Informatics program.
diff --git a/examples/plot_als_classification.py b/examples/plot_als_classification.py
@@ -27,37 +27,48 @@
 """
 import matplotlib.pyplot as plt
 import numpy as np
-import os.path as op
 
-from afqinsight.datasets import download_sarica, load_afq_data
+from afqinsight import AFQDataset
 from afqinsight import make_afq_classifier_pipeline
 
 from groupyr.decomposition import GroupPCA
 
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import cross_validate
 
-workdir = download_sarica()
 
-afqdata = load_afq_data(
-    fn_nodes=op.join(workdir, "nodes.csv"),
-    fn_subjects=op.join(workdir, "subjects.csv"),
-    dwi_metrics=["md", "fa"],
-    target_cols=["class"],
-    label_encode_cols=["class"],
-)
+#############################################################################
+# Fetch data from Sarica et al.
+# -----------------------------
+# As a shortcut, we have incorporated a few studies into the software. In these
+# cases, a :class:`AFQDataset` class instance can be initialized using the
+# :func:`AFQDataset.from_study` static method. This expects the name of one of
+# the studies that are supported (see the method documentation for the list of
+# these studies). By passing ``"sarica"``, we request that the software download
+# the data from this study and initialize an object for us from this data.
+
+
+afqdata = AFQDataset.from_study("sarica")
+
+# Examine the data
+# ----------------
+# ``afqdata`` is an ``AFQDataset`` object, with properties corresponding to the tractometry features and phenotypic targets.
 
-# afqdata is a namedtuple. You can access it's fields using dot notation or by
-# unpacking the tuple. To see all of the available fields use `afqdata._fields`
 X = afqdata.X
 y = afqdata.y
 groups = afqdata.groups
 feature_names = afqdata.feature_names
 group_names = afqdata.group_names
 subjects = afqdata.subjects
 
-# Here we reduce computation time by taking the first 10 principal components of each feature group and performing SGL logistic regression on those components.
-# If you want to train an SGL model without group PCA, set ``do_group_pca = False``. This will increase the number of features by an order of magnitude and slow down execution time.
+# Reduce data dimensionality
+# --------------------------
+# Here we reduce computation time by taking the first 10 principal components of
+# each feature group and performing SGL logistic regression on those components.
+# If you want to train an SGL model without group PCA, set ``do_group_pca =
+# False``. This will increase the number of features by an order of magnitude
+# and slow down execution time.
+
 do_group_pca = True
 
 if do_group_pca:
@@ -76,6 +87,13 @@
     transformer = False
     transformer_kwargs = None
 
+
+# Create the classification pipeline
+# ----------------------------------
+# The core computational machinery is a pipeline. These operate as scikit-learn
+# compatible pipelines, so we can pass them to scikit-learn functions.
+# There are many options that need to be set to configure the pipeline object.
+
 pipe = make_afq_classifier_pipeline(
     imputer_kwargs={"strategy": "median"},  # Use median imputation
     use_cv_estimator=True,  # Automatically determine the best hyperparameters
@@ -95,11 +113,18 @@
     tol=1e-2,  # Set a lenient convergence tolerance just for this example
 )
 
-# ``pipe`` is a scikit-learn pipeline and can be used in other scikit-learn functions
+# Fit and cross-validate
+# ----------------------
+# The ``pipe`` object is a scikit-learn pipeline and can be used in other
+# scikit-learn functions.
+
 scores = cross_validate(
     pipe, X, y, cv=5, return_train_score=True, return_estimator=True
 )
 
+# Display results
+# ---------------
+
 print(f"Mean train score: {np.mean(scores['train_score']):5.3f}")
 print(f"Mean test score:  {np.mean(scores['test_score']):5.3f}")
 print(f"Mean fit time:    {np.mean(scores['fit_time']):5.2f}s")

diff --git a/examples/plot_hbn_site_profiles.py b/examples/plot_hbn_site_profiles.py
@@ -44,32 +44,30 @@
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import train_test_split
 
-##########################################################################
+#############################################################################
 # Fetch the HBN data
 # ------------------
-#
-# The :func:`AFQDataset.from_files` static method expects a path to
-# nodes.csv and subjects.csv files, but these file paths can be remote
-# URLs or AWS S3 URIs. We'll use S3 URIs to grab the HBN data. After dropping
-# participants with null phenotypic values, it has 1,867 participants.
-
-dataset = AFQDataset.from_files(
-    fn_nodes="s3://fcp-indi/data/Projects/HBN/BIDS_curated/derivatives/afq/combined_tract_profiles.csv",
-    fn_subjects="s3://fcp-indi/data/Projects/HBN/BIDS_curated/derivatives/qsiprep/participants.tsv",
-    dwi_metrics=["dki_fa", "dki_md"],
-    target_cols=["age", "sex", "scan_site_id"],
-    label_encode_cols=["sex", "scan_site_id"],
-    index_col="subject_id",
-)
+# As a shortcut, we have incorporated a few studies into the software. In these
+# cases, a :class:`AFQDataset` class instance can be initialized using the
+# :func:`AFQDataset.from_study` static method. This expects the name of one of
+# the studies that are supported (see the method documentation for the list of
+# these studies). By passing ``"hbn"``, we request that the object download the
+# HBN dataset from the AWS Open Data program where it has been stored and
+# initialize the objects with the subjects and nodes information. Subjects' age
+# is set as the target variable. After dropping subjects that don't have their
+# age recorded, there are 1867 subjects in the dataset.
+
+
+dataset = AFQDataset.from_study("hbn")
 dataset.drop_target_na()
 print(dataset)
 
-##########################################################################
+#############################################################################
 # Train / test split
 # ------------------
 #
-# We can use the dataset in the :func:`train_test_split` function just as we
-# would with an array.
+# We can pass the :class:`AFQDataset` class instance to scikit-learn's
+# :func:`train_test_split` function, just as we would with an array.
 
 dataset_train, dataset_test = train_test_split(dataset, test_size=0.5)
 
@@ -111,7 +109,7 @@
 #
 # N.B. We use the excellent `neurocombat_sklearn
 # <https://github.com/Warvito/neurocombat_sklearn>`_ package to apply ComBat to
-# our data. We love this library, however it is not fully compliant with the
+# our data. We love this library; however, it is not fully compliant with the
 # scikit-learn transformer API, so we cannot use the
 # :func:`AFQDataset.model_fit_transform` method to apply this transformer to our
 # dataset. No problem! We can simply copy the unharmonized dataset into a new