From bcc7d0b68e04691fb3fc9c608798d30b60e9248a Mon Sep 17 00:00:00 2001
From: Ariel Rokem
Date: Thu, 23 Jun 2022 23:45:42 +0100
Subject: [PATCH 1/8] DOC: Use the new from_dataset method to initialize object with HBN.

Follow up on #121.
---
 examples/plot_hbn_site_profiles.py | 38 ++++++++++++++----------------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/examples/plot_hbn_site_profiles.py b/examples/plot_hbn_site_profiles.py
index 3d816b08..abd0900c 100644
--- a/examples/plot_hbn_site_profiles.py
+++ b/examples/plot_hbn_site_profiles.py
@@ -44,32 +44,30 @@
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import train_test_split

-##########################################################################
-# Fetch the HBN data
+########################################################################## Fetch
+# the HBN data
 # ------------------
 #
-# The :func:`AFQDataset.from_files` static method expects a path to
-# nodes.csv and subjects.csv files, but these file paths can be remote
-# URLs or AWS S3 URIs. We'll use S3 URIs to grab the HBN data. After dropping
-# participants with null phenotypic values, it has 1,867 participants.
-
-dataset = AFQDataset.from_files(
-    fn_nodes="s3://fcp-indi/data/Projects/HBN/BIDS_curated/derivatives/afq/combined_tract_profiles.csv",
-    fn_subjects="s3://fcp-indi/data/Projects/HBN/BIDS_curated/derivatives/qsiprep/participants.tsv",
-    dwi_metrics=["dki_fa", "dki_md"],
-    target_cols=["age", "sex", "scan_site_id"],
-    label_encode_cols=["sex", "scan_site_id"],
-    index_col="subject_id",
-)
+# For specific cases, an :class:`AFQDataset` class instance can be initialized
+# using the :func:`AFQDataset.from_study` static method. This expects the name
+# of one of the datasets supported (see the method documentation for the list of
+# these datasets). By passing `"hbn"`, we request that the object download the
+# HBN dataset from the AWS Open Data program where it has been stored and
+# initialize the objects with the subjects and nodes information. Subjects' age
+# is set as the target variable. After dropping subjects that don't have their
+# age recorded, there are 1867 subjects in the dataset.
+
+
+dataset = AFQDataset.from_study("hbn")
 dataset.drop_target_na()
 print(dataset)

-##########################################################################
-# Train / test split
+########################################################################## Train
+# / test split
 # ------------------
 #
-# We can use the dataset in the :func:`train_test_split` function just as we
-# would with an array.
+# We can pass the :class:`AFQDataset` class instance to Sckit Learn's
+# :func:`train_test_split` function, just as we would with an array.

 dataset_train, dataset_test = train_test_split(dataset, test_size=0.5)
@@ -111,7 +109,7 @@
 #
 # N.B. We use the excellent `neurocombat_sklearn
 # `_ package to apply ComBat to
-# our data. We love this library, however it is not fully compliant with the
+# our data. We love this library, however, it is not fully compliant with the
 # scikit-learn transformer API, so we cannot use the
 # :func:`AFQDataset.model_fit_transform` method to apply this transformer to our
 # dataset. No problem! We can simply copy the unharmonized dataset into a new

From b9ed97e28118e18e7ca9529918826b202383f6ca Mon Sep 17 00:00:00 2001
From: Ariel Rokem
Date: Mon, 27 Jun 2022 13:17:07 -0700
Subject: [PATCH 2/8] Use the same reference to sklearn as in the sklearn docs

Co-authored-by: Adam Richie-Halford
---
 examples/plot_hbn_site_profiles.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/plot_hbn_site_profiles.py b/examples/plot_hbn_site_profiles.py
index abd0900c..9651f8b3 100644
--- a/examples/plot_hbn_site_profiles.py
+++ b/examples/plot_hbn_site_profiles.py
@@ -66,7 +66,7 @@
 # / test split
 # ------------------
 #
-# We can pass the :class:`AFQDataset` class instance to Sckit Learn's
+# We can pass the :class:`AFQDataset` class instance to scikit-learn's
 # :func:`train_test_split` function, just as we would with an array.

 dataset_train, dataset_test = train_test_split(dataset, test_size=0.5)

From 900f0bb116939749a1ce3b8ff01db7dc6b72124c Mon Sep 17 00:00:00 2001
From: Ariel Rokem
Date: Mon, 27 Jun 2022 13:24:11 -0700
Subject: [PATCH 3/8] Update documentation front page to include a link to pyAFQ.

Also, updated funding information.
---
 doc/index.rst | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/doc/index.rst b/doc/index.rst
index 0311c7a0..6a18ea02 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -3,6 +3,13 @@ AFQ-Insight: Statistical learning for tractometry data

 AFQ-Insight is a Python library for statistical learning of tractometry data.

+Tractometry assesses the tissue properties of the major white matter connections
+between different brain regions. AFQ-Insight inter-operates with the results of
+tractometry produced by the `pyAFQ `_
+software library. However, you can also use the output of other tractometry
+pipelines if you convert them into the format produced by pyAFQ.
+
+
 .. toctree::
    :maxdepth: 3
    :hidden:
@@ -46,10 +53,12 @@ reference and a bibtex entry.

 Acknowledgements
 ----------------
-*AFQ-Insight* development is supported through a grant from the `Gordon and Betty
+*AFQ-Insight* development was supported through a grant from the `Gordon and Betty
 Moore Foundation `_ and from the `Alfred P. Sloan
-Foundation `_ to the `University of Washington eScience
-Institute `_, as well as NIH Collaborative
-Research in Computational Neuroscience grant R01EB027585-01 through the National
-Institute of Biomedical Imaging and Bioengineering to Eleftherios Garyfallidis
-(Indiana University) and Ariel Rokem (University of Washington).
+Foundation `_ to the
+`University of Washington eScience Institute `_,
+NIH Collaborative Research in Computational Neuroscience grant R01EB027585-01
+through the National Institute of Biomedical Imaging and Bioengineering to
+Eleftherios Garyfallidis (Indiana University) and Ariel Rokem (University of
+Washington) and NIH grant RF1MH121868 (PIs: Ariel Rokem, Jason Yeatman and Noah
+Simon) from the National Institute of Mental Health and the BRAIN Initiative Informatics program.

From f76044a5254031b913104f45beacbc309aac7f61 Mon Sep 17 00:00:00 2001
From: Ariel Rokem
Date: Mon, 27 Jun 2022 13:56:37 -0700
Subject: [PATCH 4/8] Fixes line breaks in example.

---
 examples/plot_hbn_site_profiles.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/examples/plot_hbn_site_profiles.py b/examples/plot_hbn_site_profiles.py
index 9651f8b3..3a5848ff 100644
--- a/examples/plot_hbn_site_profiles.py
+++ b/examples/plot_hbn_site_profiles.py
@@ -44,26 +44,26 @@
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import train_test_split

-########################################################################## Fetch
-# the HBN data
+#############################################################################
+# Fetch the HBN data
 # ------------------
-#
-# For specific cases, an :class:`AFQDataset` class instance can be initialized
-# using the :func:`AFQDataset.from_study` static method. This expects the name
-# of one of the datasets supported (see the method documentation for the list of
-# these datasets). By passing `"hbn"`, we request that the object download the
-# HBN dataset from the AWS Open Data program where it has been stored and
-# initialize the objects with the subjects and nodes information. Subjects' age
-# is set as the target variable. After dropping subjects that don't have their
-# age recorded, there are 1867 subjects in the dataset.
+# For datasets that are part of our example suite, a :class:`AFQDataset` class
+# instance can be initialized using the :func:`AFQDataset.from_study` static
+# method. This expects the name of one of the datasets supported (see the method
+# documentation for the list of these datasets). By passing `"hbn"`, we request
+# that the object download the HBN dataset from the AWS Open Data program where
+# it has been stored and initialize the objects with the subjects and nodes
+# information. Subjects' age is set as the target variable. After dropping
+# subjects that don't have their age recorded, there are 1867 subjects in the
+# dataset.

 dataset = AFQDataset.from_study("hbn")
 dataset.drop_target_na()
 print(dataset)

-########################################################################## Train
-# / test split
+#############################################################################
+# Train / test split
 # ------------------
 #
 # We can pass the :class:`AFQDataset` class instance to scikit-learn's

From 744f88c22801390adf3b1a80d433886107057806 Mon Sep 17 00:00:00 2001
From: Ariel Rokem
Date: Mon, 27 Jun 2022 14:56:57 -0700
Subject: [PATCH 5/8] Refactors ALS classification example to use from_dataset.

---
 examples/plot_als_classification.py | 51 ++++++++++++++++++++++-------
 examples/plot_hbn_site_profiles.py  | 18 +++++-----
 2 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/examples/plot_als_classification.py b/examples/plot_als_classification.py
index 2b7faeb5..99c7e326 100644
--- a/examples/plot_als_classification.py
+++ b/examples/plot_als_classification.py
@@ -37,18 +37,25 @@
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import cross_validate

-workdir = download_sarica()
-
-afqdata = load_afq_data(
-    fn_nodes=op.join(workdir, "nodes.csv"),
-    fn_subjects=op.join(workdir, "subjects.csv"),
-    dwi_metrics=["md", "fa"],
-    target_cols=["class"],
-    label_encode_cols=["class"],
-)
+#############################################################################
+# Fetch data from Sarica et al.
+# -----------------------------
+# As a shortcut, we have incorporated a few studies into the software. In these
+# cases, a :class:`AFQDataset` class instance can be initialized using the
+# :func:`AFQDataset.from_study` static method. This expects the name of one of
+# the studies that are supported (see the method documentation for the list of
+# these studies). By passing `"sarica"`, we request that the software download
+# the data from this study and initialize an object for us from this data.
+
+
+afqdata = AFQDataset.from_study("sarica")
+
+# Examine the data
+# ----------------
 # afqdata is a namedtuple. You can access it's fields using dot notation or by
 # unpacking the tuple. To see all of the available fields use `afqdata._fields`
+
 X = afqdata.X
 y = afqdata.y
 groups = afqdata.groups
 feature_names = afqdata.feature_names
 group_names = afqdata.group_names
 subjects = afqdata.subjects

-# Here we reduce computation time by taking the first 10 principal components of each feature group and performing SGL logistic regression on those components.
-# If you want to train an SGL model without group PCA, set ``do_group_pca = False``. This will increase the number of features by an order of magnitude and slow down execution time.
+# Reduce data dimensionality
+# --------------------------
+# Here we reduce computation time by taking the first 10 principal components of
+# each feature group and performing SGL logistic regression on those components.
+# If you want to train an SGL model without group PCA, set ``do_group_pca =
+# False``. This will increase the number of features by an order of magnitude
+# and slow down execution time.
+
 do_group_pca = True

 if do_group_pca:
     transformer = GroupPCA
     transformer_kwargs = {"n_components": 10}
 else:
     transformer = False
     transformer_kwargs = None
+
+
+# Create the classification pipeline
+# ----------------------------------
+# The core computational machinery is a pipeline. These operate as scikit-learn
+# compatible pipelines, so we can pass them to scikit-learn functions.
+# There are many options that need to be set to configure the pipeline object.
+
 pipe = make_afq_classifier_pipeline(
     imputer_kwargs={"strategy": "median"},  # Use median imputation
     use_cv_estimator=True,  # Automatically determine the best hyperparameters
     feature_transformer=transformer,  # See note above about group PCA
     feature_transformer_kwargs=transformer_kwargs,
     scaler="standard",  # Standard scale the features before regression
     groups=groups,
     verbose=0,  # Be quiet!
     pipeline_verbosity=False,  # No really, be quiet!
     tuning_strategy="bayes",  # Use BayesSearchCV to determine optimal hyperparameters
     n_bayes_iter=20,  # Consider only this many points in hyperparameter space
     cv=3,  # Use three CV splits to evaluate each hyperparameter combination
     l1_ratio=[0.0, 1.0],  # Explore the entire range of ``l1_ratio``
     eps=5e-2,  # This is the ratio of the smallest to largest ``alpha`` value
     tol=1e-2,  # Set a lenient convergence tolerance just for this example
 )

-# ``pipe`` is a scikit-learn pipeline and can be used in other scikit-learn functions
+# Fit and cross-validate
+# ----------------------
+# The ``pipe`` object is a scikit-learn pipeline and can be used in other
+# scikit-learn functions
+
 scores = cross_validate(
     pipe, X, y, cv=5, return_train_score=True, return_estimator=True
 )

+# Display results
+# ---------------
+
 print(f"Mean train score: {np.mean(scores['train_score']):5.3f}")
 print(f"Mean test score:  {np.mean(scores['test_score']):5.3f}")
 print(f"Mean fit time:    {np.mean(scores['fit_time']):5.2f}s")
diff --git a/examples/plot_hbn_site_profiles.py b/examples/plot_hbn_site_profiles.py
index 3a5848ff..fb7d3366 100644
--- a/examples/plot_hbn_site_profiles.py
+++ b/examples/plot_hbn_site_profiles.py
@@ -47,15 +47,15 @@
 #############################################################################
 # Fetch the HBN data
 # ------------------
-# For datasets that are part of our example suite, a :class:`AFQDataset` class
-# instance can be initialized using the :func:`AFQDataset.from_study` static
-# method. This expects the name of one of the datasets supported (see the method
-# documentation for the list of these datasets). By passing `"hbn"`, we request
-# that the object download the HBN dataset from the AWS Open Data program where
-# it has been stored and initialize the objects with the subjects and nodes
-# information. Subjects' age is set as the target variable. After dropping
-# subjects that don't have their age recorded, there are 1867 subjects in the
-# dataset.
+# As a shortcut, we have incorporated a few studies into the software. In these
+# cases, a :class:`AFQDataset` class instance can be initialized using the
+# :func:`AFQDataset.from_study` static method. This expects the name of one of
+# the studies that are supported (see the method documentation for the list of
+# these studies). By passing `"hbn"`, we request that the object download the
+# HBN dataset from the AWS Open Data program where it has been stored and
+# initialize the objects with the subjects and nodes information. Subjects' age
+# is set as the target variable. After dropping subjects that don't have their
+# age recorded, there are 1867 subjects in the dataset.

 dataset = AFQDataset.from_study("hbn")

From e3f85fb50e32986e082a5cd9a9c040c170720797 Mon Sep 17 00:00:00 2001
From: Ariel Rokem
Date: Mon, 27 Jun 2022 19:58:04 -0700
Subject: [PATCH 6/8] Remove unused imports.

---
 examples/plot_als_classification.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/examples/plot_als_classification.py b/examples/plot_als_classification.py
index 99c7e326..2710a733 100644
--- a/examples/plot_als_classification.py
+++ b/examples/plot_als_classification.py
@@ -27,9 +27,7 @@
 """
 import matplotlib.pyplot as plt
 import numpy as np
-import os.path as op
-from afqinsight.datasets import download_sarica, load_afq_data
 from afqinsight import make_afq_classifier_pipeline
 from groupyr.decomposition import GroupPCA

From 2f5dee228073eaa09ab31fcd9cf9df045647620f Mon Sep 17 00:00:00 2001
From: Ariel Rokem
Date: Mon, 27 Jun 2022 20:02:56 -0700
Subject: [PATCH 7/8] Import needed object.

---
 examples/plot_als_classification.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/plot_als_classification.py b/examples/plot_als_classification.py
index 2710a733..ce6a3d63 100644
--- a/examples/plot_als_classification.py
+++ b/examples/plot_als_classification.py
@@ -28,6 +28,7 @@
 import matplotlib.pyplot as plt
 import numpy as np

+from afqinsight import AFQDataset
 from afqinsight import make_afq_classifier_pipeline
 from groupyr.decomposition import GroupPCA

From 18df3f9a41a98251099f3145eb0acbcf2f1f7234 Mon Sep 17 00:00:00 2001
From: Ariel Rokem
Date: Mon, 27 Jun 2022 20:50:13 -0700
Subject: [PATCH 8/8] Update description "namedtuple" => "AFQDataset"

Co-authored-by: Adam Richie-Halford
---
 examples/plot_als_classification.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/plot_als_classification.py b/examples/plot_als_classification.py
index ce6a3d63..34c7dc20 100644
--- a/examples/plot_als_classification.py
+++ b/examples/plot_als_classification.py
@@ -52,8 +52,7 @@

 # Examine the data
 # ----------------
-# afqdata is a namedtuple. You can access it's fields using dot notation or by
-# unpacking the tuple. To see all of the available fields use `afqdata._fields`
+# ``afqdata`` is an ``AFQDataset`` object, with properties corresponding to the tractometry features and phenotypic targets.

 X = afqdata.X
 y = afqdata.y
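
The end state that this series converges on can be illustrated with a short usage sketch. The sketch is not part of any patch above: it simply recombines pieces already shown in the diffs (``AFQDataset.from_study``, the ``X``/``y``/``groups`` attributes, ``make_afq_classifier_pipeline``, and scikit-learn's ``cross_validate``), and it assumes a version of afqinsight that provides ``AFQDataset.from_study`` plus network access to download the "sarica" study data.

# Usage sketch (illustrative only; assumes afqinsight provides
# AFQDataset.from_study and that the "sarica" study can be downloaded).
import numpy as np
from afqinsight import AFQDataset, make_afq_classifier_pipeline
from sklearn.model_selection import cross_validate

# Download (or load from a local cache) one of the bundled studies.
afqdata = AFQDataset.from_study("sarica")

# Tractometry features and phenotypic targets are exposed as attributes.
X = afqdata.X
y = afqdata.y

# Build the classifier pipeline with a subset of the options shown in patch 5
# and evaluate it with scikit-learn's cross-validation utilities.
pipe = make_afq_classifier_pipeline(
    imputer_kwargs={"strategy": "median"},
    use_cv_estimator=True,
    groups=afqdata.groups,
    tuning_strategy="bayes",
    n_bayes_iter=20,
    cv=3,
    tol=1e-2,
)
scores = cross_validate(pipe, X, y, cv=5, return_train_score=True)
print(f"Mean test score: {np.mean(scores['test_score']):.3f}")

Either study name used in the series ("sarica" or "hbn") works the same way; the HBN example additionally calls ``dataset.drop_target_na()`` before splitting the data.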