From bcc7d0b68e04691fb3fc9c608798d30b60e9248a Mon Sep 17 00:00:00 2001
From: Ariel Rokem
Date: Thu, 23 Jun 2022 23:45:42 +0100
Subject: [PATCH 1/8] DOC: Use the new from_dataset method to initialize object with HBN.

Follow up on #121.
---
 examples/plot_hbn_site_profiles.py | 38 ++++++++++++++----------------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/examples/plot_hbn_site_profiles.py b/examples/plot_hbn_site_profiles.py
index 3d816b08..abd0900c 100644
--- a/examples/plot_hbn_site_profiles.py
+++ b/examples/plot_hbn_site_profiles.py
@@ -44,32 +44,30 @@
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import train_test_split

-##########################################################################
-# Fetch the HBN data
+########################################################################## Fetch
+# the HBN data
 # ------------------
 #
-# The :func:`AFQDataset.from_files` static method expects a path to
-# nodes.csv and subjects.csv files, but these file paths can be remote
-# URLs or AWS S3 URIs. We'll use S3 URIs to grab the HBN data. After dropping
-# participants with null phenotypic values, it has 1,867 participants.
-
-dataset = AFQDataset.from_files(
-    fn_nodes="s3://fcp-indi/data/Projects/HBN/BIDS_curated/derivatives/afq/combined_tract_profiles.csv",
-    fn_subjects="s3://fcp-indi/data/Projects/HBN/BIDS_curated/derivatives/qsiprep/participants.tsv",
-    dwi_metrics=["dki_fa", "dki_md"],
-    target_cols=["age", "sex", "scan_site_id"],
-    label_encode_cols=["sex", "scan_site_id"],
-    index_col="subject_id",
-)
+# For specific cases, an :class:`AFQDataset` class instance can be initialized
+# using the :func:`AFQDataset.from_study` static method. This expects the name
+# of one of the datasets supported (see the method documentation for the list of
+# these datasets). By passing `"hbn"`, we request that the object download the
+# HBN dataset from the AWS Open Data program where it has been stored and
+# initialize the objects with the subjects and nodes information. Subjects' age
+# is set as the target variable. After dropping subjects that don't have their
+# age recorded, there are 1867 subjects in the dataset.
+
+
+dataset = AFQDataset.from_study("hbn")
 dataset.drop_target_na()
 print(dataset)

-##########################################################################
-# Train / test split
+########################################################################## Train
+# / test split
 # ------------------
 #
-# We can use the dataset in the :func:`train_test_split` function just as we
-# would with an array.
+# We can pass the :class:`AFQDataset` class instance to Sckit Learn's
+# :func:`train_test_split` function, just as we would with an array.

 dataset_train, dataset_test = train_test_split(dataset, test_size=0.5)
@@ -111,7 +109,7 @@
 #
 # N.B. We use the excellent `neurocombat_sklearn
 # `_ package to apply ComBat to
-# our data. We love this library, however it is not fully compliant with the
+# our data. We love this library, however, it is not fully compliant with the
 # scikit-learn transformer API, so we cannot use the
 # :func:`AFQDataset.model_fit_transform` method to apply this transformer to our
 # dataset. No problem! We can simply copy the unharmonized dataset into a new

From b9ed97e28118e18e7ca9529918826b202383f6ca Mon Sep 17 00:00:00 2001
From: Ariel Rokem
Date: Mon, 27 Jun 2022 13:17:07 -0700
Subject: [PATCH 2/8] Use the same reference to sklearn as in the sklearn docs

Co-authored-by: Adam Richie-Halford
---
 examples/plot_hbn_site_profiles.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/plot_hbn_site_profiles.py b/examples/plot_hbn_site_profiles.py
index abd0900c..9651f8b3 100644
--- a/examples/plot_hbn_site_profiles.py
+++ b/examples/plot_hbn_site_profiles.py
@@ -66,7 +66,7 @@
 # / test split
 # ------------------
 #
-# We can pass the :class:`AFQDataset` class instance to Sckit Learn's
+# We can pass the :class:`AFQDataset` class instance to scikit-learn's
 # :func:`train_test_split` function, just as we would with an array.

 dataset_train, dataset_test = train_test_split(dataset, test_size=0.5)

From 900f0bb116939749a1ce3b8ff01db7dc6b72124c Mon Sep 17 00:00:00 2001
From: Ariel Rokem
Date: Mon, 27 Jun 2022 13:24:11 -0700
Subject: [PATCH 3/8] Update documentation front page to include a link to pyAFQ.

Also, updated funding information.
---
 doc/index.rst | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/doc/index.rst b/doc/index.rst
index 0311c7a0..6a18ea02 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -3,6 +3,13 @@ AFQ-Insight: Statistical learning for tractometry data

 AFQ-Insight is a Python library for statistical learning of tractometry data.

+Tractometry assesses the tissue properties of the major white matter connections
+between different brain regions. AFQ-Insight inter-operates with the results of
+tractometry produced by the `pyAFQ `_
+software library. However, you can also use the output of other tractometry
+pipelines if you convert them into the format produced by pyAFQ.
+
+
 .. toctree::
    :maxdepth: 3
    :hidden:
@@ -46,10 +53,12 @@ reference and a bibtex entry.

 Acknowledgements
 ----------------
-*AFQ-Insight* development is supported through a grant from the `Gordon and Betty
+*AFQ-Insight* development was supported through a grant from the `Gordon and Betty
 Moore Foundation `_ and from the `Alfred P. Sloan
-Foundation `_ to the `University of Washington eScience
-Institute `_, as well as NIH Collaborative
-Research in Computational Neuroscience grant R01EB027585-01 through the National
-Institute of Biomedical Imaging and Bioengineering to Eleftherios Garyfallidis
-(Indiana University) and Ariel Rokem (University of Washington).
+Foundation `_ to the
+`University of Washington eScience Institute `_,
+NIH Collaborative Research in Computational Neuroscience grant R01EB027585-01
+through the National Institute of Biomedical Imaging and Bioengineering to
+Eleftherios Garyfallidis (Indiana University) and Ariel Rokem (University of
+Washington) and NIH grant RF1MH121868 (PIs: Ariel Rokem, Jason Yeatman and Noah
+Simon) from the National Institute of Mental Health and the BRAIN Initiative Informatics program.

From f76044a5254031b913104f45beacbc309aac7f61 Mon Sep 17 00:00:00 2001
From: Ariel Rokem
Date: Mon, 27 Jun 2022 13:56:37 -0700
Subject: [PATCH 4/8] Fixes line breaks in example.

---
 examples/plot_hbn_site_profiles.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/examples/plot_hbn_site_profiles.py b/examples/plot_hbn_site_profiles.py
index 9651f8b3..3a5848ff 100644
--- a/examples/plot_hbn_site_profiles.py
+++ b/examples/plot_hbn_site_profiles.py
@@ -44,26 +44,26 @@
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import train_test_split

-########################################################################## Fetch
-# the HBN data
+#############################################################################
+# Fetch the HBN data
 # ------------------
-#
-# For specific cases, an :class:`AFQDataset` class instance can be initialized
-# using the :func:`AFQDataset.from_study` static method. This expects the name
-# of one of the datasets supported (see the method documentation for the list of
-# these datasets). By passing `"hbn"`, we request that the object download the
-# HBN dataset from the AWS Open Data program where it has been stored and
-# initialize the objects with the subjects and nodes information. Subjects' age
-# is set as the target variable. After dropping subjects that don't have their
-# age recorded, there are 1867 subjects in the dataset.
+# For datasets that are part of our example suite, a :class:`AFQDataset` class
+# instance can be initialized using the :func:`AFQDataset.from_study` static
+# method. This expects the name of one of the datasets supported (see the method
+# documentation for the list of these datasets). By passing `"hbn"`, we request
+# that the object download the HBN dataset from the AWS Open Data program where
+# it has been stored and initialize the objects with the subjects and nodes
+# information. Subjects' age is set as the target variable. After dropping
+# subjects that don't have their age recorded, there are 1867 subjects in the
+# dataset.

 dataset = AFQDataset.from_study("hbn")
 dataset.drop_target_na()
 print(dataset)

-########################################################################## Train
-# / test split
+#############################################################################
+# Train / test split
 # ------------------
 #
 # We can pass the :class:`AFQDataset` class instance to scikit-learn's

From 744f88c22801390adf3b1a80d433886107057806 Mon Sep 17 00:00:00 2001
From: Ariel Rokem
Date: Mon, 27 Jun 2022 14:56:57 -0700
Subject: [PATCH 5/8] Refactors ALS classification example to use from_dataset.

---
 examples/plot_als_classification.py | 51 ++++++++++++++++++++++-------
 examples/plot_hbn_site_profiles.py  | 18 +++++-----
 2 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/examples/plot_als_classification.py b/examples/plot_als_classification.py
index 2b7faeb5..99c7e326 100644
--- a/examples/plot_als_classification.py
+++ b/examples/plot_als_classification.py
@@ -37,18 +37,25 @@
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import cross_validate

-workdir = download_sarica()
-
-afqdata = load_afq_data(
-    fn_nodes=op.join(workdir, "nodes.csv"),
-    fn_subjects=op.join(workdir, "subjects.csv"),
-    dwi_metrics=["md", "fa"],
-    target_cols=["class"],
-    label_encode_cols=["class"],
-)
+#############################################################################
+# Fetch data from Sarica et al.
+# -----------------------------
+# As a shortcut, we have incorporated a few studies into the software. In these
+# cases, a :class:`AFQDataset` class instance can be initialized using the
+# :func:`AFQDataset.from_study` static method. This expects the name of one of
+# the studies that are supported (see the method documentation for the list of
+# these studies). By passing `"sarica"`, we request that the software download
+# the data from this study and initialize an object for us from this data.
+
+
+afqdata = AFQDataset.from_study("sarica")
+
+# Examine the data
+# ----------------
 # afqdata is a namedtuple. You can access it's fields using dot notation or by
 # unpacking the tuple. To see all of the available fields use `afqdata._fields`
+
 X = afqdata.X
 y = afqdata.y
 groups = afqdata.groups
 feature_names = afqdata.feature_names
 group_names = afqdata.group_names
 subjects = afqdata.subjects

-# Here we reduce computation time by taking the first 10 principal components of each feature group and performing SGL logistic regression on those components.
-# If you want to train an SGL model without group PCA, set ``do_group_pca = False``. This will increase the number of features by an order of magnitude and slow down execution time.
+# Reduce data dimensionality
+# --------------------------
+# Here we reduce computation time by taking the first 10 principal components of
+# each feature group and performing SGL logistic regression on those components.
+# If you want to train an SGL model without group PCA, set ``do_group_pca =
+# False``. This will increase the number of features by an order of magnitude
+# and slow down execution time.
+
 do_group_pca = True

 if do_group_pca:
     transformer = GroupPCA
     transformer_kwargs = {"n_components": 10}
 else:
     transformer = False
     transformer_kwargs = None
+
+
+# Create the classification pipeline
+# ----------------------------------
+# The core computational machinery is a pipeline. These operate as scikit-learn
+# compatible pipelines, so we can pass them to scikit-learn functions.
+# There are many options that need to be set to configure the pipeline object.
+
 pipe = make_afq_classifier_pipeline(
     imputer_kwargs={"strategy": "median"},  # Use median imputation
     use_cv_estimator=True,  # Automatically determine the best hyperparameters
     feature_transformer=transformer,  # See note above about group PCA
     feature_transformer_kwargs=transformer_kwargs,
     scaler="standard",  # Standard scale the features before regression
     groups=groups,
     verbose=0,  # Be quiet!
     pipeline_verbosity=False,  # No really, be quiet!
     tuning_strategy="bayes",  # Use BayesSearchCV to determine optimal hyperparameters
     n_bayes_iter=20,  # Consider only this many points in hyperparameter space
     cv=3,  # Use three CV splits to evaluate each hyperparameter combination
     l1_ratio=[0.0, 1.0],  # Explore the entire range of ``l1_ratio``
     eps=5e-2,  # This is the ratio of the smallest to largest ``alpha`` value
     tol=1e-2,  # Set a lenient convergence tolerance just for this example
 )

-# ``pipe`` is a scikit-learn pipeline and can be used in other scikit-learn functions
+# Fit and cross-validate
+# ----------------------
+# The ``pipe`` object is a scikit-learn pipeline and can be used in other
+# scikit-learn functions
+
 scores = cross_validate(
     pipe, X, y, cv=5, return_train_score=True, return_estimator=True
 )

+# Display results
+# ---------------
+
 print(f"Mean train score: {np.mean(scores['train_score']):5.3f}")
 print(f"Mean test score:  {np.mean(scores['test_score']):5.3f}")
 print(f"Mean fit time:    {np.mean(scores['fit_time']):5.2f}s")
diff --git a/examples/plot_hbn_site_profiles.py b/examples/plot_hbn_site_profiles.py
index 3a5848ff..fb7d3366 100644
--- a/examples/plot_hbn_site_profiles.py
+++ b/examples/plot_hbn_site_profiles.py
@@ -47,15 +47,15 @@
 #############################################################################
 # Fetch the HBN data
 # ------------------
-# For datasets that are part of our example suite, a :class:`AFQDataset` class
-# instance can be initialized using the :func:`AFQDataset.from_study` static
-# method. This expects the name of one of the datasets supported (see the method
-# documentation for the list of these datasets). By passing `"hbn"`, we request
-# that the object download the HBN dataset from the AWS Open Data program where
-# it has been stored and initialize the objects with the subjects and nodes
-# information. Subjects' age is set as the target variable. After dropping
-# subjects that don't have their age recorded, there are 1867 subjects in the
-# dataset.
+# As a shortcut, we have incorporated a few studies into the software. In these
+# cases, a :class:`AFQDataset` class instance can be initialized using the
+# :func:`AFQDataset.from_study` static method. This expects the name of one of
+# the studies that are supported (see the method documentation for the list of
+# these studies). By passing `"hbn"`, we request that the object download the
+# HBN dataset from the AWS Open Data program where it has been stored and
+# initialize the objects with the subjects and nodes information. Subjects' age
+# is set as the target variable. After dropping subjects that don't have their
+# age recorded, there are 1867 subjects in the dataset.

 dataset = AFQDataset.from_study("hbn")

From e3f85fb50e32986e082a5cd9a9c040c170720797 Mon Sep 17 00:00:00 2001
From: Ariel Rokem
Date: Mon, 27 Jun 2022 19:58:04 -0700
Subject: [PATCH 6/8] Remove unused imports.

---
 examples/plot_als_classification.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/examples/plot_als_classification.py b/examples/plot_als_classification.py
index 99c7e326..2710a733 100644
--- a/examples/plot_als_classification.py
+++ b/examples/plot_als_classification.py
@@ -27,9 +27,7 @@
 """
 import matplotlib.pyplot as plt
 import numpy as np
-import os.path as op
-from afqinsight.datasets import download_sarica, load_afq_data
 from afqinsight import make_afq_classifier_pipeline
 from groupyr.decomposition import GroupPCA

From 2f5dee228073eaa09ab31fcd9cf9df045647620f Mon Sep 17 00:00:00 2001
From: Ariel Rokem
Date: Mon, 27 Jun 2022 20:02:56 -0700
Subject: [PATCH 7/8] Import needed object.

---
 examples/plot_als_classification.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/plot_als_classification.py b/examples/plot_als_classification.py
index 2710a733..ce6a3d63 100644
--- a/examples/plot_als_classification.py
+++ b/examples/plot_als_classification.py
@@ -28,6 +28,7 @@
 import matplotlib.pyplot as plt
 import numpy as np

+from afqinsight import AFQDataset
 from afqinsight import make_afq_classifier_pipeline
 from groupyr.decomposition import GroupPCA

From 18df3f9a41a98251099f3145eb0acbcf2f1f7234 Mon Sep 17 00:00:00 2001
From: Ariel Rokem
Date: Mon, 27 Jun 2022 20:50:13 -0700
Subject: [PATCH 8/8] Update description "namedtuple" => "AFQDataset"

Co-authored-by: Adam Richie-Halford
---
 examples/plot_als_classification.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/plot_als_classification.py b/examples/plot_als_classification.py
index ce6a3d63..34c7dc20 100644
--- a/examples/plot_als_classification.py
+++ b/examples/plot_als_classification.py
@@ -52,8 +52,7 @@

 # Examine the data
 # ----------------
-# afqdata is a namedtuple. You can access it's fields using dot notation or by
-# unpacking the tuple. To see all of the available fields use `afqdata._fields`
+# ``afqdata`` is an ``AFQDataset`` object, with properties corresponding to the tractometry features and phenotypic targets.

 X = afqdata.X
 y = afqdata.y
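
The end state that this series converges on can be illustrated with a short usage sketch. The sketch is not part of any patch above: it simply recombines pieces already shown in the diffs (``AFQDataset.from_study``, the ``X``/``y``/``groups`` attributes, ``make_afq_classifier_pipeline``, and scikit-learn's ``cross_validate``), and it assumes a version of afqinsight that provides ``AFQDataset.from_study`` plus network access to download the "sarica" study data.

# Usage sketch (illustrative only; assumes afqinsight provides
# AFQDataset.from_study and that the "sarica" study can be downloaded).
import numpy as np
from afqinsight import AFQDataset, make_afq_classifier_pipeline
from sklearn.model_selection import cross_validate

# Download (or load from a local cache) one of the bundled studies.
afqdata = AFQDataset.from_study("sarica")

# Tractometry features and phenotypic targets are exposed as attributes.
X = afqdata.X
y = afqdata.y

# Build the classifier pipeline with a subset of the options shown in patch 5
# and evaluate it with scikit-learn's cross-validation utilities.
pipe = make_afq_classifier_pipeline(
    imputer_kwargs={"strategy": "median"},
    use_cv_estimator=True,
    groups=afqdata.groups,
    tuning_strategy="bayes",
    n_bayes_iter=20,
    cv=3,
    tol=1e-2,
)
scores = cross_validate(pipe, X, y, cv=5, return_train_score=True)
print(f"Mean test score: {np.mean(scores['test_score']):.3f}")

Either study name used in the series ("sarica" or "hbn") works the same way; the HBN example additionally calls ``dataset.drop_target_na()`` before splitting the data.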