diff --git a/doc/index.rst b/doc/index.rst index 0311c7a0..6a18ea02 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -3,6 +3,13 @@ AFQ-Insight: Statistical learning for tractometry data AFQ-Insight is a Python library for statistical learning of tractometry data. +Tractometry assesses the tissue properties of the major white matter connections +between different brain regions. AFQ-Insight inter-operates with the results of +tractometry produced by the `pyAFQ `_ +software library. However, you can also use the output of other tractometry +pipelines if you convert them into the format produced by pyAFQ. + + .. toctree:: :maxdepth: 3 :hidden: @@ -46,10 +53,12 @@ reference and a bibtex entry. Acknowledgements ---------------- -*AFQ-Insight* development is supported through a grant from the `Gordon and Betty +*AFQ-Insight* development was supported through a grant from the `Gordon and Betty Moore Foundation `_ and from the `Alfred P. Sloan -Foundation `_ to the `University of Washington eScience -Institute `_, as well as NIH Collaborative -Research in Computational Neuroscience grant R01EB027585-01 through the National -Institute of Biomedical Imaging and Bioengineering to Eleftherios Garyfallidis -(Indiana University) and Ariel Rokem (University of Washington). +Foundation `_ to the +`University of Washington eScience Institute `_, +NIH Collaborative Research in Computational Neuroscience grant R01EB027585-01 +through the National Institute of Biomedical Imaging and Bioengineering to +Eleftherios Garyfallidis (Indiana University) and Ariel Rokem (University of +Washington) and NIH grant RF1MH121868 (PIs: Ariel Rokem, Jason Yeatman and Noah +Simon) from the National Institute of Mental Health and the BRAIN Initiative Informatics program. 
diff --git a/examples/plot_als_classification.py b/examples/plot_als_classification.py index 2b7faeb5..34c7dc20 100644 --- a/examples/plot_als_classification.py +++ b/examples/plot_als_classification.py @@ -27,9 +27,8 @@ """ import matplotlib.pyplot as plt import numpy as np -import os.path as op -from afqinsight.datasets import download_sarica, load_afq_data +from afqinsight import AFQDataset from afqinsight import make_afq_classifier_pipeline from groupyr.decomposition import GroupPCA @@ -37,18 +36,24 @@ from sklearn.impute import SimpleImputer from sklearn.model_selection import cross_validate -workdir = download_sarica() -afqdata = load_afq_data( - fn_nodes=op.join(workdir, "nodes.csv"), - fn_subjects=op.join(workdir, "subjects.csv"), - dwi_metrics=["md", "fa"], - target_cols=["class"], - label_encode_cols=["class"], -) +############################################################################# +# Fetch data from Sarica et al. +# ----------------------------- +# As a shortcut, we have incorporated a few studies into the software. In these +# cases, an :class:`AFQDataset` class instance can be initialized using the +# :func:`AFQDataset.from_study` static method. This expects the name of one of +# the studies that are supported (see the method documentation for the list of +# these studies). By passing ``"sarica"``, we request that the software download +# the data from this study and initialize an object for us from this data. + + +afqdata = AFQDataset.from_study("sarica") + +# Examine the data +# ---------------- +# ``afqdata`` is an ``AFQDataset`` object, with properties corresponding to the tractometry features and phenotypic targets. -# afqdata is a namedtuple. You can access it's fields using dot notation or by -# unpacking the tuple. 
To see all of the available fields use `afqdata._fields` X = afqdata.X y = afqdata.y groups = afqdata.groups @@ -56,8 +61,14 @@ group_names = afqdata.group_names subjects = afqdata.subjects -# Here we reduce computation time by taking the first 10 principal components of each feature group and performing SGL logistic regression on those components. -# If you want to train an SGL model without group PCA, set ``do_group_pca = False``. This will increase the number of features by an order of magnitude and slow down execution time. +# Reduce data dimensionality +# -------------------------- +# Here we reduce computation time by taking the first 10 principal components of +# each feature group and performing SGL logistic regression on those components. +# If you want to train an SGL model without group PCA, set ``do_group_pca = +# False``. This will increase the number of features by an order of magnitude +# and slow down execution time. + do_group_pca = True if do_group_pca: @@ -76,6 +87,13 @@ transformer = False transformer_kwargs = None + +# Create the classification pipeline +# ---------------------------------- +# The core computational machinery is a pipeline. It operates as a scikit-learn +# compatible pipeline, so we can pass it to scikit-learn functions. +# There are many options that need to be set to configure the pipeline object. 
+ pipe = make_afq_classifier_pipeline( imputer_kwargs={"strategy": "median"}, # Use median imputation use_cv_estimator=True, # Automatically determine the best hyperparameters @@ -95,11 +113,18 @@ tol=1e-2, # Set a lenient convergence tolerance just for this example ) -# ``pipe`` is a scikit-learn pipeline and can be used in other scikit-learn functions +# Fit and cross-validate +# ---------------------- +# The ``pipe`` object is a scikit-learn pipeline and can be used in other +# scikit-learn functions + scores = cross_validate( pipe, X, y, cv=5, return_train_score=True, return_estimator=True ) +# Display results +# --------------- + print(f"Mean train score: {np.mean(scores['train_score']):5.3f}") print(f"Mean test score: {np.mean(scores['test_score']):5.3f}") print(f"Mean fit time: {np.mean(scores['fit_time']):5.2f}s") diff --git a/examples/plot_hbn_site_profiles.py b/examples/plot_hbn_site_profiles.py index 3d816b08..fb7d3366 100644 --- a/examples/plot_hbn_site_profiles.py +++ b/examples/plot_hbn_site_profiles.py @@ -44,32 +44,30 @@ from sklearn.impute import SimpleImputer from sklearn.model_selection import train_test_split -########################################################################## +############################################################################# # Fetch the HBN data # ------------------ -# -# The :func:`AFQDataset.from_files` static method expects a path to -# nodes.csv and subjects.csv files, but these file paths can be remote -# URLs or AWS S3 URIs. We'll use S3 URIs to grab the HBN data. After dropping -# participants with null phenotypic values, it has 1,867 participants. 
- -dataset = AFQDataset.from_files( - fn_nodes="s3://fcp-indi/data/Projects/HBN/BIDS_curated/derivatives/afq/combined_tract_profiles.csv", - fn_subjects="s3://fcp-indi/data/Projects/HBN/BIDS_curated/derivatives/qsiprep/participants.tsv", - dwi_metrics=["dki_fa", "dki_md"], - target_cols=["age", "sex", "scan_site_id"], - label_encode_cols=["sex", "scan_site_id"], - index_col="subject_id", -) +# As a shortcut, we have incorporated a few studies into the software. In these +# cases, an :class:`AFQDataset` class instance can be initialized using the +# :func:`AFQDataset.from_study` static method. This expects the name of one of +# the studies that are supported (see the method documentation for the list of +# these studies). By passing ``"hbn"``, we request that the object download the +# HBN dataset from the AWS Open Data program where it has been stored and +# initialize the object with the subjects and nodes information. Subjects' age +# is set as the target variable. After dropping subjects that don't have their +# age recorded, there are 1867 subjects in the dataset. + + +dataset = AFQDataset.from_study("hbn") dataset.drop_target_na() print(dataset) -########################################################################## +############################################################################# # Train / test split # ------------------ # -# We can use the dataset in the :func:`train_test_split` function just as we -# would with an array. +# We can pass the :class:`AFQDataset` class instance to scikit-learn's +# :func:`train_test_split` function, just as we would with an array. dataset_train, dataset_test = train_test_split(dataset, test_size=0.5) @@ -111,7 +109,7 @@ # # N.B. We use the excellent `neurocombat_sklearn # `_ package to apply ComBat to -# our data. We love this library, however it is not fully compliant with the +# our data. 
We love this library; however, it is not fully compliant with the # scikit-learn transformer API, so we cannot use the # :func:`AFQDataset.model_fit_transform` method to apply this transformer to our # dataset. No problem! We can simply copy the unharmonized dataset into a new