diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b058c73de..66aa0f6ce 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,11 +1,11 @@
 exclude: "^autometa/validation"
 repos:
   - repo: https://github.com/psf/black
-    rev: stable
+    rev: 21.5b2
     hooks:
       - id: black
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v3.1.0 # Use the ref you want to point at
+    rev: v4.0.1 # Use the ref you want to point at
     hooks:
       - id: trailing-whitespace
       - id: check-executables-have-shebangs
diff --git a/Dockerfile b/Dockerfile
index 4352d49a7..cafefbb28 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -57,4 +57,5 @@ RUN echo "Checking autometa entrypoints" \
     && autometa-taxonomy-majority-vote -h > /dev/null \
     && autometa-binning -h > /dev/null \
     && autometa-unclustered-recruitment -h > /dev/null \
-    && autometa-binning-summary -h > /dev/null
+    && autometa-binning-summary -h > /dev/null \
+    && autometa-download-dataset -h > /dev/null
diff --git a/autometa/binning/summary.py b/autometa/binning/summary.py
index 1ad39ca46..cb4746bc8 100644
--- a/autometa/binning/summary.py
+++ b/autometa/binning/summary.py
@@ -323,7 +323,9 @@ def main():
     )
     # Now retrieve stats for each metabin
     metabin_stats_df = get_metabin_stats(
-        bin_df=bin_df, markers_fpath=args.markers, cluster_col=args.binning_column,
+        bin_df=bin_df,
+        markers_fpath=args.markers,
+        cluster_col=args.binning_column,
     )
     metabin_stats_df.to_csv(args.output_stats, sep="\t", index=True, header=True)
     logger.info(f"Wrote metabin stats to {args.output_stats}")
diff --git a/autometa/binning/unclustered_recruitment.py b/autometa/binning/unclustered_recruitment.py
index 5e5bed120..07cd5c1cf 100644
--- a/autometa/binning/unclustered_recruitment.py
+++ b/autometa/binning/unclustered_recruitment.py
@@ -400,7 +400,9 @@ def get_confidence_filtered_predictions(
         raise NotImplementedError(classifier)
 
     df = pd.DataFrame(
-        predictions, index=test_data.index, columns=train_data.target_names,
+        predictions,
+        index=test_data.index,
+        columns=train_data.target_names,
     )
     # Filter predictions by confidence threshold
     confidence_threshold = num_classifications * confidence
diff --git a/autometa/common/utilities.py b/autometa/common/utilities.py
index c183bf8d4..2cd0dce3b 100644
--- a/autometa/common/utilities.py
+++ b/autometa/common/utilities.py
@@ -29,6 +29,7 @@
 import logging
 import os
 import pickle
+import socket
 import sys
 import tarfile
 import time
@@ -444,6 +445,18 @@ def wrapper(*args, **kwds):
     return wrapper
 
 
+def internet_is_connected(
+    host: str = "8.8.8.8", port: int = 53, timeout: int = 2
+) -> bool:
+    # google.com
+    try:
+        socket.setdefaulttimeout(timeout)
+        socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect((host, port))
+        return True
+    except socket.error:
+        return False
+
+
 if __name__ == "__main__":
     print(
         "This file contains utilities for Autometa pipeline and should not be run directly!"
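For context on the relocated helper: a minimal sketch (not part of the patch) of how internet_is_connected is meant to be called now that it lives in autometa.common.utilities; it mirrors the databases.py change in the next file diff.

# Sketch only: connectivity check via the helper moved into utilities.py above.
from autometa.common.utilities import internet_is_connected

# Defaults probe Google's public DNS (host "8.8.8.8", port 53) with a 2-second timeout.
if not internet_is_connected():
    raise ConnectionError("Cannot connect to the internet")
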
diff --git a/autometa/config/databases.py b/autometa/config/databases.py
index 5848db647..3f9d3771e 100644
--- a/autometa/config/databases.py
+++ b/autometa/config/databases.py
@@ -42,6 +42,7 @@
 from autometa.common.utilities import calc_checksum
 from autometa.common.utilities import read_checksum
 from autometa.common.utilities import write_checksum
+from autometa.common.utilities import internet_is_connected
 from autometa.common.exceptions import ChecksumMismatchError
 from autometa.common.external import diamond
 from autometa.common.external import hmmer
@@ -183,17 +184,6 @@ def satisfied(self, section: str = None, compare_checksums: bool = False) -> boo
             any_invalid = {}
         return not any_missing and not any_invalid
 
-    def internet_is_connected(
-        self, host: str = "8.8.8.8", port: int = 53, timeout: int = 2
-    ) -> bool:
-        # google.com
-        try:
-            socket.setdefaulttimeout(timeout)
-            socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect((host, port))
-            return True
-        except socket.error:
-            return False
-
     def get_remote_checksum(self, section: str, option: str) -> str:
         """Get the checksum from provided `section` respective to `option` in `self.config`.
 
@@ -226,7 +216,7 @@ def get_remote_checksum(self, section: str, option: str) -> str:
             raise ValueError(
                 f"'section' must be 'ncbi' or 'markers'. Provided: {section}"
             )
-        if not self.internet_is_connected():
+        if not internet_is_connected():
             raise ConnectionError("Cannot connect to the internet")
         if section == "ncbi":
             host = self.config.get(section, "host")
diff --git a/autometa/validation/__init__.py b/autometa/validation/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/autometa/validation/datasets.py b/autometa/validation/datasets.py
new file mode 100755
index 000000000..69009618e
--- /dev/null
+++ b/autometa/validation/datasets.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+COPYRIGHT
+Copyright 2021 Ian J. Miller, Evan R. Rees, Kyle Wolf, Siddharth Uppal,
+Shaurya Chanana, Izaak Miller, Jason C. Kwan
+
+This file is part of Autometa.
+
+Autometa is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Autometa is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with Autometa. If not, see <https://www.gnu.org/licenses/>.
+COPYRIGHT
+
+pulling data from google drive dataset with simulated or synthetic communities
+"""
+
+
+import gdown
+import os
+import sys
+import logging
+import pandas as pd
+
+from autometa.common.utilities import internet_is_connected
+
+logger = logging.getLogger(__name__)
+
+
+def download(
+    community_type: str, community_sizes: list, file_names: list, dir_path: str
+) -> None:
+
+    """Downloads the files specified in a dictionary.
+
+    Parameters
+    ----------
+    community_type : str
+        specifies the type of dataset that the user would like to download from
+    community_sizes : list
+        specifies the size of dataset that the user would like to download
+    file_names : list
+        specifies the file(s) that the user would like to download
+    dir_path : str
+        output path where the user wants to download the file(s)
+
+    Returns
+    -------
+    None
+        download is completed through gdown
+
+    """
+
+    if community_type == "synthetic" or community_type == "all":
+        raise NotImplementedError
+
+    # points to csv file on google drive
+    df = pd.read_csv(
+        "https://drive.google.com/uc?id=148fUO7jocoNOBUl2K4bCfjsbd42QxCzX",
+        dtype=str,
+        index_col=["dataset", "file"],
+    )
+
+    for community_size in community_sizes:
+        community_size_outdir = os.path.join(dir_path, community_size)
+        # make a new directory
+        if not os.path.exists(community_size_outdir):
+            os.makedirs(community_size_outdir)
+
+        for file_name in file_names:
+            file_id = df.loc[(community_size, file_name), "file_id"]
+            file_id_filepath = os.path.join(community_size_outdir, file_name)
+            url = f"https://drive.google.com/uc?id={file_id}"
+
+            gdown.download(url, file_id_filepath)
+
+
+def main():
+    import argparse
+    import logging as logger
+
+    logger.basicConfig(
+        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
+        datefmt="%m/%d/%Y %I:%M:%S %p",
+        level=logger.DEBUG,
+    )
+
+    parser = argparse.ArgumentParser(
+        description="Download a simulated community file from google drive to a specified output directory"
+    )
+    parser.add_argument(
+        "--community-type",
+        help="specify synthetic or simulated communities (currently only simulated is available)",
+        choices=[
+            "synthetic",
+            "simulated",
+            "all",
+        ],
+        required=True,
+    )
+    parser.add_argument(
+        "--community-sizes",
+        help="specify a community size to download from",
+        choices=[
+            "78Mbp",
+            "156Mbp",
+            "312Mbp",
+            "625Mbp",
+            "1250Mbp",
+            "2500Mbp",
+            "5000Mbp",
+            "10000Mbp",
+            "all",
+        ],
+        required=True,
+        nargs="+",
+    )
+    parser.add_argument(
+        "--file-names",
+        help="specify a file name to download",
+        choices=[
+            "README.md",
+            "reference_assignments.tsv.gz",
+            "metagenome.fna.gz",
+            "master.tsv.gz",
+            "control_reads.tsv.gz",
+            "control_contigs.tsv.gz",
+            "unclustered_recruitment.tsv.gz",
+            "binning.tsv.gz",
+            "taxonomy.tsv.gz",
+            "lengths.tsv.gz",
+            "coverages.tsv.gz",
+            "gc_content.tsv.gz",
+            "kmers.embedded.tsv.gz",
+            "kmers.tsv.gz",
+            "markers.tsv.gz",
+            "Bacteria.fna.gz",
+            "orfs.faa.gz",
+            "metagenome.filtered.fna.gz",
+            "hmmscan.tsv.gz",
+            "forward_reads.fastq.gz",
+            "reverse_reads.fastq.gz",
+            "all",
+        ],
+        nargs="+",
+        required=True,
+    )
+    parser.add_argument(
+        "--dir-path",
+        help="specify a folder to start the download (several directories will be generated within this folder)",
+        required=True,
+    )
+    parser.add_argument(
+        "--host",
+        help="IP address to ping when checking internet connectivity. Note: Will attempt to connect to port 53 on host address (Default is google.com)",
+        default="8.8.8.8",
+    )
+    args = parser.parse_args()
+
+    if "all" in args.community_sizes:
+        community_sizes = (
+            "78Mbp",
+            "156Mbp",
+            "312Mbp",
+            "625Mbp",
+            "1250Mbp",
+            "2500Mbp",
+            "5000Mbp",
+            "10000Mbp",
+        )
+    else:
+        community_sizes = args.community_sizes
+    if "all" in args.file_names:
+        file_names = (
+            "README.md",
+            "reference_assignments.tsv.gz",
+            "metagenome.fna.gz",
+            "master.tsv.gz",
+            "control_reads.tsv.gz",
+            "control_contigs.tsv.gz",
+            "unclustered_recruitment.tsv.gz",
+            "binning.tsv.gz",
+            "taxonomy.tsv.gz",
+            "lengths.tsv.gz",
+            "coverages.tsv.gz",
+            "gc_content.tsv.gz",
+            "kmers.embedded.tsv.gz",
+            "kmers.tsv.gz",
+            "markers.tsv.gz",
+            "Bacteria.fna.gz",
+            "orfs.faa.gz",
+            "metagenome.filtered.fna.gz",
+            "hmmscan.tsv.gz",
+            "forward_reads.fastq.gz",
+            "reverse_reads.fastq.gz",
+        )
+    else:
+        file_names = args.file_names
+
+    if not internet_is_connected(host=args.host):
+        logger.error(
+            "No internet connection detected (couldn't ping google.com at IP 8.8.8.8). Please confirm connection. Downloader will still attempt to run. (Ping a custom IP address with --host argument)"
+        )
+
+    download(
+        community_type=args.community_type,
+        community_sizes=community_sizes,
+        file_names=file_names,
+        dir_path=args.dir_path,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/source/conf.py b/docs/source/conf.py
index bd9d48d07..97e12cfbb 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -20,7 +20,7 @@
 autodoc_mock_imports = ["Bio", "hdbscan", "tsne", "sklearn", "umap", "tqdm"]
 
 # fmt: off
-import parse_argparse
+import parse_argparse
 
 
 # -- Project information -----------------------------------------------------
diff --git a/setup.py b/setup.py
index 6efd52400..ca52dc499 100644
--- a/setup.py
+++ b/setup.py
@@ -37,6 +37,7 @@ def read(fname):
             "autometa-binning = autometa.binning.recursive_dbscan:main",
             "autometa-unclustered-recruitment = autometa.binning.unclustered_recruitment:main",
             "autometa-binning-summary = autometa.binning.summary:main",
+            "autometa-download-dataset = autometa.validation.datasets:main",
         ]
     },
     author="Jason C. Kwan",
diff --git a/tests/unit_tests/test_summary.py b/tests/unit_tests/test_summary.py
index 81db0e281..14c42cef4 100644
--- a/tests/unit_tests/test_summary.py
+++ b/tests/unit_tests/test_summary.py
@@ -150,7 +150,8 @@ def return_metabin_taxonomies(*args, **kwargs):
 
 @pytest.mark.skip
 def test_get_metabin_taxonomies(
-    mock_rank_taxids, bin_df,
+    mock_rank_taxids,
+    bin_df,
 ):
     mock_ncbi = return_mock_ncbi()
     df = summary.get_metabin_taxonomies(bin_df=bin_df, ncbi=mock_ncbi)
diff --git a/tests/unit_tests/test_vote.py b/tests/unit_tests/test_vote.py
index 87c45cdd6..19c9a943e 100644
--- a/tests/unit_tests/test_vote.py
+++ b/tests/unit_tests/test_vote.py
@@ -117,14 +117,23 @@ def test_add_ranks(ncbi, votes, tmp_path):
 
 @pytest.mark.skip
 def test_vote_assign(blastp, ncbi_dir, prot_orfs, tmp_path):
     out = tmp_path / "votes.tsv"
-    votes = vote.assign(out=out, prot_orfs=prot_orfs, blast=blastp, ncbi_dir=ncbi_dir,)
+    votes = vote.assign(
+        out=out,
+        prot_orfs=prot_orfs,
+        blast=blastp,
+        ncbi_dir=ncbi_dir,
+    )
     assert isinstance(votes, pd.DataFrame)
     assert votes.index.name == "contig"
     assert "taxid" in votes.columns
 
 
 def test_get(ncbi, votes_fpath):
-    df = vote.get(filepath_or_dataframe=votes_fpath, kingdom="bacteria", ncbi=ncbi,)
+    df = vote.get(
+        filepath_or_dataframe=votes_fpath,
+        kingdom="bacteria",
+        ncbi=ncbi,
+    )
     # canonical ranks should have been added to table if they were not already in place.
     assert df.shape == (2, 8)
@@ -132,7 +141,9 @@ def test_get_none_recovered(ncbi, votes_fpath):
     with pytest.raises(KeyError):
         vote.get(
-            filepath_or_dataframe=votes_fpath, kingdom="archaea", ncbi=ncbi,
+            filepath_or_dataframe=votes_fpath,
+            kingdom="archaea",
+            ncbi=ncbi,
         )
 
 
@@ -140,7 +151,9 @@ def test_get_empty_votes(ncbi_dir, tmp_path):
     fpath = tmp_path / "votes.tsv"
     with pytest.raises(FileNotFoundError):
         vote.get(
-            filepath_or_dataframe=fpath, kingdom="archaea", ncbi=ncbi_dir,
+            filepath_or_dataframe=fpath,
+            kingdom="archaea",
+            ncbi=ncbi_dir,
         )
 
 
@@ -153,13 +166,19 @@ def return_df(*args, **kwargs):
     monkeypatch.setattr(vote, "add_ranks", return_df, raising=True)
     with pytest.raises(TableFormatError):
         vote.get(
-            filepath_or_dataframe=fpath, kingdom="archaea", ncbi=ncbi,
+            filepath_or_dataframe=fpath,
+            kingdom="archaea",
+            ncbi=ncbi,
         )
 
 
 @pytest.fixture(name="ranks_added_votes", scope="module")
 def fixture_ranks_added_votes(votes_fpath, ncbi):
-    return vote.get(filepath_or_dataframe=votes_fpath, kingdom="bacteria", ncbi=ncbi,)
+    return vote.get(
+        filepath_or_dataframe=votes_fpath,
+        kingdom="bacteria",
+        ncbi=ncbi,
+    )
 
 
 @pytest.mark.parametrize(
@@ -217,7 +236,10 @@ def test_write_ranks_no_taxonomy_columns(tmp_path, votes):
     assembly = dirpath / "assembly.fna"
     with pytest.raises(KeyError):
         vote.write_ranks(
-            taxonomy=votes, assembly=assembly, outdir=dirpath, rank="superkingdom",
+            taxonomy=votes,
+            assembly=assembly,
+            outdir=dirpath,
+            rank="superkingdom",
         )