Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add feature to download google drive datasets #138

Merged
merged 37 commits into from
Aug 5, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
237d4a9
Add feature to download google drive datasets
ajlail98 Mar 25, 2021
c6fd46d
Add gdown to requirements.txt
ajlail98 Mar 25, 2021
751bd7c
:art: pulling from upstream dev
ajlail98 Apr 29, 2021
b65d14e
:art: Formatted script according to template, renamed variables
ajlail98 May 25, 2021
d7afab6
:art: Changed permissions
ajlail98 May 25, 2021
0f35181
:art: Added unique filenames for each file size
ajlail98 May 25, 2021
fbb6c73
:art: Moved to external folder
ajlail98 May 25, 2021
72f0c99
Moved script to validation and renamed
ajlail98 Jun 4, 2021
bd92f5e
Rename function and add type hints
ajlail98 Jun 4, 2021
ad70968
Add file containing fileIDs to reference
ajlail98 Jun 4, 2021
b7df5f5
Add user input options for files/folders
ajlail98 Jun 9, 2021
0abe3a6
Reformat with black
ajlail98 Jun 9, 2021
df63e97
Change targets variable name
ajlail98 Jun 10, 2021
79484a5
Change "folder" to "dataset"
ajlail98 Jun 10, 2021
662d5bf
Update column names
ajlail98 Jun 10, 2021
7678155
Condense logic into one function
ajlail98 Jun 11, 2021
3ffd397
Change logic to input multiple files and multiple output dirs
ajlail98 Jun 11, 2021
46eafc2
Add logger warnings
ajlail98 Jun 15, 2021
d21f825
Add datasets.py info to setup.py
ajlail98 Jun 15, 2021
54d151d
Change internet_is_connected into an import
ajlail98 Jun 24, 2021
3dd9e63
Add internet connection checker and error message
ajlail98 Jun 24, 2021
2a45ab2
Directory structure to organize downloads
ajlail98 Jul 13, 2021
b7c2048
Change variable names and clean up extra bits
ajlail98 Jul 13, 2021
9a932d5
Add __init__.py to validation
ajlail98 Jul 13, 2021
98e356b
Add error for non-existent dir_path
ajlail98 Jul 13, 2021
0d1274b
Add detail to internet_is_connected failure
ajlail98 Jul 14, 2021
7af3c95
Added NotImplementedError
ajlail98 Jul 16, 2021
df317b0
Only read csv once
ajlail98 Jul 16, 2021
85c9387
Change strategy for filtering df
ajlail98 Jul 16, 2021
12afe4b
Using df.loc to retrieve file_id
ajlail98 Jul 16, 2021
e7da939
Argparse and var name refinements
ajlail98 Jul 16, 2021
dceb0f5
Add ability to ping custom IP
ajlail98 Jul 20, 2021
622d934
Reformatting
ajlail98 Jul 20, 2021
ac89c06
Hardcode fileID csv hosted on google drive
ajlail98 Jul 22, 2021
af931bb
Reformatting
ajlail98 Jul 22, 2021
33f75e1
Remove gdown_fileIDs.csv
ajlail98 Jul 22, 2021
7fdf590
Add verbose error message and dockerfile entrypoint
ajlail98 Jul 30, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion autometa/binning/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,9 @@ def main():
)
# Now retrieve stats for each metabin
metabin_stats_df = get_metabin_stats(
bin_df=bin_df, markers_fpath=args.markers, cluster_col=args.binning_column,
bin_df=bin_df,
markers_fpath=args.markers,
cluster_col=args.binning_column,
)
metabin_stats_df.to_csv(args.output_stats, sep="\t", index=True, header=True)
logger.info(f"Wrote metabin stats to {args.output_stats}")
Expand Down
4 changes: 3 additions & 1 deletion autometa/binning/unclustered_recruitment.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,9 @@ def get_confidence_filtered_predictions(
raise NotImplementedError(classifier)

df = pd.DataFrame(
predictions, index=test_data.index, columns=train_data.target_names,
predictions,
index=test_data.index,
columns=train_data.target_names,
)
# Filter predictions by confidence threshold
chasemc marked this conversation as resolved.
Show resolved Hide resolved
confidence_threshold = num_classifications * confidence
Expand Down
90 changes: 90 additions & 0 deletions autometa/common/external/download_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
COPYRIGHT
Copyright 2021 Ian J. Miller, Evan R. Rees, Kyle Wolf, Siddharth Uppal,
Shaurya Chanana, Izaak Miller, Jason C. Kwan

This file is part of Autometa.

Autometa is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Autometa is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with Autometa. If not, see <http://www.gnu.org/licenses/>.
COPYRIGHT

pulling data from google drive folder with simulated or synthetic communities
"""


import gdown
import os
import logging

logger = logging.getLogger(__name__)


def download_dataset(dataset: str, out_dirpath: str) -> None:
    """Download a simulated community metagenome from Google Drive.

    The file is written to ``<out_dirpath>/<dataset>_metagenome.fna.gz``.

    Parameters
    ----------
    dataset : str
        Size of the simulated community in megabase pairs, or "test".
        One of {"test", "78", "156", "312", "625", "1250", "2500",
        "5000", "10000"}.
    out_dirpath : str
        Directory in which to place the downloaded file.
        NOTE(review): presumably this directory must already exist —
        ``gdown`` is handed the joined path directly; confirm behavior
        for a missing directory.

    Raises
    ------
    KeyError
        If `dataset` is not one of the recognized dataset keys.
    """
    # Google Drive file IDs for each available simulated community,
    # keyed by community size (Mbp).
    simulated = {
        "test": "1fy3M7RnS_HGSQVKidCy-rAwXuxldyOOv",
        "78": "15CB8rmQaHTGy7gWtZedfBJkrwr51bb2y",
        "156": "13bkwFBIUhdWVWlAmVCimDODWF-7tRxgI",
        "312": "1qyAu-m6NCNuVlDFFC10waOD28j15yfV-",
        "625": "1FgMXSD50ggu0UJbZd1PM_AvLt-E7gJix",
        "1250": "1KoxwxBAYcz8Xz9H2v17N9CHOZ-WXWS5m",
        "2500": "1wKZytjC4zjTuhHdNUyAT6wVbuDDIwk2m",
        "5000": "1IX6vLfBptPxhL44dLa6jePs-GRw2XJ3S",
        "10000": "1ON2vxEWC5FHyyPqlfZ0znMgnQ1fTirqG",
    }

    # Fail early with an actionable message instead of a bare KeyError
    # (same exception type as before, so callers' except clauses still work).
    if dataset not in simulated:
        raise KeyError(
            f"{dataset!r} is not an available dataset. "
            f"Choose from: {', '.join(simulated)}"
        )

    # Construct the file id into a url to hand off to gdown.
    file_id = simulated[dataset]
    url = f"https://drive.google.com/uc?id={file_id}"
    filename = f"{dataset}_metagenome.fna.gz"
    out_fpath = os.path.join(out_dirpath, filename)

    # Download the specified file with gdown.
    logger.debug(f"Downloading {url} to {out_fpath}")
    gdown.download(url, out_fpath)


def main():
    """Command-line entry point: parse arguments and download the requested dataset."""
    import argparse

    # Configure logging here rather than at import time so that importing
    # this module as a library does not reconfigure the root logger.
    # (Uses the module-level `logging` import; the previous
    # `import logging as logger` shadowed the module-level Logger instance.)
    logging.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logging.DEBUG,
    )

    parser = argparse.ArgumentParser(
        prog="autometa-download-dataset",
        description="Download a simulated community file from google drive to a specified directory",
    )
    # Choices mirror the keys of the file-id table in download_dataset.
    parser.add_argument(
        "--dataset",
        help="specify a size of simulated community in megabase pairs",
        choices=["78", "156", "312", "625", "1250", "2500", "5000", "10000", "test"],
        required=True,
    )
    parser.add_argument(
        "--out_dirpath",
        help="specify the directory to download the file",
        required=True,
    )
    args = parser.parse_args()

    download_dataset(args.dataset, args.out_dirpath)


if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
autodoc_mock_imports = ["Bio", "hdbscan", "tsne", "sklearn", "umap", "tqdm"]

# fmt: off
import parse_argparse
import parse_argparse

# -- Project information -----------------------------------------------------

Expand Down
3 changes: 2 additions & 1 deletion tests/unit_tests/test_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,8 @@ def return_metabin_taxonomies(*args, **kwargs):

@pytest.mark.skip
def test_get_metabin_taxonomies(
mock_rank_taxids, bin_df,
mock_rank_taxids,
bin_df,
):
mock_ncbi = return_mock_ncbi()
chasemc marked this conversation as resolved.
Show resolved Hide resolved
df = summary.get_metabin_taxonomies(bin_df=bin_df, ncbi=mock_ncbi)
Expand Down
36 changes: 29 additions & 7 deletions tests/unit_tests/test_vote.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,30 +117,43 @@ def test_add_ranks(ncbi, votes, tmp_path):
@pytest.mark.skip
def test_vote_assign(blastp, ncbi_dir, prot_orfs, tmp_path):
out = tmp_path / "votes.tsv"
votes = vote.assign(out=out, prot_orfs=prot_orfs, blast=blastp, ncbi_dir=ncbi_dir,)
votes = vote.assign(
out=out,
prot_orfs=prot_orfs,
blast=blastp,
ncbi_dir=ncbi_dir,
)
assert isinstance(votes, pd.DataFrame)
assert votes.index.name == "contig"
assert "taxid" in votes.columns


def test_get(ncbi, votes_fpath):
df = vote.get(filepath_or_dataframe=votes_fpath, kingdom="bacteria", ncbi=ncbi,)
df = vote.get(
filepath_or_dataframe=votes_fpath,
kingdom="bacteria",
ncbi=ncbi,
)
# canonical ranks should have been added to table if they were not already in place.
assert df.shape == (2, 8)


def test_get_none_recovered(ncbi, votes_fpath):
with pytest.raises(KeyError):
vote.get(
filepath_or_dataframe=votes_fpath, kingdom="archaea", ncbi=ncbi,
filepath_or_dataframe=votes_fpath,
kingdom="archaea",
ncbi=ncbi,
)


def test_get_empty_votes(ncbi_dir, tmp_path):
fpath = tmp_path / "votes.tsv"
with pytest.raises(FileNotFoundError):
vote.get(
filepath_or_dataframe=fpath, kingdom="archaea", ncbi=ncbi_dir,
filepath_or_dataframe=fpath,
kingdom="archaea",
ncbi=ncbi_dir,
)


Expand All @@ -153,13 +166,19 @@ def return_df(*args, **kwargs):
monkeypatch.setattr(vote, "add_ranks", return_df, raising=True)
with pytest.raises(TableFormatError):
vote.get(
filepath_or_dataframe=fpath, kingdom="archaea", ncbi=ncbi,
filepath_or_dataframe=fpath,
kingdom="archaea",
ncbi=ncbi,
)


@pytest.fixture(name="ranks_added_votes", scope="module")
def fixture_ranks_added_votes(votes_fpath, ncbi):
return vote.get(filepath_or_dataframe=votes_fpath, kingdom="bacteria", ncbi=ncbi,)
return vote.get(
filepath_or_dataframe=votes_fpath,
kingdom="bacteria",
ncbi=ncbi,
)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -217,7 +236,10 @@ def test_write_ranks_no_taxonomy_columns(tmp_path, votes):
assembly = dirpath / "assembly.fna"
with pytest.raises(KeyError):
vote.write_ranks(
taxonomy=votes, assembly=assembly, outdir=dirpath, rank="superkingdom",
taxonomy=votes,
assembly=assembly,
outdir=dirpath,
rank="superkingdom",
)

chasemc marked this conversation as resolved.
Show resolved Hide resolved

Expand Down