Add feature to download google drive datasets (#138)

* Add feature to download google drive datasets Issue #110
* Add gdown to requirements.txt
* 🎨 Formatted script according to template, renamed variables
* 🎨 Changed permissions
* 🎨 Added unique filenames for each file size
* 🎨 Moved to external folder
* Moved script to validation and renamed
* Rename function and add type hints
* Add file containing fileIDs to reference
* Add user input options for files/folders
* Reformat with black
* Change targets variable name
* Change "folder" to "dataset"
* Update column names
* Condense logic into one function
* Change logic to input multiple files and multiple output dirs
* Add logger warnings
* Add datasets.py info to setup.py
* Change internet_is_connected into an import
* Add internet connection checker and error message
* Directory structure to organize downloads
* Change variable names and clean up extra bits
* Add __init__.py to validation
* Add error for non-existent dir_path
* Add detail to internet_is_connected failure
* Added NotImplementedError
* Only read csv once
* Change strategy for filtering df
* Using df.loc to retrieve file_id
* Argparse and var name refinements
* Add ability to ping custom IP
* Reformatting
* Hardcode fileID csv hosted on google drive
* Reformatting
* Remove gdown_fileIDs.csv
* Add verbose error message and dockerfile entrypoint
KwanLab · Aug 5, 2021 · 8a64e7a · 8a64e7a
1 parent 4f98acc
commit 8a64e7a
Show file tree

Hide file tree

Showing 12 changed files with 281 additions and 26 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,11 +1,11 @@
 exclude: "^autometa/validation"
 repos:
   - repo: https://github.com/psf/black
-    rev: stable
+    rev: 21.5b2
     hooks:
       - id: black
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v3.1.0 # Use the ref you want to point at
+    rev: v4.0.1 # Use the ref you want to point at
     hooks:
       - id: trailing-whitespace
       - id: check-executables-have-shebangs

diff --git a/Dockerfile b/Dockerfile
@@ -57,4 +57,5 @@ RUN echo "Checking autometa entrypoints" \
     && autometa-taxonomy-majority-vote -h > /dev/null \
     && autometa-binning -h > /dev/null \
     && autometa-unclustered-recruitment -h > /dev/null \
-    && autometa-binning-summary -h > /dev/null
+    && autometa-binning-summary -h > /dev/null \
+    && autometa-download-dataset -h > /dev/null
diff --git a/autometa/binning/summary.py b/autometa/binning/summary.py
@@ -323,7 +323,9 @@ def main():
     )
     # Now retrieve stats for each metabin
     metabin_stats_df = get_metabin_stats(
-        bin_df=bin_df, markers_fpath=args.markers, cluster_col=args.binning_column,
+        bin_df=bin_df,
+        markers_fpath=args.markers,
+        cluster_col=args.binning_column,
     )
     metabin_stats_df.to_csv(args.output_stats, sep="\t", index=True, header=True)
     logger.info(f"Wrote metabin stats to {args.output_stats}")

diff --git a/autometa/binning/unclustered_recruitment.py b/autometa/binning/unclustered_recruitment.py
@@ -400,7 +400,9 @@ def get_confidence_filtered_predictions(
         raise NotImplementedError(classifier)
 
     df = pd.DataFrame(
-        predictions, index=test_data.index, columns=train_data.target_names,
+        predictions,
+        index=test_data.index,
+        columns=train_data.target_names,
     )
     # Filter predictions by confidence threshold
     confidence_threshold = num_classifications * confidence

diff --git a/autometa/common/utilities.py b/autometa/common/utilities.py
@@ -29,6 +29,7 @@
 import logging
 import os
 import pickle
+import socket
 import sys
 import tarfile
 import time
@@ -444,6 +445,18 @@ def wrapper(*args, **kwds):
     return wrapper
 
 
+def internet_is_connected(
+    host: str = "8.8.8.8", port: int = 53, timeout: int = 2
+) -> bool:
+    """Check for internet connectivity by opening a TCP connection to `host`.
+
+    Parameters
+    ----------
+    host : str, optional
+        IPv4 address (or hostname) to connect to, by default "8.8.8.8"
+        (Google Public DNS)
+    port : int, optional
+        TCP port to connect to, by default 53 (DNS)
+    timeout : int, optional
+        Seconds to wait for the connection before giving up, by default 2
+
+    Returns
+    -------
+    bool
+        True if the connection was established, otherwise False
+
+    """
+    # The timeout is applied to this socket only (instead of calling
+    # socket.setdefaulttimeout) so sockets created later elsewhere in the
+    # process keep their own timeout. The context manager guarantees the
+    # socket is closed whether or not the connection succeeds.
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+            sock.settimeout(timeout)
+            sock.connect((host, port))
+    except socket.error:
+        return False
+    return True
+
+
 if __name__ == "__main__":
     print(
         "This file contains utilities for Autometa pipeline and should not be run directly!"

diff --git a/autometa/config/databases.py b/autometa/config/databases.py
@@ -42,6 +42,7 @@
 from autometa.common.utilities import calc_checksum
 from autometa.common.utilities import read_checksum
 from autometa.common.utilities import write_checksum
+from autometa.common.utilities import internet_is_connected
 from autometa.common.exceptions import ChecksumMismatchError
 from autometa.common.external import diamond
 from autometa.common.external import hmmer
@@ -183,17 +184,6 @@ def satisfied(self, section: str = None, compare_checksums: bool = False) -> boo
             any_invalid = {}
         return not any_missing and not any_invalid
 
-    def internet_is_connected(
-        self, host: str = "8.8.8.8", port: int = 53, timeout: int = 2
-    ) -> bool:
-        # google.com
-        try:
-            socket.setdefaulttimeout(timeout)
-            socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect((host, port))
-            return True
-        except socket.error:
-            return False
-
     def get_remote_checksum(self, section: str, option: str) -> str:
         """Get the checksum from provided `section` respective to `option` in
         `self.config`.
@@ -226,7 +216,7 @@ def get_remote_checksum(self, section: str, option: str) -> str:
             raise ValueError(
                 f"'section' must be 'ncbi' or 'markers'. Provided: {section}"
             )
-        if not self.internet_is_connected():
+        if not internet_is_connected():
             raise ConnectionError("Cannot connect to the internet")
         if section == "ncbi":
             host = self.config.get(section, "host")

diff --git a/autometa/validation/__init__.py b/autometa/validation/__init__.py
diff --git a/autometa/validation/datasets.py b/autometa/validation/datasets.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+COPYRIGHT
+Copyright 2021 Ian J. Miller, Evan R. Rees, Kyle Wolf, Siddharth Uppal,
+Shaurya Chanana, Izaak Miller, Jason C. Kwan
+
+This file is part of Autometa.
+
+Autometa is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Autometa is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with Autometa. If not, see <http://www.gnu.org/licenses/>.
+COPYRIGHT
+
+Download datasets of simulated or synthetic communities hosted on Google Drive.
+"""
+
+
+import gdown
+import os
+import sys
+import logging
+import pandas as pd
+
+from autometa.common.utilities import internet_is_connected
+
+logger = logging.getLogger(__name__)
+
+
+def download(
+    community_type: str, community_sizes: list, file_names: list, dir_path: str
+) -> None:
+    """Download the requested files for each requested community size.
+
+    File IDs are looked up in a csv hosted on google drive (indexed by its
+    "dataset" and "file" columns) and each file is written to
+    `dir_path`/<community_size>/<file_name>.
+
+    Parameters
+    ----------
+    community_type : str
+        specifies the type of dataset that the user would like to download from
+    community_sizes : list
+        specifies the size(s) of dataset that the user would like to download
+    file_names : list
+        specifies the file(s) that the user would like to download
+    dir_path : str
+        output path where the user wants to download the file(s). One
+        sub-directory is created per community size.
+
+    Returns
+    -------
+    None
+        download is completed through gdown
+
+    Raises
+    ------
+    NotImplementedError
+        `community_type` is "synthetic" or "all"
+    KeyError
+        No file ID could be retrieved for a requested community size and
+        file name combination
+
+    """
+    if community_type in ("synthetic", "all"):
+        raise NotImplementedError(
+            f"Downloading '{community_type}' communities is not yet implemented"
+        )
+
+    # points to csv file on google drive
+    df = pd.read_csv(
+        "https://drive.google.com/uc?id=148fUO7jocoNOBUl2K4bCfjsbd42QxCzX",
+        dtype=str,
+        index_col=["dataset", "file"],
+    )
+
+    for community_size in community_sizes:
+        community_size_outdir = os.path.join(dir_path, community_size)
+        # exist_ok avoids the race between checking for and creating the directory
+        os.makedirs(community_size_outdir, exist_ok=True)
+
+        for file_name in file_names:
+            try:
+                file_id = df.loc[(community_size, file_name), "file_id"]
+            except KeyError as err:
+                raise KeyError(
+                    f"Could not retrieve file_id for dataset '{community_size}' and file '{file_name}'"
+                ) from err
+            file_id_filepath = os.path.join(community_size_outdir, file_name)
+            url = f"https://drive.google.com/uc?id={file_id}"
+
+            gdown.download(url, file_id_filepath)
+
+
+def main():
+    """Parse command line arguments and download the requested dataset files.
+
+    "all" given to --community-sizes or --file-names is expanded to every
+    available size or file name. A failed connectivity check is only logged;
+    the download is still attempted.
+    """
+    import argparse
+
+    # Configure the root logger; messages are emitted through the module-level
+    # `logger` rather than an alias of the logging module that would shadow it.
+    logging.basicConfig(
+        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
+        datefmt="%m/%d/%Y %I:%M:%S %p",
+        level=logging.DEBUG,
+    )
+
+    # Defined once so the argparse choices and the "all" expansion stay in sync.
+    available_community_sizes = (
+        "78Mbp",
+        "156Mbp",
+        "312Mbp",
+        "625Mbp",
+        "1250Mbp",
+        "2500Mbp",
+        "5000Mbp",
+        "10000Mbp",
+    )
+    available_file_names = (
+        "README.md",
+        "reference_assignments.tsv.gz",
+        "metagenome.fna.gz",
+        "master.tsv.gz",
+        "control_reads.tsv.gz",
+        "control_contigs.tsv.gz",
+        "unclustered_recruitment.tsv.gz",
+        "binning.tsv.gz",
+        "taxonomy.tsv.gz",
+        "lengths.tsv.gz",
+        "coverages.tsv.gz",
+        "gc_content.tsv.gz",
+        "kmers.embedded.tsv.gz",
+        "kmers.tsv.gz",
+        "markers.tsv.gz",
+        "Bacteria.fna.gz",
+        "orfs.faa.gz",
+        "metagenome.filtered.fna.gz",
+        "hmmscan.tsv.gz",
+        "forward_reads.fastq.gz",
+        "reverse_reads.fastq.gz",
+    )
+
+    parser = argparse.ArgumentParser(
+        description="Download a simulated community file from google drive to a specified output directory"
+    )
+    parser.add_argument(
+        "--community-type",
+        help="specify synthetic or simulated communities (currently only simulated is available)",
+        choices=[
+            "synthetic",
+            "simulated",
+            "all",
+        ],
+        required=True,
+    )
+    parser.add_argument(
+        "--community-sizes",
+        help="specify a community size to download from",
+        choices=[*available_community_sizes, "all"],
+        required=True,
+        nargs="+",
+    )
+    parser.add_argument(
+        "--file-names",
+        help="specify a file name to download",
+        choices=[*available_file_names, "all"],
+        nargs="+",
+        required=True,
+    )
+    parser.add_argument(
+        "--dir-path",
+        help="specify a folder to start the download (several directories will be generated within this folder)",
+        required=True,
+    )
+    parser.add_argument(
+        "--host",
+        help="IP address to ping when checking internet connectivity. Note: Will attempt to connect to port 53 on host address (Default is google.com)",
+        default="8.8.8.8",
+    )
+    args = parser.parse_args()
+
+    if "all" in args.community_sizes:
+        community_sizes = available_community_sizes
+    else:
+        community_sizes = args.community_sizes
+    if "all" in args.file_names:
+        file_names = available_file_names
+    else:
+        file_names = args.file_names
+
+    if not internet_is_connected(host=args.host):
+        # Report the host that was actually checked (it may be a custom --host).
+        logger.error(
+            f"No internet connection detected (couldn't ping {args.host}). Please confirm connection. Downloader will still attempt to run. (Ping a custom IP address with --host argument)"
+        )
+
+    download(
+        community_type=args.community_type,
+        community_sizes=community_sizes,
+        file_names=file_names,
+        dir_path=args.dir_path,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -20,7 +20,7 @@
 autodoc_mock_imports = ["Bio", "hdbscan", "tsne", "sklearn", "umap", "tqdm"]
 
 # fmt: off
-import parse_argparse  
+import parse_argparse
 
 # -- Project information -----------------------------------------------------
 

diff --git a/setup.py b/setup.py
@@ -37,6 +37,7 @@ def read(fname):
             "autometa-binning = autometa.binning.recursive_dbscan:main",
             "autometa-unclustered-recruitment = autometa.binning.unclustered_recruitment:main",
             "autometa-binning-summary = autometa.binning.summary:main",
+            "autometa-download-dataset = autometa.validation.datasets:main",
         ]
     },
     author="Jason C. Kwan",

diff --git a/tests/unit_tests/test_summary.py b/tests/unit_tests/test_summary.py
@@ -150,7 +150,8 @@ def return_metabin_taxonomies(*args, **kwargs):
 
 @pytest.mark.skip
 def test_get_metabin_taxonomies(
-    mock_rank_taxids, bin_df,
+    mock_rank_taxids,
+    bin_df,
 ):
     mock_ncbi = return_mock_ncbi()
     df = summary.get_metabin_taxonomies(bin_df=bin_df, ncbi=mock_ncbi)