Add feature to download google drive datasets (#138)
* Add feature to download google drive datasets
Issue #110

* Add gdown to requirements.txt

* 🎨 Formatted script according to template, renamed variables

* 🎨 Changed permissions

* 🎨 Added unique filenames for each file size

* 🎨 Moved to external folder

* Moved script to validation and renamed

* Rename function and add type hints

* Add file containing fileIDs to reference

* Add user input options for files/folders

* Reformat with black

* Change targets variable name

* Change "folder" to "dataset"

* Update column names

* Condense logic into one function

* Change logic to input multiple files and multiple output dirs

* Add logger warnings

* Add datasets.py info to setup.py

* Change internet_is_connected into an import

* Add internet connection checker and error message

* Directory structure to organize downloads

* Change variable names and clean up extra bits

* Add __init__.py to validation

* Add error for non-existent dir_path

* Add detail to internet_is_connected failure

* Added NotImplementedError

* Only read csv once

* Change strategy for filtering df

* Using df.loc to retrieve file_id

* Argparse and var name refinements

* Add ability to ping custom IP

* Reformatting

* Hardcode fileID csv hosted on google drive

* Reformatting

* Remove gdown_fileIDs.csv

* Add verbose error message and dockerfile entrypoint
ajlail98 authored Aug 5, 2021
1 parent 4f98acc commit 8a64e7a
Showing 12 changed files with 281 additions and 26 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -1,11 +1,11 @@
exclude: "^autometa/validation"
repos:
  - repo: https://github.com/psf/black
-    rev: stable
+    rev: 21.5b2
    hooks:
      - id: black
  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v3.1.0 # Use the ref you want to point at
+    rev: v4.0.1 # Use the ref you want to point at
    hooks:
      - id: trailing-whitespace
      - id: check-executables-have-shebangs
3 changes: 2 additions & 1 deletion Dockerfile
@@ -57,4 +57,5 @@ RUN echo "Checking autometa entrypoints" \
    && autometa-taxonomy-majority-vote -h > /dev/null \
    && autometa-binning -h > /dev/null \
    && autometa-unclustered-recruitment -h > /dev/null \
-    && autometa-binning-summary -h > /dev/null
+    && autometa-binning-summary -h > /dev/null \
+    && autometa-download-dataset -h > /dev/null
4 changes: 3 additions & 1 deletion autometa/binning/summary.py
@@ -323,7 +323,9 @@ def main():
    )
    # Now retrieve stats for each metabin
    metabin_stats_df = get_metabin_stats(
-        bin_df=bin_df, markers_fpath=args.markers, cluster_col=args.binning_column,
+        bin_df=bin_df,
+        markers_fpath=args.markers,
+        cluster_col=args.binning_column,
    )
    metabin_stats_df.to_csv(args.output_stats, sep="\t", index=True, header=True)
    logger.info(f"Wrote metabin stats to {args.output_stats}")
4 changes: 3 additions & 1 deletion autometa/binning/unclustered_recruitment.py
@@ -400,7 +400,9 @@ def get_confidence_filtered_predictions(
        raise NotImplementedError(classifier)

    df = pd.DataFrame(
-        predictions, index=test_data.index, columns=train_data.target_names,
+        predictions,
+        index=test_data.index,
+        columns=train_data.target_names,
    )
    # Filter predictions by confidence threshold
    confidence_threshold = num_classifications * confidence
13 changes: 13 additions & 0 deletions autometa/common/utilities.py
@@ -29,6 +29,7 @@
import logging
import os
import pickle
+import socket
import sys
import tarfile
import time
@@ -444,6 +445,18 @@ def wrapper(*args, **kwds):
    return wrapper


+def internet_is_connected(
+    host: str = "8.8.8.8", port: int = 53, timeout: int = 2
+) -> bool:
+    # google.com
+    try:
+        socket.setdefaulttimeout(timeout)
+        socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect((host, port))
+        return True
+    except socket.error:
+        return False


if __name__ == "__main__":
    print(
        "This file contains utilities for Autometa pipeline and should not be run directly!"
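
For reference, here is a minimal usage sketch (not part of this commit) of the helper added above; the alternate host 1.1.1.1 and the 5-second timeout below are only illustrations of the optional arguments:

# Illustrative only: exercising autometa.common.utilities.internet_is_connected
from autometa.common.utilities import internet_is_connected

# Default probe: TCP connection to 8.8.8.8 on port 53 with a 2-second timeout.
if not internet_is_connected():
    print("No internet connection detected; downloads will likely fail.")

# Any reachable host/port can be probed instead (hypothetical example values).
if internet_is_connected(host="1.1.1.1", port=53, timeout=5):
    print("Connectivity confirmed via 1.1.1.1")
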
14 changes: 2 additions & 12 deletions autometa/config/databases.py
@@ -42,6 +42,7 @@
from autometa.common.utilities import calc_checksum
from autometa.common.utilities import read_checksum
from autometa.common.utilities import write_checksum
+from autometa.common.utilities import internet_is_connected
from autometa.common.exceptions import ChecksumMismatchError
from autometa.common.external import diamond
from autometa.common.external import hmmer
@@ -183,17 +184,6 @@ def satisfied(self, section: str = None, compare_checksums: bool = False) -> bool:
        any_invalid = {}
        return not any_missing and not any_invalid

-    def internet_is_connected(
-        self, host: str = "8.8.8.8", port: int = 53, timeout: int = 2
-    ) -> bool:
-        # google.com
-        try:
-            socket.setdefaulttimeout(timeout)
-            socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect((host, port))
-            return True
-        except socket.error:
-            return False
-
    def get_remote_checksum(self, section: str, option: str) -> str:
        """Get the checksum from provided `section` respective to `option` in
        `self.config`.
@@ -226,7 +216,7 @@ def get_remote_checksum(self, section: str, option: str) -> str:
            raise ValueError(
                f"'section' must be 'ncbi' or 'markers'. Provided: {section}"
            )
-        if not self.internet_is_connected():
+        if not internet_is_connected():
            raise ConnectionError("Cannot connect to the internet")
        if section == "ncbi":
            host = self.config.get(section, "host")
Empty file added autometa/validation/__init__.py
Empty file.
223 changes: 223 additions & 0 deletions autometa/validation/datasets.py
@@ -0,0 +1,223 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
COPYRIGHT
Copyright 2021 Ian J. Miller, Evan R. Rees, Kyle Wolf, Siddharth Uppal,
Shaurya Chanana, Izaak Miller, Jason C. Kwan
This file is part of Autometa.
Autometa is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Autometa is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with Autometa. If not, see <http://www.gnu.org/licenses/>.
COPYRIGHT
pulling data from google drive dataset with simulated or synthetic communities
"""


import gdown
import os
import sys
import logging
import pandas as pd

from autometa.common.utilities import internet_is_connected

logger = logging.getLogger(__name__)


def download(
    community_type: str, community_sizes: list, file_names: list, dir_path: str
) -> None:

    """Downloads the files specified in a dictionary.
    Parameters
    ----------
    community_type : str
        specifies the type of dataset that the user would like to download from
    community_sizes : list
        specifies the size of dataset that the user would like to download
    file_names : list
        specifies the file(s) that the user would like to download
    dir_path : str
        output path where the user wants to download the file(s)
    Returns
    -------
    None
        download is completed through gdown
    """

    if community_type == "synthetic" or community_type == "all":
        raise NotImplementedError

    # points to csv file on google drive
    df = pd.read_csv(
        "https://drive.google.com/uc?id=148fUO7jocoNOBUl2K4bCfjsbd42QxCzX",
        dtype=str,
        index_col=["dataset", "file"],
    )

    for community_size in community_sizes:
        community_size_outdir = os.path.join(dir_path, community_size)
        # make a new directory
        if not os.path.exists(community_size_outdir):
            os.makedirs(community_size_outdir)

        for file_name in file_names:
            file_id = df.loc[(community_size, file_name), "file_id"]
            file_id_filepath = os.path.join(community_size_outdir, file_name)
            url = f"https://drive.google.com/uc?id={file_id}"

            gdown.download(url, file_id_filepath)


def main():
    import argparse
    import logging as logger

    logger.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logger.DEBUG,
    )

    parser = argparse.ArgumentParser(
        description="Download a simulated community file from google drive to a specified output directory"
    )
    parser.add_argument(
        "--community-type",
        help="specify synthetic or simulated communities (currently only simulated is available)",
        choices=[
            "synthetic",
            "simulated",
            "all",
        ],
        required=True,
    )
    parser.add_argument(
        "--community-sizes",
        help="specify a community size to download from",
        choices=[
            "78Mbp",
            "156Mbp",
            "312Mbp",
            "625Mbp",
            "1250Mbp",
            "2500Mbp",
            "5000Mbp",
            "10000Mbp",
            "all",
        ],
        required=True,
        nargs="+",
    )
    parser.add_argument(
        "--file-names",
        help="specify a file name to download",
        choices=[
            "README.md",
            "reference_assignments.tsv.gz",
            "metagenome.fna.gz",
            "master.tsv.gz",
            "control_reads.tsv.gz",
            "control_contigs.tsv.gz",
            "unclustered_recruitment.tsv.gz",
            "binning.tsv.gz",
            "taxonomy.tsv.gz",
            "lengths.tsv.gz",
            "coverages.tsv.gz",
            "gc_content.tsv.gz",
            "kmers.embedded.tsv.gz",
            "kmers.tsv.gz",
            "markers.tsv.gz",
            "Bacteria.fna.gz",
            "orfs.faa.gz",
            "metagenome.filtered.fna.gz",
            "hmmscan.tsv.gz",
            "forward_reads.fastq.gz",
            "reverse_reads.fastq.gz",
            "all",
        ],
        nargs="+",
        required=True,
    )
    parser.add_argument(
        "--dir-path",
        help="specify a folder to start the download (several directories will be generated within this folder)",
        required=True,
    )
    parser.add_argument(
        "--host",
        help="IP address to ping when checking internet connectivity. Note: Will attempt to connect to port 53 on host address (Default is google.com)",
        default="8.8.8.8",
    )
    args = parser.parse_args()

    if "all" in args.community_sizes:
        community_sizes = (
            "78Mbp",
            "156Mbp",
            "312Mbp",
            "625Mbp",
            "1250Mbp",
            "2500Mbp",
            "5000Mbp",
            "10000Mbp",
        )
    else:
        community_sizes = args.community_sizes
    if "all" in args.file_names:
        file_names = (
            "README.md",
            "reference_assignments.tsv.gz",
            "metagenome.fna.gz",
            "master.tsv.gz",
            "control_reads.tsv.gz",
            "control_contigs.tsv.gz",
            "unclustered_recruitment.tsv.gz",
            "binning.tsv.gz",
            "taxonomy.tsv.gz",
            "lengths.tsv.gz",
            "coverages.tsv.gz",
            "gc_content.tsv.gz",
            "kmers.embedded.tsv.gz",
            "kmers.tsv.gz",
            "markers.tsv.gz",
            "Bacteria.fna.gz",
            "orfs.faa.gz",
            "metagenome.filtered.fna.gz",
            "hmmscan.tsv.gz",
            "forward_reads.fastq.gz",
            "reverse_reads.fastq.gz",
        )
    else:
        file_names = args.file_names

    if not internet_is_connected(host=args.host):
        logger.error(
            "No internet connection detected (couldn't ping google.com at IP 8.8.8.8). Please confirm connection. Downloader will still attempt to run. (Ping a custom IP address with --host argument)"
        )

    download(
        community_type=args.community_type,
        community_sizes=community_sizes,
        file_names=file_names,
        dir_path=args.dir_path,
    )


if __name__ == "__main__":
    main()
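
As a usage illustration (not part of this commit), the new module can be driven either through the autometa-download-dataset entry point registered in setup.py below, e.g. autometa-download-dataset --community-type simulated --community-sizes 78Mbp --file-names metagenome.fna.gz --dir-path downloads, or called directly from Python; the output directory name used here is only a placeholder:

# Illustrative sketch of calling the downloader directly ("downloads" is a placeholder path).
from autometa.validation.datasets import download

download(
    community_type="simulated",  # "synthetic" and "all" currently raise NotImplementedError
    community_sizes=["78Mbp"],  # any of the sizes accepted by --community-sizes
    file_names=["README.md", "metagenome.fna.gz"],
    dir_path="downloads",  # files are written to downloads/78Mbp/
)
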
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -20,7 +20,7 @@
autodoc_mock_imports = ["Bio", "hdbscan", "tsne", "sklearn", "umap", "tqdm"]

# fmt: off
-import parse_argparse
+import parse_argparse

# -- Project information -----------------------------------------------------

1 change: 1 addition & 0 deletions setup.py
@@ -37,6 +37,7 @@ def read(fname):
            "autometa-binning = autometa.binning.recursive_dbscan:main",
            "autometa-unclustered-recruitment = autometa.binning.unclustered_recruitment:main",
            "autometa-binning-summary = autometa.binning.summary:main",
+            "autometa-download-dataset = autometa.validation.datasets:main",
        ]
    },
    author="Jason C. Kwan",
3 changes: 2 additions & 1 deletion tests/unit_tests/test_summary.py
@@ -150,7 +150,8 @@ def return_metabin_taxonomies(*args, **kwargs):

@pytest.mark.skip
def test_get_metabin_taxonomies(
-    mock_rank_taxids, bin_df,
+    mock_rank_taxids,
+    bin_df,
):
    mock_ncbi = return_mock_ncbi()
    df = summary.get_metabin_taxonomies(bin_df=bin_df, ncbi=mock_ncbi)