Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add feature to download google drive datasets #138

Merged
merged 37 commits into from
Aug 5, 2021
Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
237d4a9
Add feature to download google drive datasets
ajlail98 Mar 25, 2021
c6fd46d
Add gdown to requirements.txt
ajlail98 Mar 25, 2021
751bd7c
:art: pulling from upstream dev
ajlail98 Apr 29, 2021
b65d14e
:art: Formatted script according to template, renamed variables
ajlail98 May 25, 2021
d7afab6
:art: Changed permissions
ajlail98 May 25, 2021
0f35181
:art: Added unique filenames for each file size
ajlail98 May 25, 2021
fbb6c73
:art: Moved to external folder
ajlail98 May 25, 2021
72f0c99
Moved script to validation and renamed
ajlail98 Jun 4, 2021
bd92f5e
Rename function and add type hints
ajlail98 Jun 4, 2021
ad70968
Add file containing fileIDs to reference
ajlail98 Jun 4, 2021
b7df5f5
Add user input options for files/folders
ajlail98 Jun 9, 2021
0abe3a6
Reformat with black
ajlail98 Jun 9, 2021
df63e97
Change targets variable name
ajlail98 Jun 10, 2021
79484a5
Change "folder" to "dataset"
ajlail98 Jun 10, 2021
662d5bf
Update column names
ajlail98 Jun 10, 2021
7678155
Condense logic into one function
ajlail98 Jun 11, 2021
3ffd397
Change logic to input multiple files and multiple output dirs
ajlail98 Jun 11, 2021
46eafc2
Add logger warnings
ajlail98 Jun 15, 2021
d21f825
Add datasets.py info to setup.py
ajlail98 Jun 15, 2021
54d151d
Change internet_is_connected into an import
ajlail98 Jun 24, 2021
3dd9e63
Add internet connection checker and error message
ajlail98 Jun 24, 2021
2a45ab2
Directory structure to organize downloads
ajlail98 Jul 13, 2021
b7c2048
Change variable names and clean up extra bits
ajlail98 Jul 13, 2021
9a932d5
Add __init__.py to validation
ajlail98 Jul 13, 2021
98e356b
Add error for non-existent dir_path
ajlail98 Jul 13, 2021
0d1274b
Add detail to internet_is_connected failure
ajlail98 Jul 14, 2021
7af3c95
Added NotImplementedError
ajlail98 Jul 16, 2021
df317b0
Only read csv once
ajlail98 Jul 16, 2021
85c9387
Change strategy for filtering df
ajlail98 Jul 16, 2021
12afe4b
Using df.loc to retrieve file_id
ajlail98 Jul 16, 2021
e7da939
Argparse and var name refinements
ajlail98 Jul 16, 2021
dceb0f5
Add ability to ping custom IP
ajlail98 Jul 20, 2021
622d934
Reformatting
ajlail98 Jul 20, 2021
ac89c06
Hardcode fileID csv hosted on google drive
ajlail98 Jul 22, 2021
af931bb
Reformatting
ajlail98 Jul 22, 2021
33f75e1
Remove gdown_fileIDs.csv
ajlail98 Jul 22, 2021
7fdf590
Add verbose error message and dockerfile entrypoint
ajlail98 Jul 30, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
exclude: "^autometa/validation"
repos:
- repo: https://github.com/psf/black
rev: stable
rev: 21.5b2
hooks:
- id: black
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.1.0 # Use the ref you want to point at
rev: v4.0.1 # Use the ref you want to point at
hooks:
- id: trailing-whitespace
- id: check-executables-have-shebangs
Expand Down
4 changes: 3 additions & 1 deletion autometa/binning/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,9 @@ def main():
)
# Now retrieve stats for each metabin
metabin_stats_df = get_metabin_stats(
bin_df=bin_df, markers_fpath=args.markers, cluster_col=args.binning_column,
bin_df=bin_df,
markers_fpath=args.markers,
cluster_col=args.binning_column,
)
metabin_stats_df.to_csv(args.output_stats, sep="\t", index=True, header=True)
logger.info(f"Wrote metabin stats to {args.output_stats}")
Expand Down
4 changes: 3 additions & 1 deletion autometa/binning/unclustered_recruitment.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,9 @@ def get_confidence_filtered_predictions(
raise NotImplementedError(classifier)

df = pd.DataFrame(
predictions, index=test_data.index, columns=train_data.target_names,
predictions,
index=test_data.index,
columns=train_data.target_names,
)
# Filter predictions by confidence threshold
chasemc marked this conversation as resolved.
Show resolved Hide resolved
confidence_threshold = num_classifications * confidence
Expand Down
13 changes: 13 additions & 0 deletions autometa/common/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import logging
import os
import pickle
import socket
import sys
import tarfile
import time
Expand Down Expand Up @@ -444,6 +445,18 @@ def wrapper(*args, **kwds):
return wrapper


def internet_is_connected(
    host: str = "8.8.8.8", port: int = 53, timeout: int = 2
) -> bool:
    """Check for a working internet connection by opening a TCP connection.

    Parameters
    ----------
    host : str, optional
        IP address to connect to. Default is "8.8.8.8" (Google public DNS,
        chosen for its high availability).
    port : int, optional
        TCP port to connect on. Default is 53 (DNS).
    timeout : int, optional
        Seconds to wait for the connection before giving up. Default is 2.

    Returns
    -------
    bool
        True if the connection succeeded, False on any socket error
        (refused, unreachable, timed out, resolution failure).
    """
    try:
        # create_connection applies the timeout to this one probe (the old
        # setdefaulttimeout mutated process-global state), and the `with`
        # block closes the socket (previously it was never closed).
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        # socket.error is an alias of OSError.
        return False


if __name__ == "__main__":
print(
"This file contains utilities for Autometa pipeline and should not be run directly!"
Expand Down
14 changes: 2 additions & 12 deletions autometa/config/databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from autometa.common.utilities import calc_checksum
from autometa.common.utilities import read_checksum
from autometa.common.utilities import write_checksum
from autometa.common.utilities import internet_is_connected
from autometa.common.exceptions import ChecksumMismatchError
from autometa.common.external import diamond
from autometa.common.external import hmmer
Expand Down Expand Up @@ -183,17 +184,6 @@ def satisfied(self, section: str = None, compare_checksums: bool = False) -> boo
any_invalid = {}
return not any_missing and not any_invalid

def internet_is_connected(
self, host: str = "8.8.8.8", port: int = 53, timeout: int = 2
) -> bool:
# google.com
try:
socket.setdefaulttimeout(timeout)
socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect((host, port))
return True
except socket.error:
return False

def get_remote_checksum(self, section: str, option: str) -> str:
"""Get the checksum from provided `section` respective to `option` in
`self.config`.
Expand Down Expand Up @@ -226,7 +216,7 @@ def get_remote_checksum(self, section: str, option: str) -> str:
raise ValueError(
f"'section' must be 'ncbi' or 'markers'. Provided: {section}"
)
if not self.internet_is_connected():
if not internet_is_connected():
raise ConnectionError("Cannot connect to the internet")
if section == "ncbi":
host = self.config.get(section, "host")
Expand Down
Empty file added autometa/validation/__init__.py
Empty file.
217 changes: 217 additions & 0 deletions autometa/validation/datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
COPYRIGHT
Copyright 2021 Ian J. Miller, Evan R. Rees, Kyle Wolf, Siddharth Uppal,
Shaurya Chanana, Izaak Miller, Jason C. Kwan

This file is part of Autometa.

Autometa is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Autometa is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with Autometa. If not, see <http://www.gnu.org/licenses/>.
COPYRIGHT

pulling data from google drive dataset with simulated or synthetic communities
"""


import gdown
import os
import sys
import logging
import pandas as pd

from autometa.common.utilities import internet_is_connected

logger = logging.getLogger(__name__)


def download(
    community_type: str, community_sizes: list, file_names: list, dir_path: str
) -> None:
    """Download simulated-community dataset files from Google Drive via gdown.

    Parameters
    ----------
    community_type : str
        Type of dataset to download; only "simulated" is implemented.
    community_sizes : list
        Community sizes (e.g. "78Mbp") to download. One subdirectory of
        `dir_path` is created per size.
    file_names : list
        File names to download into each community-size subdirectory.
    dir_path : str
        Output directory under which per-size subdirectories are created.

    Returns
    -------
    None
        Files are written to disk by gdown.

    Raises
    ------
    NotImplementedError
        If `community_type` is "synthetic" or "all" (not yet supported).
    KeyError
        If a (size, file) pair is missing from the file-ID table.
    """
    if community_type in ("synthetic", "all"):
        raise NotImplementedError

    # Table mapping (dataset size, file name) -> Google Drive file_id.
    # TODO(review): this reads gdown_fileIDs.csv from the current working
    # directory — consider pointing at a stable hosted URL instead so the
    # script works regardless of where it is invoked from.
    df = pd.read_csv("gdown_fileIDs.csv", dtype=str, index_col=["dataset", "file"])

    for community_size in community_sizes:
        community_size_outdir = os.path.join(dir_path, community_size)
        # exist_ok avoids the check-then-create race of the previous
        # `if not os.path.exists(...)` guard.
        os.makedirs(community_size_outdir, exist_ok=True)

        for file_name in file_names:
            file_id = df.loc[(community_size, file_name), "file_id"]
            file_id_filepath = os.path.join(community_size_outdir, file_name)
            url = f"https://drive.google.com/uc?id={file_id}"

            gdown.download(url, file_id_filepath)


def main():
    import argparse
    import logging as logger

    logger.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logger.DEBUG,
    )

    # Single source of truth for the valid sizes/files: used both for the
    # argparse `choices` and for expanding the "all" shortcut below
    # (previously these lists were duplicated verbatim in both places).
    community_size_options = (
        "78Mbp",
        "156Mbp",
        "312Mbp",
        "625Mbp",
        "1250Mbp",
        "2500Mbp",
        "5000Mbp",
        "10000Mbp",
    )
    file_name_options = (
        "README.md",
        "reference_assignments.tsv.gz",
        "metagenome.fna.gz",
        "master.tsv.gz",
        "control_reads.tsv.gz",
        "control_contigs.tsv.gz",
        "unclustered_recruitment.tsv.gz",
        "binning.tsv.gz",
        "taxonomy.tsv.gz",
        "lengths.tsv.gz",
        "coverages.tsv.gz",
        "gc_content.tsv.gz",
        "kmers.embedded.tsv.gz",
        "kmers.tsv.gz",
        "markers.tsv.gz",
        "Bacteria.fna.gz",
        "orfs.faa.gz",
        "metagenome.filtered.fna.gz",
        "hmmscan.tsv.gz",
        "forward_reads.fastq.gz",
        "reverse_reads.fastq.gz",
    )

    parser = argparse.ArgumentParser(
        description="Download a simulated community file from google drive to a specified output directory"
    )
    parser.add_argument(
        "--community-type",
        help="specify synthetic or simulated communities (currently only simulated is available)",
        choices=[
            "synthetic",
            "simulated",
            "all",
        ],
        required=True,
    )
    parser.add_argument(
        "--community-sizes",
        help="specify a community size to download from",
        choices=[*community_size_options, "all"],
        required=True,
        nargs="+",
    )
    parser.add_argument(
        "--file-names",
        help="specify a file name to download",
        choices=[*file_name_options, "all"],
        nargs="+",
        required=True,
    )
    parser.add_argument(
        "--dir-path",
        help="specify a folder to start the download (several directories will be generated within this folder)",
        required=True,
    )
    parser.add_argument(
        "--host",
        help="IP address to ping when checking internet connectivity. Note: Will attempt to connect to port 53 on host address (Default is google.com)",
        default="8.8.8.8",
    )
    args = parser.parse_args()

    # "all" expands to the full set of options defined above.
    if "all" in args.community_sizes:
        community_sizes = community_size_options
    else:
        community_sizes = args.community_sizes
    if "all" in args.file_names:
        file_names = file_name_options
    else:
        file_names = args.file_names

    # Best-effort connectivity check: warn but still attempt the download.
    if not internet_is_connected(host=args.host):
        logger.error(
            "No internet connection detected. Please confirm connection. Downloader will still attempt to run. (Ping a custom IP address with --host argument)"
        )

    download(
        community_type=args.community_type,
        community_sizes=community_sizes,
        file_names=file_names,
        dir_path=args.dir_path,
    )


if __name__ == "__main__":
    main()
Loading