Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add feature to download google drive datasets #138

Merged
merged 37 commits into from
Aug 5, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
237d4a9
Add feature to download google drive datasets
ajlail98 Mar 25, 2021
c6fd46d
Add gdown to requirements.txt
ajlail98 Mar 25, 2021
751bd7c
:art: pulling from upstream dev
ajlail98 Apr 29, 2021
b65d14e
:art: Formatted script according to template, renamed variables
ajlail98 May 25, 2021
d7afab6
:art: Changed permissions
ajlail98 May 25, 2021
0f35181
:art: Added unique filenames for each file size
ajlail98 May 25, 2021
fbb6c73
:art: Moved to external folder
ajlail98 May 25, 2021
72f0c99
Moved script to validation and renamed
ajlail98 Jun 4, 2021
bd92f5e
Rename function and add type hints
ajlail98 Jun 4, 2021
ad70968
Add file containing fileIDs to reference
ajlail98 Jun 4, 2021
b7df5f5
Add user input options for files/folders
ajlail98 Jun 9, 2021
0abe3a6
Reformat with black
ajlail98 Jun 9, 2021
df63e97
Change targets variable name
ajlail98 Jun 10, 2021
79484a5
Change "folder" to "dataset"
ajlail98 Jun 10, 2021
662d5bf
Update column names
ajlail98 Jun 10, 2021
7678155
Condense logic into one function
ajlail98 Jun 11, 2021
3ffd397
Change logic to input multiple files and multiple output dirs
ajlail98 Jun 11, 2021
46eafc2
Add logger warnings
ajlail98 Jun 15, 2021
d21f825
Add datasets.py info to setup.py
ajlail98 Jun 15, 2021
54d151d
Change internet_is_connected into an import
ajlail98 Jun 24, 2021
3dd9e63
Add internet connection checker and error message
ajlail98 Jun 24, 2021
2a45ab2
Directory structure to organize downloads
ajlail98 Jul 13, 2021
b7c2048
Change variable names and clean up extra bits
ajlail98 Jul 13, 2021
9a932d5
Add __init__.py to validation
ajlail98 Jul 13, 2021
98e356b
Add error for non-existent dir_path
ajlail98 Jul 13, 2021
0d1274b
Add detail to internet_is_connected failure
ajlail98 Jul 14, 2021
7af3c95
Added NotImplementedError
ajlail98 Jul 16, 2021
df317b0
Only read csv once
ajlail98 Jul 16, 2021
85c9387
Change strategy for filtering df
ajlail98 Jul 16, 2021
12afe4b
Using df.loc to retrieve file_id
ajlail98 Jul 16, 2021
e7da939
Argparse and var name refinements
ajlail98 Jul 16, 2021
dceb0f5
Add ability to ping custom IP
ajlail98 Jul 20, 2021
622d934
Reformatting
ajlail98 Jul 20, 2021
ac89c06
Hardcode fileID csv hosted on google drive
ajlail98 Jul 22, 2021
af931bb
Reformatting
ajlail98 Jul 22, 2021
33f75e1
Remove gdown_fileIDs.csv
ajlail98 Jul 22, 2021
7fdf590
Add verbose error message and dockerfile entrypoint
ajlail98 Jul 30, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion autometa/binning/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,9 @@ def main():
)
# Now retrieve stats for each metabin
metabin_stats_df = get_metabin_stats(
bin_df=bin_df, markers_fpath=args.markers, cluster_col=args.binning_column,
bin_df=bin_df,
markers_fpath=args.markers,
cluster_col=args.binning_column,
)
metabin_stats_df.to_csv(args.output_stats, sep="\t", index=True, header=True)
logger.info(f"Wrote metabin stats to {args.output_stats}")
Expand Down
4 changes: 3 additions & 1 deletion autometa/binning/unclustered_recruitment.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,9 @@ def get_confidence_filtered_predictions(
raise NotImplementedError(classifier)

df = pd.DataFrame(
predictions, index=test_data.index, columns=train_data.target_names,
predictions,
index=test_data.index,
columns=train_data.target_names,
)
# Filter predictions by confidence threshold
chasemc marked this conversation as resolved.
Show resolved Hide resolved
confidence_threshold = num_classifications * confidence
Expand Down
90 changes: 90 additions & 0 deletions autometa/common/external/download_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
COPYRIGHT
Copyright 2021 Ian J. Miller, Evan R. Rees, Kyle Wolf, Siddharth Uppal,
Shaurya Chanana, Izaak Miller, Jason C. Kwan

This file is part of Autometa.

Autometa is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Autometa is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with Autometa. If not, see <http://www.gnu.org/licenses/>.
COPYRIGHT

pulling data from google drive folder with simulated or synthetic communities
"""


import gdown
import os
import logging

logger = logging.getLogger(__name__)


def download_dataset(dataset: str, out_dirpath: str) -> None:
    """Download a simulated community metagenome from Google Drive.

    The file is written to ``<out_dirpath>/<dataset>_metagenome.fna.gz``.

    Parameters
    ----------
    dataset : str
        Size of the simulated community in megabase pairs, or "test".
        One of {"test", "78", "156", "312", "625", "1250", "2500",
        "5000", "10000"}.
    out_dirpath : str
        Directory in which to place the downloaded file.
        NOTE(review): presumably this directory must already exist —
        ``gdown`` is handed the joined path directly; confirm behavior
        for a missing directory.

    Raises
    ------
    KeyError
        If `dataset` is not one of the recognized dataset keys.
    """
    # Google Drive file IDs for each available simulated community,
    # keyed by community size (Mbp).
    simulated = {
        "test": "1fy3M7RnS_HGSQVKidCy-rAwXuxldyOOv",
        "78": "15CB8rmQaHTGy7gWtZedfBJkrwr51bb2y",
        "156": "13bkwFBIUhdWVWlAmVCimDODWF-7tRxgI",
        "312": "1qyAu-m6NCNuVlDFFC10waOD28j15yfV-",
        "625": "1FgMXSD50ggu0UJbZd1PM_AvLt-E7gJix",
        "1250": "1KoxwxBAYcz8Xz9H2v17N9CHOZ-WXWS5m",
        "2500": "1wKZytjC4zjTuhHdNUyAT6wVbuDDIwk2m",
        "5000": "1IX6vLfBptPxhL44dLa6jePs-GRw2XJ3S",
        "10000": "1ON2vxEWC5FHyyPqlfZ0znMgnQ1fTirqG",
    }

    # Fail early with an actionable message instead of a bare KeyError
    # (same exception type as before, so callers' except clauses still work).
    if dataset not in simulated:
        raise KeyError(
            f"{dataset!r} is not an available dataset. "
            f"Choose from: {', '.join(simulated)}"
        )

    # Construct the file id into a url to hand off to gdown.
    file_id = simulated[dataset]
    url = f"https://drive.google.com/uc?id={file_id}"
    filename = f"{dataset}_metagenome.fna.gz"
    out_fpath = os.path.join(out_dirpath, filename)

    # Download the specified file with gdown.
    logger.debug(f"Downloading {url} to {out_fpath}")
    gdown.download(url, out_fpath)


def main():
    """Command-line entry point: parse arguments and download the requested dataset."""
    import argparse

    # Configure logging here rather than at import time so that importing
    # this module as a library does not reconfigure the root logger.
    # (Uses the module-level `logging` import; the previous
    # `import logging as logger` shadowed the module-level Logger instance.)
    logging.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logging.DEBUG,
    )

    parser = argparse.ArgumentParser(
        prog="autometa-download-dataset",
        description="Download a simulated community file from google drive to a specified directory",
    )
    # Choices mirror the keys of the file-id table in download_dataset.
    parser.add_argument(
        "--dataset",
        help="specify a size of simulated community in megabase pairs",
        choices=["78", "156", "312", "625", "1250", "2500", "5000", "10000", "test"],
        required=True,
    )
    parser.add_argument(
        "--out_dirpath",
        help="specify the directory to download the file",
        required=True,
    )
    args = parser.parse_args()

    download_dataset(args.dataset, args.out_dirpath)


if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
autodoc_mock_imports = ["Bio", "hdbscan", "tsne", "sklearn", "umap", "tqdm"]

# fmt: off
import parse_argparse
import parse_argparse

# -- Project information -----------------------------------------------------

Expand Down
3 changes: 2 additions & 1 deletion tests/unit_tests/test_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,8 @@ def return_metabin_taxonomies(*args, **kwargs):

@pytest.mark.skip
def test_get_metabin_taxonomies(
mock_rank_taxids, bin_df,
mock_rank_taxids,
bin_df,
):
mock_ncbi = return_mock_ncbi()
chasemc marked this conversation as resolved.
Show resolved Hide resolved
df = summary.get_metabin_taxonomies(bin_df=bin_df, ncbi=mock_ncbi)
Expand Down
36 changes: 29 additions & 7 deletions tests/unit_tests/test_vote.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,30 +117,43 @@ def test_add_ranks(ncbi, votes, tmp_path):
@pytest.mark.skip
def test_vote_assign(blastp, ncbi_dir, prot_orfs, tmp_path):
out = tmp_path / "votes.tsv"
votes = vote.assign(out=out, prot_orfs=prot_orfs, blast=blastp, ncbi_dir=ncbi_dir,)
votes = vote.assign(
out=out,
prot_orfs=prot_orfs,
blast=blastp,
ncbi_dir=ncbi_dir,
)
assert isinstance(votes, pd.DataFrame)
assert votes.index.name == "contig"
assert "taxid" in votes.columns


def test_get(ncbi, votes_fpath):
df = vote.get(filepath_or_dataframe=votes_fpath, kingdom="bacteria", ncbi=ncbi,)
df = vote.get(
filepath_or_dataframe=votes_fpath,
kingdom="bacteria",
ncbi=ncbi,
)
# canonical ranks should have been added to table if they were not already in place.
assert df.shape == (2, 8)


def test_get_none_recovered(ncbi, votes_fpath):
with pytest.raises(KeyError):
vote.get(
filepath_or_dataframe=votes_fpath, kingdom="archaea", ncbi=ncbi,
filepath_or_dataframe=votes_fpath,
kingdom="archaea",
ncbi=ncbi,
)


def test_get_empty_votes(ncbi_dir, tmp_path):
fpath = tmp_path / "votes.tsv"
with pytest.raises(FileNotFoundError):
vote.get(
filepath_or_dataframe=fpath, kingdom="archaea", ncbi=ncbi_dir,
filepath_or_dataframe=fpath,
kingdom="archaea",
ncbi=ncbi_dir,
)


Expand All @@ -153,13 +166,19 @@ def return_df(*args, **kwargs):
monkeypatch.setattr(vote, "add_ranks", return_df, raising=True)
with pytest.raises(TableFormatError):
vote.get(
filepath_or_dataframe=fpath, kingdom="archaea", ncbi=ncbi,
filepath_or_dataframe=fpath,
kingdom="archaea",
ncbi=ncbi,
)


@pytest.fixture(name="ranks_added_votes", scope="module")
def fixture_ranks_added_votes(votes_fpath, ncbi):
return vote.get(filepath_or_dataframe=votes_fpath, kingdom="bacteria", ncbi=ncbi,)
return vote.get(
filepath_or_dataframe=votes_fpath,
kingdom="bacteria",
ncbi=ncbi,
)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -217,7 +236,10 @@ def test_write_ranks_no_taxonomy_columns(tmp_path, votes):
assembly = dirpath / "assembly.fna"
with pytest.raises(KeyError):
vote.write_ranks(
taxonomy=votes, assembly=assembly, outdir=dirpath, rank="superkingdom",
taxonomy=votes,
assembly=assembly,
outdir=dirpath,
rank="superkingdom",
)

chasemc marked this conversation as resolved.
Show resolved Hide resolved

Expand Down