diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b058c73de..66aa0f6ce 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,11 +1,11 @@
 exclude: "^autometa/validation"
 repos:
   - repo: https://github.com/psf/black
-    rev: stable
+    rev: 21.5b2
     hooks:
       - id: black
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v3.1.0 # Use the ref you want to point at
+    rev: v4.0.1 # Use the ref you want to point at
     hooks:
       - id: trailing-whitespace
       - id: check-executables-have-shebangs
diff --git a/Dockerfile b/Dockerfile
index 4352d49a7..cafefbb28 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -57,4 +57,5 @@ RUN echo "Checking autometa entrypoints" \
     && autometa-taxonomy-majority-vote -h > /dev/null \
     && autometa-binning -h > /dev/null \
     && autometa-unclustered-recruitment -h > /dev/null \
-    && autometa-binning-summary -h > /dev/null
+    && autometa-binning-summary -h > /dev/null \
+    && autometa-download-dataset -h > /dev/null
diff --git a/autometa/binning/summary.py b/autometa/binning/summary.py
index 1ad39ca46..cb4746bc8 100644
--- a/autometa/binning/summary.py
+++ b/autometa/binning/summary.py
@@ -323,7 +323,9 @@ def main():
     )
     # Now retrieve stats for each metabin
     metabin_stats_df = get_metabin_stats(
-        bin_df=bin_df, markers_fpath=args.markers, cluster_col=args.binning_column,
+        bin_df=bin_df,
+        markers_fpath=args.markers,
+        cluster_col=args.binning_column,
     )
     metabin_stats_df.to_csv(args.output_stats, sep="\t", index=True, header=True)
     logger.info(f"Wrote metabin stats to {args.output_stats}")
diff --git a/autometa/binning/unclustered_recruitment.py b/autometa/binning/unclustered_recruitment.py
index 5e5bed120..07cd5c1cf 100644
--- a/autometa/binning/unclustered_recruitment.py
+++ b/autometa/binning/unclustered_recruitment.py
@@ -400,7 +400,9 @@ def get_confidence_filtered_predictions(
         raise NotImplementedError(classifier)
 
     df = pd.DataFrame(
-        predictions, index=test_data.index, columns=train_data.target_names,
+        predictions,
+        index=test_data.index,
+        columns=train_data.target_names,
     )
     # Filter predictions by confidence threshold
     confidence_threshold = num_classifications * confidence
diff --git a/autometa/common/utilities.py b/autometa/common/utilities.py
index c183bf8d4..2cd0dce3b 100644
--- a/autometa/common/utilities.py
+++ b/autometa/common/utilities.py
@@ -29,6 +29,7 @@
 import logging
 import os
 import pickle
+import socket
 import sys
 import tarfile
 import time
@@ -444,6 +445,18 @@ def wrapper(*args, **kwds):
     return wrapper
 
 
+def internet_is_connected(
+    host: str = "8.8.8.8", port: int = 53, timeout: int = 2
+) -> bool:
+    # google.com
+    try:
+        socket.setdefaulttimeout(timeout)
+        socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect((host, port))
+        return True
+    except socket.error:
+        return False
+
+
 if __name__ == "__main__":
     print(
         "This file contains utilities for Autometa pipeline and should not be run directly!"
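For context on the relocated helper: a minimal sketch (not part of the patch) of how internet_is_connected is meant to be called now that it lives in autometa.common.utilities; it mirrors the databases.py change in the next file diff.

# Sketch only: connectivity check via the helper moved into utilities.py above.
from autometa.common.utilities import internet_is_connected

# Defaults probe Google's public DNS (host "8.8.8.8", port 53) with a 2-second timeout.
if not internet_is_connected():
    raise ConnectionError("Cannot connect to the internet")
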
diff --git a/autometa/config/databases.py b/autometa/config/databases.py
index 5848db647..3f9d3771e 100644
--- a/autometa/config/databases.py
+++ b/autometa/config/databases.py
@@ -42,6 +42,7 @@
 from autometa.common.utilities import calc_checksum
 from autometa.common.utilities import read_checksum
 from autometa.common.utilities import write_checksum
+from autometa.common.utilities import internet_is_connected
 from autometa.common.exceptions import ChecksumMismatchError
 from autometa.common.external import diamond
 from autometa.common.external import hmmer
@@ -183,17 +184,6 @@ def satisfied(self, section: str = None, compare_checksums: bool = False) -> boo
             any_invalid = {}
         return not any_missing and not any_invalid
 
-    def internet_is_connected(
-        self, host: str = "8.8.8.8", port: int = 53, timeout: int = 2
-    ) -> bool:
-        # google.com
-        try:
-            socket.setdefaulttimeout(timeout)
-            socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect((host, port))
-            return True
-        except socket.error:
-            return False
-
     def get_remote_checksum(self, section: str, option: str) -> str:
         """Get the checksum from provided `section` respective to `option` in `self.config`.
 
@@ -226,7 +216,7 @@ def get_remote_checksum(self, section: str, option: str) -> str:
             raise ValueError(
                 f"'section' must be 'ncbi' or 'markers'. Provided: {section}"
             )
-        if not self.internet_is_connected():
+        if not internet_is_connected():
             raise ConnectionError("Cannot connect to the internet")
         if section == "ncbi":
             host = self.config.get(section, "host")
diff --git a/autometa/validation/__init__.py b/autometa/validation/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/autometa/validation/datasets.py b/autometa/validation/datasets.py
new file mode 100755
index 000000000..69009618e
--- /dev/null
+++ b/autometa/validation/datasets.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+COPYRIGHT
+Copyright 2021 Ian J. Miller, Evan R. Rees, Kyle Wolf, Siddharth Uppal,
+Shaurya Chanana, Izaak Miller, Jason C. Kwan
+
+This file is part of Autometa.
+
+Autometa is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Autometa is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with Autometa. If not, see <https://www.gnu.org/licenses/>.
+COPYRIGHT
+
+pulling data from google drive dataset with simulated or synthetic communities
+"""
+
+
+import gdown
+import os
+import sys
+import logging
+import pandas as pd
+
+from autometa.common.utilities import internet_is_connected
+
+logger = logging.getLogger(__name__)
+
+
+def download(
+    community_type: str, community_sizes: list, file_names: list, dir_path: str
+) -> None:
+
+    """Downloads the files specified in a dictionary.
+
+    Parameters
+    ----------
+    community_type : str
+        specifies the type of dataset that the user would like to download from
+    community_sizes : list
+        specifies the size of dataset that the user would like to download
+    file_names : list
+        specifies the file(s) that the user would like to download
+    dir_path : str
+        output path where the user wants to download the file(s)
+
+    Returns
+    -------
+    None
+        download is completed through gdown
+
+    """
+
+    if community_type == "synthetic" or community_type == "all":
+        raise NotImplementedError
+
+    # points to csv file on google drive
+    df = pd.read_csv(
+        "https://drive.google.com/uc?id=148fUO7jocoNOBUl2K4bCfjsbd42QxCzX",
+        dtype=str,
+        index_col=["dataset", "file"],
+    )
+
+    for community_size in community_sizes:
+        community_size_outdir = os.path.join(dir_path, community_size)
+        # make a new directory
+        if not os.path.exists(community_size_outdir):
+            os.makedirs(community_size_outdir)
+
+        for file_name in file_names:
+            file_id = df.loc[(community_size, file_name), "file_id"]
+            file_id_filepath = os.path.join(community_size_outdir, file_name)
+            url = f"https://drive.google.com/uc?id={file_id}"
+
+            gdown.download(url, file_id_filepath)
+
+
+def main():
+    import argparse
+    import logging as logger
+
+    logger.basicConfig(
+        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
+        datefmt="%m/%d/%Y %I:%M:%S %p",
+        level=logger.DEBUG,
+    )
+
+    parser = argparse.ArgumentParser(
+        description="Download a simulated community file from google drive to a specified output directory"
+    )
+    parser.add_argument(
+        "--community-type",
+        help="specify synthetic or simulated communities (currently only simulated is available)",
+        choices=[
+            "synthetic",
+            "simulated",
+            "all",
+        ],
+        required=True,
+    )
+    parser.add_argument(
+        "--community-sizes",
+        help="specify a community size to download from",
+        choices=[
+            "78Mbp",
+            "156Mbp",
+            "312Mbp",
+            "625Mbp",
+            "1250Mbp",
+            "2500Mbp",
+            "5000Mbp",
+            "10000Mbp",
+            "all",
+        ],
+        required=True,
+        nargs="+",
+    )
+    parser.add_argument(
+        "--file-names",
+        help="specify a file name to download",
+        choices=[
+            "README.md",
+            "reference_assignments.tsv.gz",
+            "metagenome.fna.gz",
+            "master.tsv.gz",
+            "control_reads.tsv.gz",
+            "control_contigs.tsv.gz",
+            "unclustered_recruitment.tsv.gz",
+            "binning.tsv.gz",
+            "taxonomy.tsv.gz",
+            "lengths.tsv.gz",
+            "coverages.tsv.gz",
+            "gc_content.tsv.gz",
+            "kmers.embedded.tsv.gz",
+            "kmers.tsv.gz",
+            "markers.tsv.gz",
+            "Bacteria.fna.gz",
+            "orfs.faa.gz",
+            "metagenome.filtered.fna.gz",
+            "hmmscan.tsv.gz",
+            "forward_reads.fastq.gz",
+            "reverse_reads.fastq.gz",
+            "all",
+        ],
+        nargs="+",
+        required=True,
+    )
+    parser.add_argument(
+        "--dir-path",
+        help="specify a folder to start the download (several directories will be generated within this folder)",
+        required=True,
+    )
+    parser.add_argument(
+        "--host",
+        help="IP address to ping when checking internet connectivity. Note: Will attempt to connect to port 53 on host address (Default is google.com)",
+        default="8.8.8.8",
+    )
+    args = parser.parse_args()
+
+    if "all" in args.community_sizes:
+        community_sizes = (
+            "78Mbp",
+            "156Mbp",
+            "312Mbp",
+            "625Mbp",
+            "1250Mbp",
+            "2500Mbp",
+            "5000Mbp",
+            "10000Mbp",
+        )
+    else:
+        community_sizes = args.community_sizes
+    if "all" in args.file_names:
+        file_names = (
+            "README.md",
+            "reference_assignments.tsv.gz",
+            "metagenome.fna.gz",
+            "master.tsv.gz",
+            "control_reads.tsv.gz",
+            "control_contigs.tsv.gz",
+            "unclustered_recruitment.tsv.gz",
+            "binning.tsv.gz",
+            "taxonomy.tsv.gz",
+            "lengths.tsv.gz",
+            "coverages.tsv.gz",
+            "gc_content.tsv.gz",
+            "kmers.embedded.tsv.gz",
+            "kmers.tsv.gz",
+            "markers.tsv.gz",
+            "Bacteria.fna.gz",
+            "orfs.faa.gz",
+            "metagenome.filtered.fna.gz",
+            "hmmscan.tsv.gz",
+            "forward_reads.fastq.gz",
+            "reverse_reads.fastq.gz",
+        )
+    else:
+        file_names = args.file_names
+
+    if not internet_is_connected(host=args.host):
+        logger.error(
+            "No internet connection detected (couldn't ping google.com at IP 8.8.8.8). Please confirm connection. Downloader will still attempt to run. (Ping a custom IP address with --host argument)"
+        )
+
+    download(
+        community_type=args.community_type,
+        community_sizes=community_sizes,
+        file_names=file_names,
+        dir_path=args.dir_path,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/source/conf.py b/docs/source/conf.py
index bd9d48d07..97e12cfbb 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -20,7 +20,7 @@
 autodoc_mock_imports = ["Bio", "hdbscan", "tsne", "sklearn", "umap", "tqdm"]
 
 # fmt: off
-import parse_argparse
+import parse_argparse
 
 
 # -- Project information -----------------------------------------------------
diff --git a/setup.py b/setup.py
index 6efd52400..ca52dc499 100644
--- a/setup.py
+++ b/setup.py
@@ -37,6 +37,7 @@ def read(fname):
             "autometa-binning = autometa.binning.recursive_dbscan:main",
             "autometa-unclustered-recruitment = autometa.binning.unclustered_recruitment:main",
             "autometa-binning-summary = autometa.binning.summary:main",
+            "autometa-download-dataset = autometa.validation.datasets:main",
         ]
     },
     author="Jason C. Kwan",
diff --git a/tests/unit_tests/test_summary.py b/tests/unit_tests/test_summary.py
index 81db0e281..14c42cef4 100644
--- a/tests/unit_tests/test_summary.py
+++ b/tests/unit_tests/test_summary.py
@@ -150,7 +150,8 @@ def return_metabin_taxonomies(*args, **kwargs):
 
 @pytest.mark.skip
 def test_get_metabin_taxonomies(
-    mock_rank_taxids, bin_df,
+    mock_rank_taxids,
+    bin_df,
 ):
     mock_ncbi = return_mock_ncbi()
     df = summary.get_metabin_taxonomies(bin_df=bin_df, ncbi=mock_ncbi)
diff --git a/tests/unit_tests/test_vote.py b/tests/unit_tests/test_vote.py
index 87c45cdd6..19c9a943e 100644
--- a/tests/unit_tests/test_vote.py
+++ b/tests/unit_tests/test_vote.py
@@ -117,14 +117,23 @@ def test_add_ranks(ncbi, votes, tmp_path):
 
 @pytest.mark.skip
 def test_vote_assign(blastp, ncbi_dir, prot_orfs, tmp_path):
     out = tmp_path / "votes.tsv"
-    votes = vote.assign(out=out, prot_orfs=prot_orfs, blast=blastp, ncbi_dir=ncbi_dir,)
+    votes = vote.assign(
+        out=out,
+        prot_orfs=prot_orfs,
+        blast=blastp,
+        ncbi_dir=ncbi_dir,
+    )
     assert isinstance(votes, pd.DataFrame)
     assert votes.index.name == "contig"
     assert "taxid" in votes.columns
 
 
 def test_get(ncbi, votes_fpath):
-    df = vote.get(filepath_or_dataframe=votes_fpath, kingdom="bacteria", ncbi=ncbi,)
+    df = vote.get(
+        filepath_or_dataframe=votes_fpath,
+        kingdom="bacteria",
+        ncbi=ncbi,
+    )
     # canonical ranks should have been added to table if they were not already in place.
     assert df.shape == (2, 8)
@@ -132,7 +141,9 @@ def test_get_none_recovered(ncbi, votes_fpath):
     with pytest.raises(KeyError):
         vote.get(
-            filepath_or_dataframe=votes_fpath, kingdom="archaea", ncbi=ncbi,
+            filepath_or_dataframe=votes_fpath,
+            kingdom="archaea",
+            ncbi=ncbi,
         )
 
 
@@ -140,7 +151,9 @@ def test_get_empty_votes(ncbi_dir, tmp_path):
     fpath = tmp_path / "votes.tsv"
     with pytest.raises(FileNotFoundError):
         vote.get(
-            filepath_or_dataframe=fpath, kingdom="archaea", ncbi=ncbi_dir,
+            filepath_or_dataframe=fpath,
+            kingdom="archaea",
+            ncbi=ncbi_dir,
         )
 
 
@@ -153,13 +166,19 @@ def return_df(*args, **kwargs):
     monkeypatch.setattr(vote, "add_ranks", return_df, raising=True)
     with pytest.raises(TableFormatError):
         vote.get(
-            filepath_or_dataframe=fpath, kingdom="archaea", ncbi=ncbi,
+            filepath_or_dataframe=fpath,
+            kingdom="archaea",
+            ncbi=ncbi,
         )
 
 
 @pytest.fixture(name="ranks_added_votes", scope="module")
 def fixture_ranks_added_votes(votes_fpath, ncbi):
-    return vote.get(filepath_or_dataframe=votes_fpath, kingdom="bacteria", ncbi=ncbi,)
+    return vote.get(
+        filepath_or_dataframe=votes_fpath,
+        kingdom="bacteria",
+        ncbi=ncbi,
+    )
 
 
 @pytest.mark.parametrize(
@@ -217,7 +236,10 @@ def test_write_ranks_no_taxonomy_columns(tmp_path, votes):
     assembly = dirpath / "assembly.fna"
     with pytest.raises(KeyError):
         vote.write_ranks(
-            taxonomy=votes, assembly=assembly, outdir=dirpath, rank="superkingdom",
+            taxonomy=votes,
+            assembly=assembly,
+            outdir=dirpath,
+            rank="superkingdom",
         )