diff --git a/goatools/anno/dnld_ebi_goa.py b/goatools/anno/dnld_ebi_goa.py index 3024337..3d68ad4 100644 --- a/goatools/anno/dnld_ebi_goa.py +++ b/goatools/anno/dnld_ebi_goa.py @@ -1,61 +1,66 @@ """Download GOA files from the Gene Ontology Annotation (GOA) resource http://www.ebi.ac.uk/GOA.""" -__copyright__ = "Copyright (C) 2016-present, DV Klopfenstein, H Tang. All rights reserved." +__copyright__ = ( + "Copyright (C) 2016-present, DV Klopfenstein, H Tang. All rights reserved." +) __author__ = "DV Klopfenstein" import os import sys -from goatools.base import dnld_file +from goatools.base import download_file + class DnldGoa: """Download files from the Gene Ontology Annotation (GOA) resource http://www.ebi.ac.uk/GOA.""" # European Bioinformatics Institute (EMBL-EBI) ftp site - ftp_pub = 'ftp://ftp.ebi.ac.uk/pub/' + ftp_pub = "ftp://ftp.ebi.ac.uk/pub/" # Species available from ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/ # Example: https://ftp.ebi.ac.uk/pub/databases/GO/goa/CHICKEN/ species = [ - 'arabidopsis', - 'chicken', - 'cow', - 'dicty', - 'dog', - 'fly', - 'human', - 'mouse', + "arabidopsis", + "chicken", + "cow", + "dicty", + "dog", + "fly", + "human", + "mouse", #'pdb', - 'pig', - 'rat', - 'uniprot', - 'worm', - 'yeast', - 'zebrafish', + "pig", + "rat", + "uniprot", + "worm", + "yeast", + "zebrafish", ] - species_items = ['complex', 'isoform', 'rna'] - exts = ['gaf', 'gpa', 'gpi'] + species_items = ["complex", "isoform", "rna"] + exts = ["gaf", "gpa", "gpi"] def __init__(self): - self.ftp_src_goa = os.path.join(self.ftp_pub, 'databases/GO/goa/') + self.ftp_src_goa = os.path.join(self.ftp_pub, "databases/GO/goa/") - def dnld_goa(self, species, ext='gaf', item=None, fileout=None): + def dnld_goa(self, species, ext="gaf", item=None, fileout=None): """Download GOA source file name on EMBL-EBI ftp server.""" basename = self.get_basename(species, ext, item) - src = os.path.join(self.ftp_src_goa, species.upper(), "{F}.gz".format(F=basename)) + src = os.path.join( + self.ftp_src_goa, species.upper(), "{F}.gz".format(F=basename) + ) dst = os.path.join(os.getcwd(), basename) if fileout is None else fileout - dnld_file(src, dst, prt=sys.stdout, loading_bar=None) + download_file(src, dst, prt=sys.stdout, loading_bar=None) return dst - def get_basename(self, species, ext='gaf', item=None): + def get_basename(self, species, ext="gaf", item=None): """Get GOA basename for a specific species. Ex: goa_human.gaf""" assert ext in self.exts, " ".join(self.exts) - if species == 'uniprot': - species = 'uniprot_all' if item != 'gcrp' else 'uniprot_gcrp' + if species == "uniprot": + species = "uniprot_all" if item != "gcrp" else "uniprot_gcrp" if item is None: - return 'goa_{SPECIES}.{EXT}'.format(SPECIES=species, EXT=ext) + return "goa_{SPECIES}.{EXT}".format(SPECIES=species, EXT=ext) assert item in self.species_items - return 'goa_{SPECIES}_{ITEM}.{EXT}'.format(SPECIES=species, ITEM=item, EXT=ext) + return "goa_{SPECIES}_{ITEM}.{EXT}".format(SPECIES=species, ITEM=item, EXT=ext) # Copyright (C) 2016-present, DV Klopfenstein, H Tang. All rights reserved." diff --git a/goatools/associations.py b/goatools/associations.py index fcf1a3d..fc6b909 100755 --- a/goatools/associations.py +++ b/goatools/associations.py @@ -8,7 +8,7 @@ from collections import defaultdict import os import sys -from goatools.base import dnld_file +from goatools.base import download_file from goatools.base import ftp_get from goatools.anno.factory import get_objanno from goatools.anno.factory import get_anno_desc @@ -19,7 +19,8 @@ from goatools.anno.opts import AnnoOptions from goatools.utils import get_b2aset as utils_get_b2aset -def dnld_assc(assc_name, go2obj=None, namespace='BP', prt=sys.stdout): + +def dnld_assc(assc_name, go2obj=None, namespace="BP", prt=sys.stdout): """Download association from http://geneontology.org/gene-associations.""" # Example assc_name: "tair.gaf" # Download the Association @@ -39,6 +40,7 @@ def dnld_assc(assc_name, go2obj=None, namespace='BP', prt=sys.stdout): assc[gene] = goids_cur.intersection(goids_dag) return assc + def dnld_annotation(assc_file, prt=sys.stdout): """Download gaf, gpad, or gpi from http://current.geneontology.org/annotations/""" if not os.path.isfile(assc_file): @@ -46,31 +48,40 @@ def dnld_annotation(assc_file, prt=sys.stdout): assc_http = "http://current.geneontology.org/annotations/" _, assc_base = os.path.split(assc_file) src = os.path.join(assc_http, "{ASSC}.gz".format(ASSC=assc_base)) - dnld_file(src, assc_file, prt, loading_bar=None) + download_file(src, assc_file, prt, loading_bar=None) -def read_associations(assoc_fn, anno_type='id2gos', namespace='BP', **kws): + +def read_associations(assoc_fn, anno_type="id2gos", namespace="BP", **kws): """Return associatinos in id2gos format""" # kws get_objanno: taxids hdr_only prt allow_missing_symbol obj = get_objanno(assoc_fn, anno_type, **kws) # kws get_id2gos: ev_include ev_exclude keep_ND keep_NOT b_geneid2gos go2geneids return obj.get_id2gos(namespace, **kws) + def get_assoc_ncbi_taxids(taxids, force_dnld=False, loading_bar=True, **kws): """Download NCBI's gene2go. Return annotations for user-specified taxid(s).""" - print('DEPRECATED read_ncbi_gene2go: USE Gene2GoReader FROM goatools.anno.genetogo_reader') + print( + "DEPRECATED read_ncbi_gene2go: USE Gene2GoReader FROM goatools.anno.genetogo_reader" + ) # pylint: disable=protected-access frm = sys._getframe().f_back.f_code - print('DEPRECATED read_ncbi_gene2go CALLED FROM: {PY} BY {FNC}'.format( - PY=frm.co_filename, FNC=frm.co_name)) - fin = kws['gene2go'] if 'gene2go' in kws else os.path.join(os.getcwd(), "gene2go") + print( + "DEPRECATED read_ncbi_gene2go CALLED FROM: {PY} BY {FNC}".format( + PY=frm.co_filename, FNC=frm.co_name + ) + ) + fin = kws["gene2go"] if "gene2go" in kws else os.path.join(os.getcwd(), "gene2go") dnld_ncbi_gene_file(fin, force_dnld, loading_bar=loading_bar) return read_ncbi_gene2go(fin, taxids, **kws) + # pylint: disable=unused-argument def dnld_ncbi_gene_file(fin, force_dnld=False, log=sys.stdout, loading_bar=True): """Download a file from NCBI Gene's ftp server.""" if not os.path.exists(fin) or force_dnld: import gzip + fin_dir, fin_base = os.path.split(fin) fin_gz = "{F}.gz".format(F=fin_base) fin_gz = os.path.join(fin_dir, fin_gz) @@ -84,68 +95,98 @@ def dnld_ncbi_gene_file(fin, force_dnld=False, log=sys.stdout, loading_bar=True) ## wget.download(fin_ftp, bar=loading_bar) ## rsp = wget(fin_ftp) ftp_get(fin_ftp, fin_gz) - with gzip.open(fin_gz, 'rb') as zstrm: + with gzip.open(fin_gz, "rb") as zstrm: if log is not None: log.write("\n READ GZIP: {F}\n".format(F=fin_gz)) - with open(fin, 'wb') as ostrm: + with open(fin, "wb") as ostrm: ostrm.write(zstrm.read()) if log is not None: log.write(" WROTE UNZIPPED: {F}\n".format(F=fin)) + def dnld_annofile(fin_anno, anno_type): """Download annotation file, if needed""" if os.path.exists(fin_anno): return anno_type = get_anno_desc(fin_anno, anno_type) - if anno_type == 'gene2go': + if anno_type == "gene2go": dnld_ncbi_gene_file(fin_anno) - if anno_type in {'gaf', 'gpad'}: + if anno_type in {"gaf", "gpad"}: dnld_annotation(fin_anno) -def read_ncbi_gene2go(fin_gene2go, taxids=None, namespace='BP', **kws): + +def read_ncbi_gene2go(fin_gene2go, taxids=None, namespace="BP", **kws): """Read NCBI's gene2go. Return gene2go data for user-specified taxids.""" - print('DEPRECATED read_ncbi_gene2go: USE Gene2GoReader FROM goatools.anno.genetogo_reader') + print( + "DEPRECATED read_ncbi_gene2go: USE Gene2GoReader FROM goatools.anno.genetogo_reader" + ) # pylint: disable=protected-access frm = sys._getframe().f_back.f_code - print('DEPRECATED read_ncbi_gene2go CALLED FROM: {PY} BY {FNC}'.format( - PY=frm.co_filename, FNC=frm.co_name)) + print( + "DEPRECATED read_ncbi_gene2go CALLED FROM: {PY} BY {FNC}".format( + PY=frm.co_filename, FNC=frm.co_name + ) + ) obj = Gene2GoReader(fin_gene2go, taxids=taxids) # By default, return id2gos. User can cause go2geneids to be returned by: # >>> read_ncbi_gene2go(..., go2geneids=True - if 'taxid2asscs' not in kws: + if "taxid2asscs" not in kws: if len(obj.taxid2asscs) == 1: taxid = next(iter(obj.taxid2asscs)) - kws_ncbi = {k:v for k, v in kws.items() if k in AnnoOptions.keys_exp} - kws_ncbi['taxid'] = taxid + kws_ncbi = {k: v for k, v in kws.items() if k in AnnoOptions.keys_exp} + kws_ncbi["taxid"] = taxid return obj.get_id2gos(namespace, **kws_ncbi) # Optional detailed associations split by taxid and having both ID2GOs & GO2IDs # e.g., taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)) t2asscs_ret = obj.get_taxid2asscs(taxids, **kws) - t2asscs_usr = kws.get('taxid2asscs', defaultdict(lambda: defaultdict(lambda: defaultdict(set)))) - if 'taxid2asscs' in kws: + t2asscs_usr = kws.get( + "taxid2asscs", defaultdict(lambda: defaultdict(lambda: defaultdict(set))) + ) + if "taxid2asscs" in kws: obj.fill_taxid2asscs(t2asscs_usr, t2asscs_ret) return obj.get_id2gos_all(t2asscs_ret) + def get_gaf_hdr(fin_gaf): """Read Gene Association File (GAF). Return GAF version and data info.""" return GafReader(fin_gaf, hdr_only=True).hdr + # pylint: disable=line-too-long -def read_gaf(fin_gaf, prt=sys.stdout, hdr_only=False, namespace='BP', allow_missing_symbol=False, **kws): +def read_gaf( + fin_gaf, + prt=sys.stdout, + hdr_only=False, + namespace="BP", + allow_missing_symbol=False, + **kws +): """Read Gene Association File (GAF). Return data.""" return GafReader( - fin_gaf, hdr_only=hdr_only, prt=prt, allow_missing_symbol=allow_missing_symbol, godag=kws.get('godag')).get_id2gos( - namespace, **kws) + fin_gaf, + hdr_only=hdr_only, + prt=prt, + allow_missing_symbol=allow_missing_symbol, + godag=kws.get("godag"), + ).get_id2gos(namespace, **kws) + def get_b2aset(a2bset): """Given gene2gos, return go2genes. Given go2genes, return gene2gos.""" - print('DEPRECATED get_b2aset MOVED: USE get_b2aset IN goatools.utils') + print("DEPRECATED get_b2aset MOVED: USE get_b2aset IN goatools.utils") # pylint: disable=protected-access frm = sys._getframe().f_back.f_code - print('DEPRECATED get_b2aset CALLED FROM: {PY} BY {FNC}'.format(PY=frm.co_filename, FNC=frm.co_name)) + print( + "DEPRECATED get_b2aset CALLED FROM: {PY} BY {FNC}".format( + PY=frm.co_filename, FNC=frm.co_name + ) + ) return utils_get_b2aset(a2bset) -def get_assc_pruned(assc_geneid2gos, min_genecnt=None, max_genecnt=None, prt=sys.stdout): + +def get_assc_pruned( + assc_geneid2gos, min_genecnt=None, max_genecnt=None, prt=sys.stdout +): """Remove GO IDs associated with large numbers of genes. Used in stochastic simulations.""" # DEFN WAS: get_assc_pruned(assc_geneid2gos, max_genecnt=None, prt=sys.stdout): # ADDED min_genecnt argument and functionality @@ -156,22 +197,27 @@ def get_assc_pruned(assc_geneid2gos, min_genecnt=None, max_genecnt=None, prt=sys go2genes_prun = {} for goid, genes in go2genes_orig.items(): num_genes = len(genes) - if (min_genecnt is None or num_genes >= min_genecnt) and \ - (max_genecnt is None or num_genes <= max_genecnt): + if (min_genecnt is None or num_genes >= min_genecnt) and ( + max_genecnt is None or num_genes <= max_genecnt + ): go2genes_prun[goid] = genes num_was = len(go2genes_orig) num_now = len(go2genes_prun) gos_rm = set(go2genes_orig.keys()).difference(set(go2genes_prun.keys())) - assert num_was-num_now == len(gos_rm) + assert num_was - num_now == len(gos_rm) if prt is not None: if min_genecnt is None: min_genecnt = 1 if max_genecnt is None: max_genecnt = "Max" - prt.write("{N:4} GO IDs pruned. Kept {NOW} GOs assc w/({m} to {M} genes)\n".format( - m=min_genecnt, M=max_genecnt, N=num_was-num_now, NOW=num_now)) + prt.write( + "{N:4} GO IDs pruned. Kept {NOW} GOs assc w/({m} to {M} genes)\n".format( + m=min_genecnt, M=max_genecnt, N=num_was - num_now, NOW=num_now + ) + ) return utils_get_b2aset(go2genes_prun), gos_rm + def read_annotations(**kws): """Read annotations from either a GAF file or NCBI's gene2go file.""" # Read and save annotation lines @@ -179,6 +225,7 @@ def read_annotations(**kws): # Return associations return objanno.get_id2gos(**kws) if objanno is not None else {} + def get_tcntobj(go2obj, **kws): """Return a TermCounts object if the user provides an annotation file, otherwise None.""" # kws: gpad gaf gene2go id2gos diff --git a/goatools/base.py b/goatools/base.py index d7df852..7e3b3ce 100644 --- a/goatools/base.py +++ b/goatools/base.py @@ -1,102 +1,38 @@ """Utilities used in Gene Ontology Enrichment Analyses.""" # Stolen from brentp +import gzip +import logging import os -from os.path import isfile -import os.path as op import sys -import bz2 -import gzip -import urllib +import traceback +import zlib + from ftplib import FTP +from os.path import isfile + import requests +from rich.logging import RichHandler -if sys.version_info[0] < 3: - int_types = (int, long) - urlopen = urllib.urlopen -else: - int_types = (int,) - basestring = str - from urllib.request import urlopen - - -def nopen(f, mode="r"): - r""" - open a file that's gzipped or return stdin for '-' - if f is a number, the result of nopen(sys.argv[f]) is returned. - >>> nopen('-') == sys.stdin, nopen('-', 'w') == sys.stdout - (True, True) - >>> nopen(sys.argv[0]) - <...file...> - # expands user and vars ($HOME) - >>> nopen("~/.bashrc").name == nopen("$HOME/.bashrc").name - True - # an already open file. - >>> nopen(open(sys.argv[0])) - <...file...> - >>> nopen(0) - <...file...> - Or provide nicer access to Popen.stdout - >>> files = list(nopen("|ls")) - >>> assert 'setup.py\n' in files or b'setup.py\n' in files, files - """ - if isinstance(f, int_types): - return nopen(sys.argv[f], mode) - - if not isinstance(f, basestring): - return f - if f.startswith("|"): - # using shell explicitly makes things like process substitution work: - # http://stackoverflow.com/questions/7407667/python-subprocess-subshells-and-redirection - # use sys.stderr so we dont have to worry about checking it... - p = Popen(f[1:], stdout=PIPE, stdin=PIPE, - stderr=sys.stderr if mode == "r" else PIPE, - shell=True, bufsize=-1, # use system default for buffering - preexec_fn=prefunc, - close_fds=False, executable=os.environ.get('SHELL')) - if sys.version_info[0] > 2: - import io - p.stdout = io.TextIOWrapper(p.stdout) - p.stdin = io.TextIOWrapper(p.stdin) - if mode != "r": - p.stderr = io.TextIOWrapper(p.stderr) - - if mode and mode[0] == "r": - return process_iter(p, f[1:]) - return p - - if f.startswith(("http://", "https://", "ftp://")): - fh = urlopen(f) - if f.endswith(".gz"): - return ungzipper(fh) - if sys.version_info[0] < 3: - return fh - import io - return io.TextIOWrapper(fh) - f = op.expanduser(op.expandvars(f)) - if f.endswith((".gz", ".Z", ".z")): - fh = gzip.open(f, mode) - if sys.version_info[0] < 3: - return fh - import io - return io.TextIOWrapper(fh) - elif f.endswith((".bz", ".bz2", ".bzip2")): - fh = bz2.BZ2File(f, mode) - if sys.version_info[0] < 3: - return fh - import io - return io.TextIOWrapper(fh) - - return {"r": sys.stdin, "w": sys.stdout}[mode[0]] if f == "-" \ - else open(f, mode) + +def get_logger(name): + logger = logging.getLogger(name) + if logger.hasHandlers(): + logger.handlers.clear() + logger.addHandler(RichHandler()) + logger.propagate = False + logger.setLevel(logging.INFO) + return logger + + +logger = get_logger("sc_tools") def ungzipper(fh, blocksize=16384): """ work-around to get streaming download of http://.../some.gz """ - import zlib uzip = zlib.decompressobj(16 + zlib.MAX_WBITS) data = uzip.decompress(fh.read(blocksize)).split("\n") @@ -110,7 +46,7 @@ def ungzipper(fh, blocksize=16384): data[0] = save + data[0] -def download_go_basic_obo(obo="go-basic.obo", prt=sys.stdout, loading_bar=True): +def download_go_basic_obo(obo="go-basic.obo", prt=sys.stdout): """Download Ontologies, if necessary.""" if not isfile(obo): http = "http://purl.obolibrary.org/obo/go" @@ -118,25 +54,28 @@ def download_go_basic_obo(obo="go-basic.obo", prt=sys.stdout, loading_bar=True): http = "http://www.geneontology.org/ontology/subsets" # http = 'http://current.geneontology.org/ontology/subsets' obo_remote = "{HTTP}/{OBO}".format(HTTP=http, OBO=os.path.basename(obo)) - dnld_file(obo_remote, obo, prt, loading_bar) + download_file(obo_remote, obo, prt) else: if prt: prt.write(" EXISTS: {FILE}\n".format(FILE=obo)) return obo -def download_ncbi_associations(gene2go="gene2go", prt=sys.stdout, loading_bar=True): + +def download_ncbi_associations(gene2go="gene2go", prt=sys.stdout): """Download associations from NCBI, if necessary""" # Download: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz gzip_file = "{GENE2GO}.gz".format(GENE2GO=gene2go) if not isfile(gene2go): file_remote = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/{GZ}".format( - GZ=os.path.basename(gzip_file)) - dnld_file(file_remote, gene2go, prt, loading_bar) + GZ=os.path.basename(gzip_file) + ) + download_file(file_remote, gene2go, prt) else: if prt is not None: prt.write(" EXISTS: {FILE}\n".format(FILE=gene2go)) return gene2go + def gunzip(gzip_file, file_gunzip=None): """Unzip .gz file. Return filename of unzipped file.""" if file_gunzip is None: @@ -144,17 +83,21 @@ def gunzip(gzip_file, file_gunzip=None): gzip_open_to(gzip_file, file_gunzip) return file_gunzip -def get_godag(fin_obo="go-basic.obo", prt=sys.stdout, loading_bar=True, optional_attrs=None): + +def get_godag(fin_obo="go-basic.obo", prt=sys.stdout, optional_attrs=None): """Return GODag object. Initialize, if necessary.""" - from goatools.obo_parser import GODag - download_go_basic_obo(fin_obo, prt, loading_bar) + from .obo_parser import GODag + + download_go_basic_obo(fin_obo, prt) return GODag(fin_obo, optional_attrs, load_obsolete=False, prt=prt) -def dnld_gaf(species_txt, prt=sys.stdout, loading_bar=True): + +def download_gaf(species_txt, prt=sys.stdout): """Download GAF file if necessary.""" - return dnld_gafs([species_txt], prt, loading_bar)[0] + return download_gafs([species_txt], prt)[0] -def dnld_gafs(species_list, prt=sys.stdout, loading_bar=True): + +def download_gafs(species_list, prt=sys.stdout): """Download GAF files if necessary.""" # Example GAF files in http://current.geneontology.org/annotations/: # http://current.geneontology.org/annotations/mgi.gaf.gz @@ -164,58 +107,62 @@ def dnld_gafs(species_list, prt=sys.stdout, loading_bar=True): # There are two filename patterns for gene associations on geneontology.org fin_gafs = [] cwd = os.getcwd() - for species_txt in species_list: # e.g., goa_human mgi fb - gaf_base = '{ABC}.gaf'.format(ABC=species_txt) # goa_human.gaf - gaf_cwd = os.path.join(cwd, gaf_base) # {CWD}/goa_human.gaf + for species_txt in species_list: # e.g., goa_human mgi fb + gaf_base = "{ABC}.gaf".format(ABC=species_txt) # goa_human.gaf + gaf_cwd = os.path.join(cwd, gaf_base) # {CWD}/goa_human.gaf remove_filename = "{HTTP}/{GAF}.gz".format(HTTP=http, GAF=gaf_base) - dnld_file(remove_filename, gaf_cwd, prt, loading_bar) + download_file(remove_filename, gaf_cwd, prt) fin_gafs.append(gaf_cwd) return fin_gafs + def http_get(url, fout=None): """Download a file from http. Save it in a file named by fout""" - print('requests.get({URL}, stream=True)'.format(URL=url)) + print("requests.get({URL}, stream=True)".format(URL=url)) rsp = requests.get(url, stream=True) if rsp.status_code == 200 and fout is not None: - with open(fout, 'wb') as prt: + with open(fout, "wb") as prt: for chunk in rsp: # .iter_content(chunk_size=128): prt.write(chunk) - print(' WROTE: {F}\n'.format(F=fout)) + print(" WROTE: {F}\n".format(F=fout)) else: print(rsp.status_code, rsp.reason, url) print(rsp.content) return rsp + def ftp_get(fin_src, fout): """Download a file from an ftp server""" - assert fin_src[:6] == 'ftp://', fin_src + assert fin_src[:6] == "ftp://", fin_src dir_full, fin_ftp = os.path.split(fin_src[6:]) - pt0 = dir_full.find('/') + pt0 = dir_full.find("/") assert pt0 != -1, pt0 ftphost = dir_full[:pt0] - chg_dir = dir_full[pt0+1:] - print('FTP RETR {HOST} {DIR} {SRC} -> {DST}'.format( - HOST=ftphost, DIR=chg_dir, SRC=fin_ftp, DST=fout)) + chg_dir = dir_full[pt0 + 1 :] + print( + "FTP RETR {HOST} {DIR} {SRC} -> {DST}".format( + HOST=ftphost, DIR=chg_dir, SRC=fin_ftp, DST=fout + ) + ) ftp = FTP(ftphost) # connect to host, default port ftp.ncbi.nlm.nih.gov - ftp.login() # user anonymous, passwd anonymous@ - ftp.cwd(chg_dir) # change into "debian" directory gene/DATA - cmd = 'RETR {F}'.format(F=fin_ftp) # gene2go.gz - ftp.retrbinary(cmd, open(fout, 'wb').write) # /usr/home/gene2go.gz + ftp.login() # user anonymous, passwd anonymous@ + ftp.cwd(chg_dir) # change into "debian" directory gene/DATA + cmd = "RETR {F}".format(F=fin_ftp) # gene2go.gz + ftp.retrbinary(cmd, open(fout, "wb").write) # /usr/home/gene2go.gz ftp.quit() -def dnld_file(src_ftp, dst_file, prt=sys.stdout, loading_bar=True): +def download_file(src_ftp, dst_file, prt=sys.stdout): """Download specified file if necessary.""" if isfile(dst_file): return - do_gunzip = src_ftp[-3:] == '.gz' and dst_file[-3:] != '.gz' + do_gunzip = src_ftp[-3:] == ".gz" and dst_file[-3:] != ".gz" dst_gz = "{DST}.gz".format(DST=dst_file) if do_gunzip else dst_file # Write to stderr, not stdout so this message will be seen when running nosetests cmd_msg = "get({SRC} out={DST})\n".format(SRC=src_ftp, DST=dst_gz) try: - print('$ get {SRC}'.format(SRC=src_ftp)) - #### wget.download(src_ftp, out=dst_gz, bar=loading_bar) - if src_ftp[:4] == 'http': + print("$ get {SRC}".format(SRC=src_ftp)) + if src_ftp[:4] == "http": http_get(src_ftp, dst_gz) else: ftp_get(src_ftp, dst_gz) @@ -224,22 +171,21 @@ def dnld_file(src_ftp, dst_file, prt=sys.stdout, loading_bar=True): prt.write("$ gunzip {FILE}\n".format(FILE=dst_gz)) gzip_open_to(dst_gz, dst_file) except IOError as errmsg: - import traceback traceback.print_exc() sys.stderr.write("**FATAL cmd: {CMD}".format(CMD=cmd_msg)) sys.stderr.write("**FATAL msg: {ERR}".format(ERR=str(errmsg))) sys.exit(1) + def gzip_open_to(fin_gz, fout): """Unzip a file.gz file.""" try: - with gzip.open(fin_gz, 'rb') as zstrm: - with open(fout, 'wb') as ostrm: + with gzip.open(fin_gz, "rb") as zstrm: + with open(fout, "wb") as ostrm: ostrm.write(zstrm.read()) # pylint: disable=broad-except except Exception as errmsg: print("**ERROR: COULD NOT GUNZIP({G}) TO FILE({F})".format(G=fin_gz, F=fout)) - import traceback traceback.print_exc() sys.stderr.write("**FATAL msg: {ERR}".format(ERR=str(errmsg))) sys.exit(1) diff --git a/tests/test_annotations_gaf.py b/tests/test_annotations_gaf.py index 320354b..ff4a2a9 100755 --- a/tests/test_annotations_gaf.py +++ b/tests/test_annotations_gaf.py @@ -12,22 +12,22 @@ from collections import defaultdict from goatools.associations import read_gaf -from goatools.base import dnld_gafs +from goatools.base import download_gafs def test_gaf_read_goa_human(): """Get associations for human(9606).""" - _test_gaf_read_species(['goa_human']) + _test_gaf_read_species(["goa_human"]) def test_gaf_read_mgi(): """Get associations for mouse(10090).""" - _test_gaf_read_species(['mgi']) + _test_gaf_read_species(["mgi"]) def test_gaf_read_fb(): """Get associations for fly(7227).""" - _test_gaf_read_species(['fb']) + _test_gaf_read_species(["fb"]) def _test_gaf_read_species(species_ids, log=sys.stdout): @@ -37,33 +37,40 @@ def _test_gaf_read_species(species_ids, log=sys.stdout): _test_gaf_read(msg, species_ids, None, log) # Read GAF associations msg = "Read GAF associations; keepif is default in goatools.associations.read_gaf" - keepif = lambda nt: 'NOT' not in nt.Qualifier and nt.Evidence_Code != 'ND' + keepif = lambda nt: "NOT" not in nt.Qualifier and nt.Evidence_Code != "ND" _test_gaf_read(msg, species_ids, keepif, log) # Read GAF associations, allowing ND Evidence codes msg = "Read GAF associations; Allow ND Evidence codes" - keepif = lambda nt: 'NOT' not in nt.Qualifier + keepif = lambda nt: "NOT" not in nt.Qualifier _test_gaf_read(msg, species_ids, keepif, log) # Read GAF associations, allowing ND entries and NOT Qualifiers msg = "Read GAF associations; Allow ND Evidence codes and NOT Qualifiers" keepif = lambda nt: True - #_test_gaf_read(msg, species_ids, keepif, log) + # _test_gaf_read(msg, species_ids, keepif, log) # Limit number of tests for speed _test_gaf_read(msg, species_ids[-1:], keepif, log) + def _test_gaf_read(msg, species_ids, keepif, log=sys.stdout): # (optional) multi-level dictionary separate associations by taxid taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set))) local_dir = os.path.dirname(os.path.abspath(__file__)) - for fin_gaf in dnld_gafs(species_ids, loading_bar=None): + for fin_gaf in download_gafs(species_ids, loading_bar=None): fin_gaf = os.path.join(local_dir, fin_gaf) log.write("\n") id2gos_bp = read_gaf(fin_gaf, taxid2asscs=taxid2asscs, keepif=keepif) - id2gos_all = read_gaf(fin_gaf, taxid2asscs=taxid2asscs, keepif=keepif, namespace='all') + id2gos_all = read_gaf( + fin_gaf, taxid2asscs=taxid2asscs, keepif=keepif, namespace="all" + ) assert len(id2gos_all) > len(id2gos_bp) if "mgi.gaf" in fin_gaf: _chk_key(id2gos_bp, "MGI:") - log.write(" {N:>6,} IDs found in BP {F}\n".format(N=len(id2gos_bp), F=fin_gaf)) - log.write(" {N:>6,} IDs found in ALL {F}\n".format(N=len(id2gos_all), F=fin_gaf)) + log.write( + " {N:>6,} IDs found in BP {F}\n".format(N=len(id2gos_bp), F=fin_gaf) + ) + log.write( + " {N:>6,} IDs found in ALL {F}\n".format(N=len(id2gos_all), F=fin_gaf) + ) go2ids = read_gaf(fin_gaf, go2geneids=True, keepif=keepif) _chk_key(go2ids, "GO:") log.write(" {N:>6,} GOs found in {F}\n".format(N=len(go2ids), F=fin_gaf)) @@ -71,22 +78,25 @@ def _test_gaf_read(msg, species_ids, keepif, log=sys.stdout): log.write("\n{MSG}\n".format(MSG=msg)) txtpat = " {N:>6,} GOs and {M:>6,} annotated gene ids for tax_id: {TAXID:>6}\n" for taxid, asscs in taxid2asscs.items(): - num_gene2gos = len(asscs.get('ID2GOs')) - num_go2genes = len(asscs.get('GO2IDs')) + num_gene2gos = len(asscs.get("ID2GOs")) + num_go2genes = len(asscs.get("GO2IDs")) log.write(txtpat.format(TAXID=taxid, N=num_go2genes, M=num_gene2gos)) # Basic check to ensure gene2go was downloaded and data was returned. assert num_gene2gos > 11000 assert num_go2genes > 6000 + def _chk_key(a2bs, pattern): """Confirm format of dictionary key.""" for key in a2bs.keys(): if pattern in key: return - raise RuntimeError("PATTERN({P}) NOT FOUND IN KEY({K})".format( - P=pattern, K=key)) + raise RuntimeError( + "PATTERN({P}) NOT FOUND IN KEY({K})".format(P=pattern, K=key) + ) + -if __name__ == '__main__': +if __name__ == "__main__": test_gaf_read_fb() diff --git a/tests/test_gosubdag_tcntobj.py b/tests/test_gosubdag_tcntobj.py index 37c7009..632c876 100755 --- a/tests/test_gosubdag_tcntobj.py +++ b/tests/test_gosubdag_tcntobj.py @@ -7,7 +7,7 @@ import os import sys from goatools.base import get_godag -from goatools.base import dnld_gaf +from goatools.base import download_gaf from goatools.associations import read_gaf from goatools.semantic import TermCounts from goatools.gosubdag.gosubdag import GoSubDag @@ -16,6 +16,7 @@ REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../") + # TBD MOVE TO GOATOOLS TEST PKG class Run(object): """Objects for running plotting test.""" @@ -26,14 +27,16 @@ def __init__(self, obo, gaf, prt): # Gene Ontologies self.go2obj_all = get_godag(os.path.join(REPO, "../goatools/", obo)) # Annotations - #_file_gaf = dnld_gaf(os.path.join(REPO, gaf)) - _file_gaf = dnld_gaf(gaf) + # _file_gaf = dnld_gaf(os.path.join(REPO, gaf)) + _file_gaf = download_gaf(gaf) print("GAF: {GAF}\n".format(GAF=_file_gaf)) self.gene2gos = read_gaf(_file_gaf) self.tcntobj = TermCounts(self.go2obj_all, self.gene2gos) # GoSubDag - self.gosubdag_all = GoSubDag(None, self.go2obj_all, tcntobj=self.tcntobj, prt=prt) - self.prtfmt = self.gosubdag_all.prt_attr['fmta'] + self.gosubdag_all = GoSubDag( + None, self.go2obj_all, tcntobj=self.tcntobj, prt=prt + ) + self.prtfmt = self.gosubdag_all.prt_attr["fmta"] def prt_goids_all(self, prt): """Print all GO IDs, including alternate GO IDs, in GODag.""" @@ -42,11 +45,15 @@ def prt_goids_all(self, prt): def plt_goids(self, fout_img, go_sources): """Plot GO IDs.""" # % src/bin/go_plot.py GOs --obo=../goatools/data/i86.obo --outfile=t00.jpg --mark_alt_id - gosubdag = GoSubDag(go_sources, self.gosubdag_all.go2obj, prt=self.prt, - # rcntobj=False, - rcntobj=self.gosubdag_all.rcntobj, - go2nt=self.gosubdag_all.go2nt) - prtfmt = gosubdag.prt_attr['fmta'] + gosubdag = GoSubDag( + go_sources, + self.gosubdag_all.go2obj, + prt=self.prt, + # rcntobj=False, + rcntobj=self.gosubdag_all.rcntobj, + go2nt=self.gosubdag_all.go2nt, + ) + prtfmt = gosubdag.prt_attr["fmta"] goids_plt = GoSubDagPlot(gosubdag).get_goids_plt() self.prt.write("\n{N} GO IDs\n".format(N=len(goids_plt))) gosubdag.prt_goids(goids_plt, prtfmt=prtfmt, prt=self.prt) @@ -54,18 +61,21 @@ def plt_goids(self, fout_img, go_sources): objplt.plt_dag(os.path.join(self.cwd, fout_img)) - def test_plotgosubdag(prt=sys.stdout): """Test plotting of various GoSubDag options.""" objrun = Run("data/i86.obo", "goa_human", prt) # objrun.prt_goids_all(prt) - go_sources = set([ - 'GO:0000004', # a BP 15 L00 D00 biological_process - 'GO:0008151', # a BP 10 L01 D01 B cellular process - 'GO:0007516', # BP 0 L04 D05 ABC hemocyte development - 'GO:0036476']) # BP 0 L06 D06 AB neuron death in response to hydrogen peroxide + go_sources = set( + [ + "GO:0000004", # a BP 15 L00 D00 biological_process + "GO:0008151", # a BP 10 L01 D01 B cellular process + "GO:0007516", # BP 0 L04 D05 ABC hemocyte development + "GO:0036476", + ] + ) # BP 0 L06 D06 AB neuron death in response to hydrogen peroxide objrun.plt_goids("test_gosubdag_tcntobj.png", go_sources) + # kws_exp = [ # ({}, {'rcntobj':rcntobj}), # ({'rcntobj':None}, {'rcntobj':None}), @@ -111,7 +121,7 @@ def test_plotgosubdag(prt=sys.stdout): # # # objplt.plt_dag(fout_img) -if __name__ == '__main__': +if __name__ == "__main__": test_plotgosubdag() # Copyright (C) 2016-2017, DV Klopfenstein, H Tang. All rights reserved. diff --git a/tests/test_read_gaf_allow_nd.py b/tests/test_read_gaf_allow_nd.py index 3248c4b..705da6f 100755 --- a/tests/test_read_gaf_allow_nd.py +++ b/tests/test_read_gaf_allow_nd.py @@ -6,7 +6,8 @@ import sys from goatools.associations import read_gaf -from goatools.base import dnld_gaf +from goatools.base import download_gaf + def test_gaf_read(log=sys.stdout): """Return GO associations from a GAF file. Download if necessary.""" @@ -18,7 +19,7 @@ def test_gaf_read(log=sys.stdout): # 639 GO:0008150 ND # Example species_ids: goa_human mgi fb - fin_gaf = dnld_gaf('goa_human', loading_bar=None) + fin_gaf = download_gaf("goa_human", loading_bar=None) # Example 1: Read GAF go2ids = read_gaf(fin_gaf, go2geneids=True) @@ -27,8 +28,11 @@ def test_gaf_read(log=sys.stdout): # Example 2: Read GAF using defaults (No NOT Qualifiers and no ND Evidence Codes) go2ids = read_gaf(fin_gaf, go2geneids=True, keep_ND=False, keep_NOT=False) - log.write("Read {N} GOs; keepif is default in goatools.associations.read_gaf\n\n".format( - N=len(go2ids))) + log.write( + "Read {N} GOs; keepif is default in goatools.associations.read_gaf\n\n".format( + N=len(go2ids) + ) + ) # Example 3: Read GAF allowing GOs with ND Evidence Codes go2ids = read_gaf(fin_gaf, go2geneids=True, keep_ND=True) @@ -36,10 +40,14 @@ def test_gaf_read(log=sys.stdout): # Example 4: Read GAF allowing all GOs, even those with NOT Qualifiers or ND Evidence Codes go2ids = read_gaf(fin_gaf, go2geneids=True, keep_ND=True, keep_NOT=True) - log.write("Read {N} GOs; Allow ND Evidence codes and NOT Qualifiers\n\n".format(N=len(go2ids))) + log.write( + "Read {N} GOs; Allow ND Evidence codes and NOT Qualifiers\n\n".format( + N=len(go2ids) + ) + ) -if __name__ == '__main__': +if __name__ == "__main__": test_gaf_read() # Copyright (C) 2016-2019, DV Klopfenstein, H Tang. All rights reserved.