Skip to content

Commit

Permalink
new functionality: download enterobase mlst schemes
Browse files Browse the repository at this point in the history
  • Loading branch information
pedrofeijao committed Nov 21, 2017
1 parent d4df9f5 commit 890e0fb
Show file tree
Hide file tree
Showing 6 changed files with 72,850 additions and 2 deletions.
27,523 changes: 27,523 additions & 0 deletions scripts/ESCwgMLST.txt

Large diffs are not rendered by default.

24,075 changes: 24,075 additions & 0 deletions scripts/SALwgMLST.txt

Large diffs are not rendered by default.

21,099 changes: 21,099 additions & 0 deletions scripts/YERwgMLST.txt

Large diffs are not rendered by default.

69 changes: 69 additions & 0 deletions scripts/download_enterobase.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env python
import logging
logger = logging.getLogger()

import argparse
import sys
import os
import urllib
import gzip
import shutil
from multiprocessing import Pool

def read_loci(filename, scheme_type):
    """Return the locus names listed for one scheme type in a scheme table.

    Each row of `filename` is tab-separated. A row is selected when its first
    column starts with `scheme_type` ("cg" or "wg"); the locus name is the
    first space-delimited token of its second column.

    Blank rows and rows without a second column are skipped instead of
    raising IndexError.
    """
    loci = []
    with open(filename) as handle:
        for line in handle:
            columns = line.strip().split("\t")
            if len(columns) < 2 or not columns[0].startswith(scheme_type):
                continue
            loci.append(columns[1].split(" ")[0])
    return loci


def download_and_gunzip_locus(locus, output, sp_code, tp_code):
    """Download the gzipped FASTA of `locus` into `output` and decompress it.

    `sp_code` and `tp_code` are the species and scheme identifiers placed in
    the Enterobase download URL. Nothing is done when `<locus>.fa` already
    exists in `output`.

    This is a module-level function taking all of its inputs as arguments so
    that it can be pickled and sent to multiprocessing workers.
    """
    # urlretrieve moved to urllib.request in Python 3; support both.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    fasta_file = os.path.join(output, "%s.fa" % locus)
    gzip_file = os.path.join(output, "%s.fa.gz" % locus)
    if os.path.isfile(fasta_file):
        logger.info("File for locus %s already exists, skipping ..." % locus)
        return
    logger.info("Downloading locus %s" % locus)
    url = "http://enterobase.warwick.ac.uk/download_data?species=%s&scheme=%s&allele=%s" % (sp_code, tp_code, locus)
    # Download and decompress under temporary ".part" names and rename when
    # finished, so an interrupted run never leaves a truncated file that a
    # later run would mistake for a complete one.
    if not os.path.isfile(gzip_file):
        partial_gzip = gzip_file + ".part"
        urlretrieve(url, partial_gzip)
        os.rename(partial_gzip, gzip_file)
    partial_fasta = fasta_file + ".part"
    with gzip.open(gzip_file, 'rb') as f_in, open(partial_fasta, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.rename(partial_fasta, fasta_file)
    os.remove(gzip_file)


def main():
    """Parse the command line and download every locus of the chosen scheme."""
    from functools import partial

    parser = argparse.ArgumentParser(description="Downloads and unzips all FASTA files from a specific enterobase MLST scheme. ")
    parser.add_argument("-o", "--output", type=str, required=True, help="Output folder for FASTA files.")
    parser.add_argument("-s", "--species", type=str, required=True, choices=["S","Y","E"], help="Choose the target scheme: (S)almonella, (Y)ersinia, or (E)scherichia/Shigella.")
    parser.add_argument("-y", "--type", type=str, required=True, choices=["cg","wg"], help="Choose the MLST scheme type, cgMLST (cg) or wgMLST (wg).")
    parser.add_argument("-t", "--threads", type=int, default=4, help="Number of threads, to download in parallel. Use responsibly.")
    parser.add_argument('-ll', '--loglevel', type=str, default="INFO", choices=['DEBUG','INFO','WARNING','ERROR','CRITICAL'], help='Set the logging level')
    param = parser.parse_args()
    logging.basicConfig(level=param.loglevel, format='%(asctime)s (%(relativeCreated)d ms) -> %(levelname)s:%(message)s', datefmt='%I:%M:%S %p')

    # The scheme tables (e.g. SALwgMLST.txt) are looked up next to this script.
    basefolder = os.path.dirname(sys.argv[0])
    sp = {"E":"ESC", "S":"SAL", "Y":"YER"}
    verbose = {"S":"Salmonella", "Y":"Yersinia", "E":"Escherichia/Shigella"}
    # The table file is always the wgMLST one; rows are filtered by type below.
    sp_code = "%swgMLST" % sp[param.species]
    tp_code = "%sMLSTv1" % param.type
    filename = os.path.join(basefolder, "%s.txt" % sp_code)

    all_loci = read_loci(filename, param.type)

    logger.info("Downloading %d FASTA files for %s %sMLST scheme, it might take a while ..." % (len(all_loci), verbose[param.species], param.type))
    if not os.path.isdir(param.output):
        os.makedirs(param.output)

    fetch = partial(download_and_gunzip_locus, output=param.output, sp_code=sp_code, tp_code=tp_code)
    if param.threads > 1:
        # process in parallel; always release the worker processes afterwards
        pool = Pool(param.threads)
        try:
            pool.map(fetch, all_loci)
        finally:
            pool.terminate()
            pool.join()
    else:
        # serial, useful for debugging
        for locus in all_loci:
            fetch(locus)


if __name__ == "__main__":
    main()
40 changes: 40 additions & 0 deletions src/MentaLiST.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ function parse_commandline()
"download_cgmlst"
help = "Dowload a MLST scheme from cgmlst.org and build a MLST k-mer database."
action = :command
"download_enterobase"
help = "Dowload a MLST scheme from Enterobase (enterobase.warwick.ac.uk) and build a MLST k-mer database."
action = :command

end
# Calling MLST options:
Expand Down Expand Up @@ -127,6 +130,32 @@ function parse_commandline()
action = :store_true
end

# Options of the `download_enterobase` command. The scheme letter and the scheme
# type are validated at parse time so that an unsupported value is rejected with
# a usage error before any download is attempted.
@add_arg_table s["download_enterobase"] begin
    "-o", "--output"
        help = "Output folder for the scheme files."
        arg_type = String
        required = true
    "-s", "--scheme"
        help = "Letter identifying which scheme: (S)almonella, (Y)ersinia, or (E)scherichia/Shigella."
        arg_type = String
        required = true
        range_tester = (x -> x in ("S", "Y", "E"))
    "-t", "--type"
        help = "Choose the type: 'cg' or 'wg' for cgMLST or wgMLST scheme, respectively."
        arg_type = String
        required = true
        # any other value would match no row of the scheme table
        range_tester = (x -> x in ("cg", "wg"))
    "-k"
        help = "K-mer size"
        required = true
        arg_type = Int8
    "--db"
        help = "Output file for the kmer database."
        arg_type = String
        required = true
    "-c", "--disable_compression"
        help = "Disables the default compression of the database, that stores only the most informative kmers. Not recommended unless for debugging."
        action = :store_true
end

return parse_args(s)
end

Expand Down Expand Up @@ -173,6 +202,15 @@ function download_cgmlst(args)
build_db(args)
end

"""
    download_enterobase(args)

Handle the `download_enterobase` command: download the FASTA files of the
Enterobase scheme selected by `args["scheme"]` and `args["type"]` into
`args["output"]`, then build the k-mer database from them.

`args` is the parsed option dictionary of the command; it is extended in place
with the `"fasta_files"` and `"profile"` entries before being handed to
`build_db`.
"""
function download_enterobase(args)
    include("mlst_download_functions.jl")
    scheme_fastas = download_enterobase_scheme(args["scheme"], args["type"], args["output"])
    info("Building the k-mer database ...")
    # No profile is supplied for Enterobase schemes; the downloaded FASTA
    # files are the database input.
    args["profile"] = nothing
    args["fasta_files"] = scheme_fastas
    return build_db(args)
end

function build_db(args)
include("build_db_functions.jl")
check_files(args["fasta_files"])
Expand Down Expand Up @@ -204,6 +242,8 @@ function main()
list_cgmlst(args["list_cgmlst"])
elseif args["%COMMAND%"] == "download_cgmlst"
download_cgmlst(args["download_cgmlst"])
elseif args["%COMMAND%"] == "download_enterobase"
download_enterobase(args["download_enterobase"])
end
end

Expand Down
46 changes: 44 additions & 2 deletions src/mlst_download_functions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ function list_pubmlst_schema(prefix)
end


function _download_to_folder(url, output_dir, overwrite=false)
filepath = joinpath(output_dir, basename(url))
function _download_to_folder(url, output_dir, overwrite=false, filename=nothing)
filepath = joinpath(output_dir, filename == nothing ? basename(url) : filename)
if overwrite || (!isfile(filepath) || _older_than_a_day(filepath))
mkpath(output_dir)
download(url, filepath)
Expand Down Expand Up @@ -131,6 +131,48 @@ function _find_cgmlst_id(target_id)
return nothing
end

"""
    _read_enterobase_loci(filename, s_type)

Return the locus names listed for scheme type `s_type` (`"cg"` or `"wg"`) in
the tab-separated scheme table `filename`. A row is selected when its first
column starts with `s_type`; the locus name is the first space-delimited token
of its second column. Blank rows and rows without a second column are skipped.
"""
function _read_enterobase_loci(filename, s_type)
    loci = String[]
    open(filename) do f
        for l in eachline(f)
            values = split(strip(l), "\t")
            # guard against rows with a single column, which have no values[2]
            if length(values) >= 2 && startswith(values[1], s_type)
                push!(loci, split(values[2], ' ')[1])
            end
        end
    end
    return loci
end

"""
    _gunzip_to_file(gzip_path, fasta_path)

Decompress the gzip file `gzip_path` into the text file `fasta_path`, line by
line. Both streams are closed even when an error occurs, and a partially
written `fasta_path` is removed on failure so that it is never mistaken for a
complete download by a later run.
"""
function _gunzip_to_file(gzip_path, fasta_path)
    f_in = GZip.open(gzip_path)
    try
        open(fasta_path, "w") do f_out
            while !eof(f_in)
                # Whether readline keeps the trailing newline depends on the
                # Julia version; normalise so every record ends with exactly one.
                println(f_out, chomp(readline(f_in)))
            end
        end
    catch
        isfile(fasta_path) && rm(fasta_path)
        rethrow()
    finally
        close(f_in)
    end
end

"""
    download_enterobase_scheme(scheme, s_type, output_dir, overwrite=false)

Download every locus FASTA file of an Enterobase MLST scheme into `output_dir`
and return the list of FASTA file paths, in the order of the scheme table.

# Arguments
- `scheme`: `"E"`, `"S"` or `"Y"`; any other value logs a message and exits
  the process with status -1.
- `s_type`: `"cg"` or `"wg"`, selecting which rows of the scheme table are used
  and which scheme is requested from the server.
- `output_dir`: folder receiving the `<locus>.fa` files.
- `overwrite`: when `false` (default) an existing `<locus>.fa` is kept and not
  downloaded again; when `true` it is downloaded and rewritten.
"""
function download_enterobase_scheme(scheme, s_type, output_dir, overwrite=false)
    sp = Dict("E"=>"ESC", "S"=>"SAL", "Y"=>"YER")
    if !haskey(sp, scheme)
        info("Scheme has to be E, S, or Y.")
        exit(-1)
    end
    # The locus table shipped in scripts/ is always the wgMLST one; its rows
    # are filtered by `s_type`.
    sp_code = "$(sp[scheme])wgMLST"
    tp_code = "$(s_type)MLSTv1"
    filename = joinpath(dirname(@__FILE__), "../scripts/$(sp_code).txt")

    loci_files = String[]
    for locus in _read_enterobase_loci(filename, s_type)
        fasta_locus = joinpath(output_dir, "$locus.fa")
        push!(loci_files, fasta_locus)
        if isfile(fasta_locus) && !overwrite
            continue
        end
        gzip_locus = _download_to_folder("http://enterobase.warwick.ac.uk/download_data?species=$sp_code&scheme=$tp_code&allele=$locus", output_dir, overwrite, "$locus.fa.gz")
        # gunzip to a FASTA and remove the gzip file
        _gunzip_to_file(gzip_locus, fasta_locus)
        rm(gzip_locus)
    end
    return loci_files
end

function download_cgmlst_scheme(target_id, output_dir, overwrite=false)
id = _find_cgmlst_id(target_id)
if id == nothing
Expand Down

0 comments on commit 890e0fb

Please sign in to comment.