Skip to content

Commit

Permalink
Mergin the changes to create and use a profile for finding ST in a ML…
Browse files Browse the repository at this point in the history
…ST scheme.

Merge branch 'add_profile'
  • Loading branch information
pedrofeijao committed Jul 14, 2017
2 parents d4843c9 + 63f8ab7 commit 73622b4
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 32 deletions.
18 changes: 11 additions & 7 deletions src/MentaLiST.jl
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,14 @@ function parse_commandline()
help = "Kmer size"
required = true
arg_type = Int8
"files"
"-f", "--fasta_files"
nargs = '*'
arg_type = String
help = "Fasta files with the MLST scheme"
required = true
"-p", "--profile"
arg_type = String
help = "Profile file for known genotypes."
end
@add_arg_table s["list_pubmlst"] begin
"-p", "--prefix"
Expand Down Expand Up @@ -134,7 +137,7 @@ end
function call_mlst(args)
include("build_db_functions.jl")
info("Opening kmer database ... ")
kmer_db, loci, loci2alleles, k = open_db(args["db"])
kmer_db, loci, loci2alleles, k, profile = open_db(args["db"])
# 0 votes for all alleles everyone at the start:
votes = Dict(locus_idx => Dict{Int16, Int}(i => 0 for i in 1:length(alleles)) for (locus_idx,alleles) in loci2alleles)
if args["e"] # external kmer counter:
Expand Down Expand Up @@ -174,7 +177,7 @@ function call_mlst(args)
end
end
info("Writing output ...")
write_calls(votes, loci, loci2alleles, args["s"], args["o"])
write_calls(votes, loci, loci2alleles, args["s"], args["o"], profile)
info("Done.")
end

Expand All @@ -185,9 +188,10 @@ end

function download_pubmlst(args)
include("mlst_download_functions.jl")
loci_files = download_pubmlst_scheme(args["scheme"], args["output"])
loci_files, profile_file = download_pubmlst_scheme(args["scheme"], args["output"])
info("Building the k-mer database ...")
args["files"] = loci_files
args["fasta_files"] = loci_files
args["profile"] = profile_file
build_db(args)
end

Expand All @@ -196,13 +200,13 @@ function build_db(args)
k::Int8 = args["k"]

info("Opening FASTA files ... ")
results, loci = kmer_class_for_each_locus(k, args["files"])
results, loci = kmer_class_for_each_locus(k, args["fasta_files"])
# Combine results:
info("Combining results for each locus ...")
kmer_classification = combine_loci_classification(k, results, loci)

info("Saving DB ...")
save_db(k, kmer_classification, loci, args["db"])
save_db(k, kmer_classification, loci, args["db"], args["profile"])
info("Done!")
end

Expand Down
48 changes: 42 additions & 6 deletions src/build_db_functions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ end
# sizeof(alleles_list) = 10382740024 = 9.6 GB
# sizeof(allele_ids_per_locus) = 17769664 = 16 MB.

function save_db(k, kmer_db, loci, filename)
function save_db(k, kmer_db, loci, filename, profile)
loci_list, weight_list, alleles_list, kmer_list, allele_ids_per_locus = kmer_db
d = Dict(
"loci_list"=> Blosc.compress(loci_list),
Expand All @@ -153,9 +153,32 @@ function save_db(k, kmer_db, loci, filename)
d[name] = Blosc.compress(alleles_list[s:e])
end
end
# mkdir:
mkpath(dirname(filename))
# database:
JLD.save("$filename.jld", d)
# Profile:
if profile != nothing
cp(profile, "$filename.profile")
end
end
function open_db(filename)
# profile:
profile = nothing
if isfile("$filename.profile")
types = Array{Int}[]
open("$filename.profile") do f
header = split(readline(f),"\t")
for l in eachline(f)
values = map(x->parse(Int64,x),split(strip(l),"\t"))
# println(values)
push!(types, values)
end
profile = Dict("header"=>header, "types"=>types)
end
end

# Compressed database, open and decompress/decode in memory:
d = JLD.load("$filename.jld")
# alleles_list might be split into smaller parts:
alleles_list = []
Expand Down Expand Up @@ -210,19 +233,32 @@ function open_db(filename)
push!(kmer_classification[kmer], (locus_idx, weight, current_allele_list))
end
end
return kmer_classification, loci, loci2alleles, k
return kmer_classification, loci, loci2alleles, k, profile
end

function write_calls(votes, loci, loci2alleles, sample, filename)
function _find_profile(alleles, profile)
for genotype in profile["types"]
# first element is the type id, all the rest is the actual genotype:
if alleles == genotype[2:end]
return genotype[1]
end
end
return 0
end

function write_calls(votes, loci, loci2alleles, sample, filename, profile)
# get best voted:
best_voted_alleles = [sort(collect(votes[idx]), by=x->-x[2])[1][1] for (idx,locus) in enumerate(loci)]
# translate alleles to original id:
best_voted_alleles = [loci2alleles[locus_idx][best] for (locus_idx, best) in enumerate(best_voted_alleles)]
# check if there is a type on the database:
genotype = _find_profile(best_voted_alleles, profile)

# write:
open(filename, "w") do f
header = join(vcat(["Sample"], loci, ["ClonalComplex"]), "\t")
header = join(vcat(["Sample"], loci, ["ST"]), "\t")
write(f, "$header\n")
# todo: look in the database for the type, to assing a 'ClonalComplex'
calls = join(vcat([sample], best_voted_alleles, ["0"]), "\t")
calls = join(vcat([sample], best_voted_alleles, ["$genotype"]), "\t")
write(f, "$calls\n")
end
# debug votes, find ties:
Expand Down
46 changes: 27 additions & 19 deletions src/mlst_download_functions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,29 +33,37 @@ function _download_to_folder(url, output_dir, overwrite=false)
return filepath
end

function _find_publmst_species(xml_xroot, target_species)
for species in xml_xroot["species"]
sp_name = _get_first_line(species)
# mlst_path = joinpath(output_dir,sp_name)
if sp_name == target_species
return(species)
end
end
end

function download_pubmlst_scheme(target_species, output_dir, overwrite=false)
loci_files = String[]
xroot = root(_pubmlst_xml())
info("Searching for the scheme ... ")
for species in xroot["species"]
sp_name = _get_first_line(species)
# mlst_path = joinpath(output_dir,sp_name)
if sp_name != target_species
continue
end
info("Downloading ...")
mkpath(output_dir)
db = species["mlst"][1]["database"][1]
profile_url = content(db["profiles"][1]["url"][1])
_download_to_folder(profile_url, output_dir, overwrite)
for locus in db["loci"][1]["locus"]
locus_name = _get_first_line(locus)
info("Downloading locus $locus_name ...")
locus_url = content(locus["url"][1])
filepath = _download_to_folder(locus_url, output_dir)
push!(loci_files, filepath)
end
species = _find_publmst_species(xroot, target_species)
if species == nothing
error("Species not found on pubmlst, please check the spelling and try again.")
return
end
mkpath(output_dir)
db = species["mlst"][1]["database"][1]
info("Downloading profile ...")
profile_url = content(db["profiles"][1]["url"][1])
profile_file = _download_to_folder(profile_url, output_dir, overwrite)
for locus in db["loci"][1]["locus"]
locus_name = _get_first_line(locus)
info("Downloading locus $locus_name ...")
locus_url = content(locus["url"][1])
filepath = _download_to_folder(locus_url, output_dir)
push!(loci_files, filepath)
end
info("Finished downloading.")
return loci_files
return (loci_files, profile_file)
end

0 comments on commit 73622b4

Please sign in to comment.