Skip to content

Commit

Permalink
newdb related (#101)
Browse files: browse the repository at this point in the history
  • Loading branch information
zhaoc1 authored Jun 29, 2022
1 parent 309f45d commit cafe7d6
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 28 deletions.
2 changes: 1 addition & 1 deletion midas2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
import sys
assert sys.version_info >= (3, 7), "Python version >= 3.7 is required."

version = "1.0.1"
version = "1.0.2"
56 changes: 30 additions & 26 deletions midas2/models/midasdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,48 +174,52 @@ def fetch_tarball(self, filename, list_of_species):
_fetched_files = [_fetch_file_from_s3(args_list[0])]

fetched_files = dict(zip(list_of_species, _fetched_files))
for species_id in list_of_species:
fetched_filenames = tarball_mapping[filename]
for _filename in fetched_filenames:
genome_id = self.get_repgenome_id(species_id)
_fetched_file = self.get_target_layout(_filename, False, species_id, genome_id)
md5_fetched = file_md5sum(_fetched_file)
md5_lookup = self.md5sum[filename][species_id][_filename]
assert md5_fetched == md5_lookup, f"Error for downloading {_fetched_file} from {filename}. Please delete the folder and redownload."
if self.has_md5sum:
for species_id in list_of_species:
fetched_filenames = tarball_mapping[filename]
for _filename in fetched_filenames:
genome_id = self.get_repgenome_id(species_id)
_fetched_file = self.get_target_layout(_filename, False, species_id, genome_id)
md5_fetched = file_md5sum(_fetched_file)
md5_lookup = self.md5sum[filename][species_id][_filename]
assert md5_fetched == md5_lookup, f"Error for downloading {_fetched_file} from {filename}. Please delete the folder and redownload."
return fetched_files

# Single Copy Marker Genes DB
if filename == "markerdb":
fetched_dir = _fetch_file_from_s3(self.construct_file_tuple(filename))
fetched_files = self.get_target_layout("marker_db", False)
fetched_files = dict(zip(MARKER_FILE_EXTS, fetched_files))
for _ in MARKER_FILE_EXTS:
_fetched_file = fetched_files[_]
md5_fetched = file_md5sum(_fetched_file)
md5_lookup = self.md5sum[filename][_]
assert md5_fetched == md5_lookup, f"Error for downloadding {_fetched_file} from {filename}. Please delete the folder and redownload."
if self.has_md5sum:
for _ in MARKER_FILE_EXTS:
_fetched_file = fetched_files[_]
md5_fetched = file_md5sum(_fetched_file)
md5_lookup = self.md5sum[filename][_]
assert md5_fetched == md5_lookup, f"Error for downloadding {_fetched_file} from {filename}. Please delete the folder and redownload."
return {filename: fetched_dir}

if filename == "markerdb_models":
fetched_dir = _fetch_file_from_s3(self.construct_file_tuple(filename))
for _ in tarball_mapping[filename]:
_fetched_file = self.get_target_layout(f"marker_db_{_}", False)
md5_fetched = file_md5sum(_fetched_file)
md5_lookup = self.md5sum[filename][_]
assert md5_fetched == md5_lookup, f"Error for downloadding {_fetched_file} from {filename}. Please delete the folder and redownload."
if self.has_md5sum:
for _ in tarball_mapping[filename]:
_fetched_file = self.get_target_layout(f"marker_db_{_}", False)
md5_fetched = file_md5sum(_fetched_file)
md5_lookup = self.md5sum[filename][_]
assert md5_fetched == md5_lookup, f"Error for downloadding {_fetched_file} from {filename}. Please delete the folder and redownload."
return {filename: fetched_dir}

# Chunks
if filename == "chunks":
fetched_dir = _fetch_file_from_s3(self.construct_file_tuple(filename))
fetched_filenames = tarball_mapping[filename]
rep_genomes = self.uhgg.representatives
for sid, gid in rep_genomes.items():
for i, ct in enumerate(fetched_filenames):
_fetched_file = self.get_target_layout(ct, False, sid, gid, DEFAULT_CHUNKS[i])
md5_fetched = file_md5sum(_fetched_file)
md5_lookup = self.md5sum[filename][sid][ct]
assert md5_fetched == md5_lookup, f"Error for downloadding {_fetched_file} from {filename}. Please delete the folder and redownload."
if self.has_md5sum:
fetched_filenames = tarball_mapping[filename]
rep_genomes = self.uhgg.representatives
for sid, gid in rep_genomes.items():
for i, ct in enumerate(fetched_filenames):
_fetched_file = self.get_target_layout(ct, False, sid, gid, DEFAULT_CHUNKS[i])
md5_fetched = file_md5sum(_fetched_file)
md5_lookup = self.md5sum[filename][sid][ct]
assert md5_fetched == md5_lookup, f"Error for downloadding {_fetched_file} from {filename}. Please delete the folder and redownload."
return {filename: fetched_dir}

# Single File: key of the tarball layout
Expand Down
2 changes: 1 addition & 1 deletion midas2/subcommands/build_bowtie2db.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def build_bowtie2db(args):

if args.bt2_indexes_name == "repgenomes":
tsprint(f"MIDAS2::build_bowtie2_repgenomes_indexes::start")
midas_db.fetch_files("representative_genome", species_ids_of_interest)
midas_db.fetch_files("repgenome", species_ids_of_interest)
contigs_files = midas_db.fetch_files("representative_genome", species_ids_of_interest)
tsprint(contigs_files)
build_bowtie2_db(args.bt2_indexes_dir, args.bt2_indexes_name, contigs_files, args.num_cores)
Expand Down

0 comments on commit cafe7d6

Please sign in to comment.