Skip to content
This repository has been archived by the owner on May 21, 2024. It is now read-only.

Commit

Permalink
Merge branch 'main' into tax_issue_66
Browse files Browse the repository at this point in the history
  • Loading branch information
Sann5 authored Dec 11, 2023
2 parents 4b9be80 + 22fd9a3 commit e82098f
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 16 deletions.
42 changes: 39 additions & 3 deletions q2_types_genomics/feature_data/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import re

from q2_types.feature_data import DNAFASTAFormat
from q2_types_genomics.genome_data._format import OrthologFileFmt
from qiime2.plugin import model
Expand All @@ -14,16 +16,50 @@


class MAGSequencesDirFmt(model.DirectoryFormat):
    '''
    Directory of MAG sequences stored as one DNA FASTA file per MAG.

    Every file is named ``<mag id>.fa`` or ``<mag id>.fasta``, where the
    mag id has to match the UUID4-shaped pattern given in ``pathspec``.
    '''

    # Filename pattern of a single MAG file. Kept as a class attribute so
    # that the ``sequences`` collection and ``feature_dict`` share it.
    pathspec = (
        r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-4[0-9a-fA-F]{3}-"
        r"[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}\.(fa|fasta)$"
    )

    sequences = model.FileCollection(pathspec, format=DNAFASTAFormat)

    @sequences.set_path_maker
    def sequences_path_maker(self, mag_id):
        return r'%s.fasta' % mag_id

    def feature_dict(self, relative=False):
        '''
        Returns a mapping of mag id to filepath for each mag.

        Parameters
        ----------
        relative : bool
            Whether to return filepaths relative to the directory's location.
            Returns absolute filepaths by default.

        Returns
        -------
        dict
            Mapping of feature id -> filepath as described above. Sorted
            alphabetically by key.
        '''
        pattern = re.compile(self.pathspec)
        # Loop-invariant: the directory every relative path is based on.
        root = self.path.absolute()
        ids = {}
        for path in self.path.iterdir():
            # Anything whose name does not look like a MAG file is ignored.
            if not pattern.match(path.name):
                continue

            absolute_path = path.absolute()
            if relative:
                filepath = absolute_path.relative_to(root)
            else:
                filepath = absolute_path
            # The filename without its extension is the mag id.
            ids[path.stem] = str(filepath)

        return dict(sorted(ids.items()))


plugin.register_formats(MAGSequencesDirFmt)

Expand Down
29 changes: 29 additions & 0 deletions q2_types_genomics/feature_data/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from pathlib import Path
import shutil
import unittest

from qiime2.plugin.testing import TestPluginBase
Expand All @@ -31,6 +33,33 @@ def test_mag_dirfmt_fasta(self):

format.validate()

def test_mag_dirfmt_feature_dict(self):
    # feature_dict() should list exactly the MAG files of the directory,
    # as absolute paths by default and as relative paths on request.
    tmp_dir = self.temp_dir.name
    shutil.copytree(
        self.get_data_path('mags-fasta'), tmp_dir, dirs_exist_ok=True
    )
    mags = MAGSequencesDirFmt(tmp_dir, mode='r')

    # non-mags should not be collected
    with open(Path(tmp_dir) / 'not-a-mag.fasta', 'w') as fh:
        fh.write('not a mag')

    mag_ids = (
        '3b7d53fb-5b60-46c6-8819-aeda065b12e9',
        '6232c7e1-8ed7-47c8-9bdb-b94706a26931',
    )

    exp_absolute = {}
    exp_relative = {}
    for mag_id in mag_ids:
        filename = mag_id + '.fasta'
        exp_absolute[mag_id] = str(mags.path / filename)
        exp_relative[mag_id] = filename

    self.assertEqual(mags.feature_dict(), exp_absolute)
    self.assertEqual(mags.feature_dict(relative=True), exp_relative)

def test_ortholog_annotation_dir_fmt_passing(self):
dirpath = self.get_data_path('good_ortholog_annotation')
fmt_obj = OrthologAnnotationDirFmt(dirpath, mode='r')
Expand Down
27 changes: 14 additions & 13 deletions q2_types_genomics/per_sample_data/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,17 +120,18 @@ class MultiBowtie2IndexDirFmt(MultiDirValidationMixin, Bowtie2IndexDirFmt):


class ContigSequencesDirFmt(model.DirectoryFormat):
    '''
    Directory of per-sample contigs stored as one DNA FASTA file per sample.

    Every file is named ``<sample id>_contigs.fa`` or
    ``<sample id>_contigs.fasta``.
    '''

    # Filename pattern of a per-sample contigs file, shared by the
    # ``sequences`` collection and ``sample_dict``:
    #   * the leading ``[^\.]`` rejects names that start with a dot,
    #   * ``.*`` (not ``.+``) keeps one-character sample ids valid,
    #   * the dot before the extension is escaped so that only a literal
    #     ``.`` separates ``_contigs`` from ``fasta``/``fa``.
    pathspec = r'[^\.].*_contigs\.(fasta|fa)$'

    sequences = model.FileCollection(pathspec, format=DNAFASTAFormat)

    @sequences.set_path_maker
    def sequences_path_maker(self, sample_id):
        return r'%s_contigs.fasta' % sample_id

    def sample_dict(self, relative=False):
        '''
        Returns a mapping of sample id to filepath for each set of per-sample
        contigs.

        Parameters
        ----------
        relative : bool
            Whether to return filepaths relative to the directory's location.
            Returns absolute filepaths by default.

        Returns
        -------
        dict
            Mapping of sample id -> filepath as described above. Sorted
            alphabetically by key.
        '''
        contigs_pattern = re.compile(self.pathspec)
        # Loop-invariant: the directory every relative path is based on.
        root = self.path.absolute()
        ids = {}
        for path in self.path.iterdir():
            # Anything whose name does not look like a contigs file is
            # ignored.
            if not contigs_pattern.match(path.name):
                continue

            # Everything in front of the last '_contigs' is the sample id.
            sample_id = path.name.rsplit('_contigs', 1)[0]
            absolute_path = path.absolute()
            if relative:
                filepath = absolute_path.relative_to(root)
            else:
                filepath = absolute_path
            ids[sample_id] = str(filepath)

        return dict(sorted(ids.items()))


# borrowed from q2-phylogenomics
Expand Down

0 comments on commit e82098f

Please sign in to comment.