Skip to content

Commit d27fd4e

Browse files
josenavas authored and antgonza committed
Removing skbio dependency (#2)
* Removing skbio dependency * Improving docs * Removing doctests * Fixing is string or bytes * Fixing py2-3
1 parent c91a5ec commit d27fd4e

File tree

20 files changed

+2338
-64
lines changed

20 files changed

+2338
-64
lines changed

qiita_files/demux.py

Lines changed: 19 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,11 @@
5656
import numpy as np
5757
from future.utils import viewitems, viewvalues
5858
from future.builtins import zip
59-
from skbio.parse.sequences import load
60-
from skbio.format.sequences import format_fastq_record
6159

62-
from .util import open_file
60+
from qiita_files.parse import load
61+
from qiita_files.format.fasta import format_fasta_record
62+
from qiita_files.format.fastq import format_fastq_record
63+
from qiita_files.util import open_file
6364

6465

6566
# track some basic stats about the samples
@@ -72,6 +73,8 @@
7273
'barcode_error': 'barcode/error',
7374
'qual': 'qual'}
7475

76+
dset_paths_bytes = {k: v.encode('ascii') for k, v in dset_paths.items()}
77+
7578

7679
class _buffer(object):
7780
"""Buffer baseclass that sits on top of an HDF5 dataset
@@ -386,26 +389,6 @@ def to_hdf5(fp, h5file, max_barcode_length=12):
386389
buffers[pjoin(dset_paths['qual'])].write(qual)
387390

388391

389-
def format_fasta_record(seqid, seq, qual):
390-
"""Format a fasta record
391-
392-
Parameters
393-
----------
394-
seqid : str
395-
The sequence ID
396-
seq : str
397-
The sequence
398-
qual : ignored
399-
This is ignored
400-
401-
Returns
402-
-------
403-
str
404-
A formatted sequence record
405-
"""
406-
return b'\n'.join([b'>' + seqid, seq, b''])
407-
408-
409392
def to_ascii(demux, samples=None):
410393
"""Consume a demuxed HDF5 file and yield sequence records
411394
@@ -429,15 +412,15 @@ def to_ascii(demux, samples=None):
429412
else:
430413
formatter = format_fasta_record
431414

432-
id_fmt = ("%(sample)s_%(idx)d orig_bc=%(bc_ori)s new_bc=%(bc_cor)s "
433-
"bc_diffs=%(bc_diff)d")
415+
id_fmt = (b"%(sample)s_%(idx)d orig_bc=%(bc_ori)s new_bc=%(bc_cor)s "
416+
b"bc_diffs=%(bc_diff)d")
434417

435418
if samples is None:
436419
samples = demux.keys()
437420

438421
for samp, idx, seq, qual, bc_ori, bc_cor, bc_err in fetch(demux, samples):
439-
seq_id = id_fmt % {'sample': samp, 'idx': idx, 'bc_ori': bc_ori,
440-
'bc_cor': bc_cor, 'bc_diff': bc_err}
422+
seq_id = id_fmt % {b'sample': samp, b'idx': idx, b'bc_ori': bc_ori,
423+
b'bc_cor': bc_cor, b'bc_diff': bc_err}
441424
if qual != []:
442425
qual = qual.astype(np.uint8)
443426

@@ -468,6 +451,7 @@ def to_per_sample_ascii(demux, samples=None):
468451
samples = demux.keys()
469452

470453
for samp in samples:
454+
samp = samp.encode()
471455
yield samp, to_ascii(demux, samples=[samp])
472456

473457

@@ -513,20 +497,22 @@ def fetch(demux, samples=None, k=None):
513497
indices = np.logical_not(indices)
514498
indices[to_keep[:k]] = True
515499

516-
seqs = demux[pjoin(dset_paths['sequence'])][indices]
500+
seqs = demux[pjoin(dset_paths_bytes['sequence'])][indices]
517501

518502
# only yield qual if we have it
519503
quals = repeat([])
520504
if demux.attrs['has-qual']:
521505
if len(indices) == 1:
522506
if indices[0]:
523-
quals = demux[pjoin(dset_paths['qual'])][:]
507+
quals = demux[pjoin(dset_paths_bytes['qual'])][:]
524508
else:
525-
quals = demux[pjoin(dset_paths['qual'])][indices, :]
509+
quals = demux[pjoin(dset_paths_bytes['qual'])][indices, :]
526510

527-
bc_original = demux[pjoin(dset_paths['barcode_original'])][indices]
528-
bc_corrected = demux[pjoin(dset_paths['barcode_corrected'])][indices]
529-
bc_error = demux[pjoin(dset_paths['barcode_error'])][indices]
511+
bc_original = demux[
512+
pjoin(dset_paths_bytes['barcode_original'])][indices]
513+
bc_corrected = demux[
514+
pjoin(dset_paths_bytes['barcode_corrected'])][indices]
515+
bc_error = demux[pjoin(dset_paths_bytes['barcode_error'])][indices]
530516

531517
iter_ = zip(repeat(sample), np.arange(indices.size)[indices], seqs,
532518
quals, bc_original, bc_corrected, bc_error)

qiita_files/format/__init__.py

Whitespace-only changes.

qiita_files/format/fasta.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# -----------------------------------------------------------------------------
2+
# Copyright (c) 2014--, The Qiita Development Team.
3+
#
4+
# Distributed under the terms of the BSD 3-clause License.
5+
#
6+
# The full license is in the file LICENSE, distributed with this software.
7+
# -----------------------------------------------------------------------------
8+
9+
10+
def format_fasta_record(seqid, seq, qual=None):
    """Format a FASTA record

    Parameters
    ----------
    seqid : bytes
        The sequence ID
    seq : bytes
        The sequence
    qual : ignored, optional
        Never read; accepted so the function can be called with the same
        positional arguments as ``format_fastq_record``

    Returns
    -------
    bytes
        The formatted record: ``>`` followed by the sequence ID, then the
        sequence, each terminated by a newline
    """
    # the trailing empty element makes the join emit the final newline
    return b'\n'.join([b'>' + seqid, seq, b''])

qiita_files/format/fastq.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# -----------------------------------------------------------------------------
2+
# Copyright (c) 2014--, The Qiita Development Team.
3+
#
4+
# Distributed under the terms of the BSD 3-clause License.
5+
#
6+
# The full license is in the file LICENSE, distributed with this software.
7+
# -----------------------------------------------------------------------------
8+
9+
10+
def _phred_to_ascii(a, offset):
11+
"""Convert Phred quality score to ASCII character with specified offset"""
12+
return (a + offset).tobytes()
13+
14+
15+
def _phred_to_ascii33(a):
    """Encode Phred quality scores as ASCII characters using offset 33"""
    offset = 33
    return _phred_to_ascii(a, offset)
18+
19+
20+
def _phred_to_ascii64(a):
    """Encode Phred quality scores as ASCII characters using offset 64"""
    offset = 64
    return _phred_to_ascii(a, offset)
23+
24+
25+
def format_fastq_record(seqid, seq, qual, phred_offset=33):
    """Format a FASTQ record

    Parameters
    ----------
    seqid : bytes
        The sequence ID
    seq : bytes
        The sequence
    qual : np.array of int8
        The quality scores
    phred_offset : int, either 33 or 64
        Set a phred offset

    Returns
    -------
    bytes
        A single FASTQ record: ``@`` + seqid, the sequence, ``+`` and the
        encoded quality scores, each terminated by a newline

    Raises
    ------
    ValueError
        If `phred_offset` is neither 33 nor 64
    """
    if phred_offset == 33:
        phred_f = _phred_to_ascii33
    elif phred_offset == 64:
        phred_f = _phred_to_ascii64
    else:
        # %s (not %d) so a non-numeric offset still reports ValueError
        # instead of failing with a TypeError while building the message
        raise ValueError("Unknown phred offset: %s" % (phred_offset,))

    return b'\n'.join([b"@" + seqid, seq, b'+', phred_f(qual), b''])

qiita_files/format/tests/__init__.py

Whitespace-only changes.
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# -----------------------------------------------------------------------------
2+
# Copyright (c) 2014--, The Qiita Development Team.
3+
#
4+
# Distributed under the terms of the BSD 3-clause License.
5+
#
6+
# The full license is in the file LICENSE, distributed with this software.
7+
# -----------------------------------------------------------------------------
8+
9+
from unittest import TestCase, main
10+
from qiita_files.format.fasta import format_fasta_record
11+
12+
13+
class FastaTests(TestCase):
    def test_format_fasta_record(self):
        # the third argument (qual) must not show up in the output
        observed = format_fasta_record(b"a", b"xyz", b'ignored')
        self.assertEqual(observed, b">a\nxyz\n")
18+
19+
if __name__ == '__main__':
20+
main()
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# -----------------------------------------------------------------------------
2+
# Copyright (c) 2014--, The Qiita Development Team.
3+
#
4+
# Distributed under the terms of the BSD 3-clause License.
5+
#
6+
# The full license is in the file LICENSE, distributed with this software.
7+
# -----------------------------------------------------------------------------
8+
9+
from unittest import TestCase, main
10+
import numpy as np
11+
12+
from qiita_files.format.fastq import (format_fastq_record, _phred_to_ascii33,
13+
_phred_to_ascii64)
14+
15+
16+
class FastqTests(TestCase):
    def setUp(self):
        # scores 38-40 encode to b'GHI' with offset 33 and b'fgh' with 64
        self.qual_scores = np.array([38, 39, 40], dtype=np.int8)
        self.args = (b'abc', b'def', self.qual_scores)

    def test_format_fastq_record_phred_offset_33(self):
        seqid, seq, qual = self.args
        observed = format_fastq_record(seqid, seq, qual, phred_offset=33)
        self.assertEqual(observed, b"@abc\ndef\n+\nGHI\n")

    def test_format_fastq_record_phred_offset_64(self):
        seqid, seq, qual = self.args
        observed = format_fastq_record(seqid, seq, qual, phred_offset=64)
        self.assertEqual(observed, b"@abc\ndef\n+\nfgh\n")

    def test_format_fastq_record_invalid_phred_offset(self):
        self.assertRaises(ValueError, format_fastq_record, *self.args,
                          phred_offset=42)

    def test_phred_to_ascii33(self):
        self.assertEqual(_phred_to_ascii33(self.qual_scores), b'GHI')

    def test_phred_to_ascii64(self):
        self.assertEqual(_phred_to_ascii64(self.qual_scores), b'fgh')
42+
43+
if __name__ == '__main__':
44+
main()

qiita_files/parse/__init__.py

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
# ----------------------------------------------------------------------------
2+
# Copyright (c) 2013--, scikit-bio development team.
3+
#
4+
# Distributed under the terms of the Modified BSD License.
5+
#
6+
# The full license is in the file COPYING.txt, distributed with this software.
7+
# ----------------------------------------------------------------------------
8+
9+
from itertools import chain
10+
from gzip import open as gzip_open
11+
from os.path import exists
12+
13+
from .iterator import FastaIterator, FastqIterator
14+
15+
# Maps a file extension to (iterator class, callable used to open the file).
# Every base extension is registered twice: as-is (plain ``open``) and with
# a ``.gz`` suffix (``gzip_open``).
FILEEXT_MAP = {
    ext + suffix: (iterator, opener)
    for iterator, exts in ((FastaIterator, ('fna', 'fasta', 'qual')),
                           (FastqIterator, ('fastq', 'fq')))
    for ext in exts
    for suffix, opener in (('', open), ('.gz', gzip_open))}
25+
26+
27+
def _determine_types_and_openers(files):
28+
"""Attempt to determine the appropriate iterators and openers"""
29+
if files is None:
30+
return [], []
31+
32+
iters = []
33+
openers = []
34+
for fpath in files:
35+
if fpath.endswith('.gz'):
36+
ext = '.'.join(fpath.rsplit('.', 2)[-2:])
37+
else:
38+
ext = fpath.rsplit('.', 1)[-1]
39+
40+
i, o = FILEEXT_MAP.get(ext, (None, None))
41+
if i is None:
42+
raise IOError("Unknown filetype for %s" % fpath)
43+
44+
iters.append(i)
45+
openers.append(o)
46+
47+
return iters, openers
48+
49+
50+
def _is_single_iterator_type(iters):
51+
"""Determine if there is a single or multiple type of iterator
52+
If iters is [], this method returns True it considers the null case to be
53+
a single iterator type.
54+
"""
55+
if iters:
56+
return len(set(iters)) == 1
57+
else:
58+
return True
59+
60+
61+
def _open_or_none(opener, f):
62+
"""Open a file or returns None"""
63+
if not opener:
64+
return None
65+
else:
66+
name = opener.__name__
67+
68+
if not exists(f):
69+
raise IOError("%s does not appear to exist!" % f)
70+
try:
71+
opened = opener(f)
72+
except IOError:
73+
raise IOError("Could not open %s with %s!" % (f, name))
74+
75+
return opened
76+
77+
78+
def load(seqs, qual=None, constructor=None, **kwargs):
    """Construct the appropriate iterator for all your processing needs

    This method will attempt to open all files correctly and to feed the
    appropriate objects into the correct iterators.

    Seqs can list multiple types of files (e.g., FASTA and FASTQ), but if
    multiple file types are specified, qual must be None

    Parameters
    ----------
    seqs : str or list of sequence file paths
    qual : str or list of qual file paths or None
    constructor : force a constructor on seqs
    kwargs : dict
        passed into the subsequent generators.

    Returns
    -------
    iterable
        The iterator built by the selected constructor, or an
        ``itertools.chain`` with one iterator per file when `seqs` mixes
        file types

    Raises
    ------
    ValueError
        If `seqs` is empty, or if `seqs` mixes file types while `qual` is
        also supplied
    IOError
        If a file type is unknown or a file cannot be opened

    See Also
    --------
    SequenceIterator
    FastaIterator
    FastqIterator
    """
    if not seqs:
        raise ValueError("Must supply sequences.")

    if isinstance(seqs, str):
        seqs = [seqs]

    if isinstance(qual, str):
        qual = [qual]

    # i -> iters, o -> openers
    if constructor is not None:
        i_seqs = [constructor] * len(seqs)
        o_seqs = [open] * len(seqs)
    else:
        i_seqs, o_seqs = _determine_types_and_openers(seqs)

    _, o_qual = _determine_types_and_openers(qual)

    single_type = _is_single_iterator_type(i_seqs)

    # Validate before opening anything, so this error cannot leak open file
    # handles
    if qual and not single_type:
        # chaining Fasta/Fastq for sequence is easy, but it gets nasty quick
        # if seqs is a mix of fasta/fastq, with qual coming in as there aren't
        # 1-1 mappings. This could be addressed if necessary, but seems like
        # an unnecessary block of code right now
        raise ValueError("Cannot handle multiple sequence file types and qual "
                         "file(s) at the same time.")

    seqs = [_open_or_none(o, f) for f, o in zip(seqs, o_seqs)]
    qual = [_open_or_none(o, f) for f, o in zip(qual or [], o_qual or [])]

    if not qual:
        qual = None

    if single_type:
        seqs_constructor = i_seqs[0]
        gen = seqs_constructor(seq=seqs, qual=qual, **kwargs)
    else:
        gen = chain(*[c(seq=[fp], **kwargs) for c, fp in zip(i_seqs, seqs)])

    return gen

0 commit comments

Comments
 (0)