Skip to content

Commit

Permalink
ENH: add formats and type for reports generated by kraken2-inspect (b…
Browse files Browse the repository at this point in the history
…okulich-lab#62)

* WIP: add formats and type for reports generated by kraken2-inspect

* format tests

* add type to init

* remove headers only on dataframe transformation

* comma

---------

Co-authored-by: Michal Ziemski <mziemski@ethz.ch>
  • Loading branch information
colinvwood and misialq authored Nov 27, 2023
1 parent 87ca3db commit 6f8c301
Show file tree
Hide file tree
Showing 13 changed files with 390 additions and 15 deletions.
9 changes: 5 additions & 4 deletions q2_types_genomics/kraken2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,16 @@
from ._format import (
Kraken2ReportFormat, Kraken2ReportDirectoryFormat,
Kraken2OutputFormat, Kraken2OutputDirectoryFormat,
Kraken2DBFormat, Kraken2DBDirectoryFormat,
BrackenDBFormat, BrackenDBDirectoryFormat
Kraken2DBFormat, Kraken2DBReportFormat, Kraken2DBReportDirectoryFormat,
Kraken2DBDirectoryFormat, BrackenDBFormat, BrackenDBDirectoryFormat
)
from ._type import Kraken2Reports, Kraken2Outputs, Kraken2DB
from ._type import Kraken2Reports, Kraken2Outputs, Kraken2DB, Kraken2DBReport

__all__ = [
'Kraken2ReportFormat', 'Kraken2ReportDirectoryFormat', 'Kraken2Reports',
'Kraken2OutputFormat', 'Kraken2OutputDirectoryFormat', 'Kraken2Outputs',
'Kraken2DBFormat', 'Kraken2DBDirectoryFormat', 'Kraken2DB',
'Kraken2DBFormat', 'Kraken2DBReportFormat', 'Kraken2DBReport',
'Kraken2DBReportDirectoryFormat', 'Kraken2DBDirectoryFormat', 'Kraken2DB',
'BrackenDBFormat', 'BrackenDBDirectoryFormat'
]

Expand Down
45 changes: 44 additions & 1 deletion q2_types_genomics/kraken2/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,48 @@ def reports_path_maker(self, sample_id, mag_id=None):
return f'{prefix}report.txt'


class Kraken2DBReportFormat(Kraken2ReportFormat):
    '''
    Format of a database report generated by kraken2-inspect.

    The file is tab-separated, has no column-name row and may begin with
    a run of comment lines starting with '#'; those lines are skipped
    when the report is loaded into a dataframe.
    '''
    # Column names mapped to the Python type expected for each column.
    # `_to_dataframe` uses this mapping only to check the column count and
    # returns it unchanged; it does not cast the data to these types.
    # NOTE(review): the parent class may also consult COLUMNS during
    # validation - confirm against Kraken2ReportFormat.
    COLUMNS = {
        'perc_minimizers_covered': float,
        'n_minimizers_covered': int,
        'n_minimizers_assigned': int,
        'rank': str,
        'taxon_id': int,
        'name': str
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _to_dataframe(self):
        '''
        Read the report body (everything after the leading '#' lines).

        Returns
        -------
        tuple
            The dataframe (with default integer column labels) and the
            COLUMNS mapping describing the expected columns.

        Raises
        ------
        ValueError
            If the number of parsed columns differs from the number of
            entries in COLUMNS.
        '''
        num_headers = self._count_headers()
        df = pd.read_csv(
            self.path, sep='\t', header=None, skiprows=num_headers
        )
        if len(df.columns) != len(self.COLUMNS):
            raise ValueError(
                f'Length mismatch: expected {len(self.COLUMNS)} columns, '
                f'found {len(df.columns)}.'
            )
        return df, self.COLUMNS

    def _count_headers(self):
        '''
        kraken2-inspect adds several headers beginning with '#' which we
        wish to ignore.

        Only the leading run of '#' lines is counted: the result is used
        as `skiprows`, which drops rows from the top of the file, so a
        '#' line appearing after the first data row must not inflate it.
        '''
        num_headers = 0
        # Stream the file and stop at the first non-header line instead of
        # loading every line into memory.
        with open(self.path, 'r') as fh:
            for line in fh:
                if not line.startswith('#'):
                    break
                num_headers += 1
        return num_headers


# Directory format consisting of a single Kraken2DBReportFormat file
# named 'report.txt'.
Kraken2DBReportDirectoryFormat = model.SingleFileDirectoryFormat(
'Kraken2DBReportDirectoryFormat', 'report.txt', Kraken2DBReportFormat
)


class Kraken2OutputFormat(model.TextFileFormat):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand Down Expand Up @@ -152,5 +194,6 @@ def kmers_path_maker(self, read_len):

plugin.register_formats(
Kraken2ReportDirectoryFormat, Kraken2OutputDirectoryFormat,
Kraken2DBDirectoryFormat, BrackenDBDirectoryFormat
Kraken2DBDirectoryFormat, Kraken2DBReportDirectoryFormat,
BrackenDBDirectoryFormat
)
9 changes: 8 additions & 1 deletion q2_types_genomics/kraken2/_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import pandas as pd

from . import Kraken2ReportFormat, Kraken2OutputFormat
from . import Kraken2ReportFormat, Kraken2OutputFormat, Kraken2DBReportFormat
from ..plugin_setup import plugin


Expand All @@ -24,3 +24,10 @@ def _2(ff: Kraken2OutputFormat) -> pd.DataFrame:
df, cols = ff._to_dataframe()
df.columns = cols
return df


@plugin.register_transformer
def _3(ff: Kraken2DBReportFormat) -> pd.DataFrame:
    '''
    Transform a kraken2-inspect database report into a dataframe whose
    columns are labelled with the names declared by the format.
    '''
    frame, column_types = ff._to_dataframe()
    # The format returns a name -> type mapping; only the names are used
    # here, as column labels.
    frame.columns = list(column_types)
    return frame
10 changes: 8 additions & 2 deletions q2_types_genomics/kraken2/_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@

from . import (
Kraken2ReportDirectoryFormat, Kraken2OutputDirectoryFormat,
Kraken2DBDirectoryFormat, BrackenDBDirectoryFormat
Kraken2DBDirectoryFormat, Kraken2DBReportDirectoryFormat,
BrackenDBDirectoryFormat
)
from ..plugin_setup import plugin

Expand All @@ -25,10 +26,11 @@
variant_of=[SampleData.field['type'], FeatureData.field['type']]
)
Kraken2DB = SemanticType('Kraken2DB')
Kraken2DBReport = SemanticType('Kraken2DBReport')
BrackenDB = SemanticType('BrackenDB')

plugin.register_semantic_types(
Kraken2Reports, Kraken2Outputs, Kraken2DB, BrackenDB
Kraken2Reports, Kraken2Outputs, Kraken2DB, Kraken2DBReport, BrackenDB
)

plugin.register_semantic_type_to_format(
Expand All @@ -51,6 +53,10 @@
Kraken2DB,
artifact_format=Kraken2DBDirectoryFormat
)
# Associate the Kraken2DBReport semantic type with its directory format,
# Kraken2DBReportDirectoryFormat, as the artifact format.
plugin.register_semantic_type_to_format(
Kraken2DBReport,
artifact_format=Kraken2DBReportDirectoryFormat
)
plugin.register_semantic_type_to_format(
BrackenDB,
artifact_format=BrackenDBDirectoryFormat
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Database options: nucleotide db, k = 35, l = 31
# Spaced mask = 11111111111111111111111111111111110011001100110011001100110011
# Toggle mask = 1110001101111110001010001100010000100111000110110101101000101101
# Total taxonomy nodes: 46
# Table size: 26047
# Table capacity: 51565
# Min clear hash value = 0
100.00 26047 0 R 1 root
100.00 26047 0 R1 131567 cellular organisms
75.81 19746 0 D 2 Bacteria
75.81 19746 0 D1 1783272 Terrabacteria group
75.81 19746 0 P 1239 Bacillota
75.81 19746 0 C 91061 Bacilli
75.81 19746 0 O 1385 Bacillales
49.84 12983 0 F 90964 Staphylococcaceae
49.84 12983 0 G 1279 Staphylococcus
25.11 6540 6540 S 1282 Staphylococcus epidermidis
24.74 6443 6443 S 1280 Staphylococcus aureus
25.96 6763 0 F 186817 Bacillaceae
25.96 6763 0 G 1386 Bacillus
25.96 6763 0 G1 86661 Bacillus cereus group
25.96 6763 6763 S 1392 Bacillus anthracis
24.19 6301 0 D 2759 Eukaryota
24.19 6301 0 D1 33154 Opisthokonta
24.19 6301 0 K 33208 Metazoa
24.19 6301 0 K1 6072 Eumetazoa
24.19 6301 0 K2 33213 Bilateria
24.19 6301 0 K3 33511 Deuterostomia
24.19 6301 0 P 7711 Chordata
24.19 6301 0 P1 89593 Craniata
24.19 6301 0 P2 7742 Vertebrata
24.19 6301 0 P3 7776 Gnathostomata
24.19 6301 0 P4 117570 Teleostomi
24.19 6301 0 P5 117571 Euteleostomi
24.19 6301 0 P6 8287 Sarcopterygii
24.19 6301 0 P7 1338369 Dipnotetrapodomorpha
24.19 6301 0 P8 32523 Tetrapoda
24.19 6301 0 P9 32524 Amniota
24.19 6301 0 C 40674 Mammalia
24.19 6301 0 C1 32525 Theria
24.19 6301 0 C2 9347 Eutheria
24.19 6301 0 C3 1437010 Boreoeutheria
24.19 6301 0 C4 314146 Euarchontoglires
24.19 6301 0 C5 314147 Glires
24.19 6301 0 O 9989 Rodentia
24.19 6301 0 O1 1963758 Myomorpha
24.19 6301 0 O2 337687 Muroidea
24.19 6301 0 F 10066 Muridae
24.19 6301 0 F1 39107 Murinae
24.19 6301 0 G 10088 Mus
24.19 6301 0 G1 862507 Mus
24.19 6301 6301 S 10090 Mus musculus
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Database options: nucleotide db, k = 35, l = 31
# Spaced mask = 11111111111111111111111111111111110011001100110011001100110011
# Toggle mask = 1110001101111110001010001100010000100111000110110101101000101101
# Total taxonomy nodes: 46
# Table size: 26047
# Table capacity: 51565
# Min clear hash value = 0
26047 0 R 1 root
26047 0 R1 131567 cellular organisms
19746 0 D 2 Bacteria
19746 0 D1 1783272 Terrabacteria group
19746 0 P 1239 Bacillota
19746 0 C 91061 Bacilli
19746 0 O 1385 Bacillales
12983 0 F 90964 Staphylococcaceae
12983 0 G 1279 Staphylococcus
6540 6540 S 1282 Staphylococcus epidermidis
6443 6443 S 1280 Staphylococcus aureus
6763 0 F 186817 Bacillaceae
6763 0 G 1386 Bacillus
6763 0 G1 86661 Bacillus cereus group
6763 6763 S 1392 Bacillus anthracis
6301 0 D 2759 Eukaryota
6301 0 D1 33154 Opisthokonta
6301 0 K 33208 Metazoa
6301 0 K1 6072 Eumetazoa
6301 0 K2 33213 Bilateria
6301 0 K3 33511 Deuterostomia
6301 0 P 7711 Chordata
6301 0 P1 89593 Craniata
6301 0 P2 7742 Vertebrata
6301 0 P3 7776 Gnathostomata
6301 0 P4 117570 Teleostomi
6301 0 P5 117571 Euteleostomi
6301 0 P6 8287 Sarcopterygii
6301 0 P7 1338369 Dipnotetrapodomorpha
6301 0 P8 32523 Tetrapoda
6301 0 P9 32524 Amniota
6301 0 C 40674 Mammalia
6301 0 C1 32525 Theria
6301 0 C2 9347 Eutheria
6301 0 C3 1437010 Boreoeutheria
6301 0 C4 314146 Euarchontoglires
6301 0 C5 314147 Glires
6301 0 O 9989 Rodentia
6301 0 O1 1963758 Myomorpha
6301 0 O2 337687 Muroidea
6301 0 F 10066 Muridae
6301 0 F1 39107 Murinae
6301 0 G 10088 Mus
6301 0 G1 862507 Mus
6301 6301 S 10090 Mus musculus
46 changes: 46 additions & 0 deletions q2_types_genomics/kraken2/tests/data/db-reports/report-ok.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
perc_minimizers_covered,n_minimizers_covered,n_minimizers_assigned,rank,taxon_id,name
100.00,26047,0,R,1,root
100.00,26047,0,R1,131567, cellular organisms
75.81,19746,0,D,2, Bacteria
75.81,19746,0,D1,1783272, Terrabacteria group
75.81,19746,0,P,1239, Bacillota
75.81,19746,0,C,91061, Bacilli
75.81,19746,0,O,1385, Bacillales
49.84,12983,0,F,90964, Staphylococcaceae
49.84,12983,0,G,1279, Staphylococcus
25.11,6540,6540,S,1282, Staphylococcus epidermidis
24.74,6443,6443,S,1280, Staphylococcus aureus
25.96,6763,0,F,186817, Bacillaceae
25.96,6763,0,G,1386, Bacillus
25.96,6763,0,G1,86661, Bacillus cereus group
25.96,6763,6763,S,1392, Bacillus anthracis
24.19,6301,0,D,2759, Eukaryota
24.19,6301,0,D1,33154, Opisthokonta
24.19,6301,0,K,33208, Metazoa
24.19,6301,0,K1,6072, Eumetazoa
24.19,6301,0,K2,33213, Bilateria
24.19,6301,0,K3,33511, Deuterostomia
24.19,6301,0,P,7711, Chordata
24.19,6301,0,P1,89593, Craniata
24.19,6301,0,P2,7742, Vertebrata
24.19,6301,0,P3,7776, Gnathostomata
24.19,6301,0,P4,117570, Teleostomi
24.19,6301,0,P5,117571, Euteleostomi
24.19,6301,0,P6,8287, Sarcopterygii
24.19,6301,0,P7,1338369, Dipnotetrapodomorpha
24.19,6301,0,P8,32523, Tetrapoda
24.19,6301,0,P9,32524, Amniota
24.19,6301,0,C,40674, Mammalia
24.19,6301,0,C1,32525, Theria
24.19,6301,0,C2,9347, Eutheria
24.19,6301,0,C3,1437010, Boreoeutheria
24.19,6301,0,C4,314146, Euarchontoglires
24.19,6301,0,C5,314147, Glires
24.19,6301,0,O,9989, Rodentia
24.19,6301,0,O1,1963758, Myomorpha
24.19,6301,0,O2,337687, Muroidea
24.19,6301,0,F,10066, Muridae
24.19,6301,0,F1,39107, Murinae
24.19,6301,0,G,10088, Mus
24.19,6301,0,G1,862507, Mus
24.19,6301,6301,S,10090, Mus musculus
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Database options: nucleotide db, k = 35, l = 31
# Spaced mask = 11111111111111111111111111111111110011001100110011001100110011
# Toggle mask = 1110001101111110001010001100010000100111000110110101101000101101
# Total taxonomy nodes: 46
# Table size: 26047
# Table capacity: 51565
# Min clear hash value = 0
100 26047 0 R 1 root
100 26047 0 R1 131567 cellular organisms
75 19746 0 D 2 Bacteria
75 19746 0 D1 1783272 Terrabacteria group
75 19746 0 P 1239 Bacillota
75 19746 0 C 91061 Bacilli
75 19746 0 O 1385 Bacillales
49 12983 0 F 90964 Staphylococcaceae
49 12983 0 G 1279 Staphylococcus
25 6540 6540 S 1282 Staphylococcus epidermidis
24 6443 6443 S 1280 Staphylococcus aureus
25 6763 0 F 186817 Bacillaceae
25 6763 0 G 1386 Bacillus
25 6763 0 G1 86661 Bacillus cereus group
25 6763 6763 S 1392 Bacillus anthracis
24 6301 0 D 2759 Eukaryota
24 6301 0 D1 33154 Opisthokonta
24 6301 0 K 33208 Metazoa
24 6301 0 K1 6072 Eumetazoa
24 6301 0 K2 33213 Bilateria
24 6301 0 K3 33511 Deuterostomia
24 6301 0 P 7711 Chordata
24 6301 0 P1 89593 Craniata
24 6301 0 P2 7742 Vertebrata
24 6301 0 P3 7776 Gnathostomata
24 6301 0 P4 117570 Teleostomi
24 6301 0 P5 117571 Euteleostomi
24 6301 0 P6 8287 Sarcopterygii
24 6301 0 P7 1338369 Dipnotetrapodomorpha
24 6301 0 P8 32523 Tetrapoda
24 6301 0 P9 32524 Amniota
24 6301 0 C 40674 Mammalia
24 6301 0 C1 32525 Theria
24 6301 0 C2 9347 Eutheria
24 6301 0 C3 1437010 Boreoeutheria
24 6301 0 C4 314146 Euarchontoglires
24 6301 0 C5 314147 Glires
24 6301 0 O 9989 Rodentia
24 6301 0 O1 1963758 Myomorpha
24 6301 0 O2 337687 Muroidea
24 6301 0 F 10066 Muridae
24 6301 0 F1 39107 Murinae
24 6301 0 G 10088 Mus
24 6301 0 G1 862507 Mus
24 6301 6301 S 10090 Mus musculus
52 changes: 52 additions & 0 deletions q2_types_genomics/kraken2/tests/data/db-reports/report.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Database options: nucleotide db, k = 35, l = 31
# Spaced mask = 11111111111111111111111111111111110011001100110011001100110011
# Toggle mask = 1110001101111110001010001100010000100111000110110101101000101101
# Total taxonomy nodes: 46
# Table size: 26047
# Table capacity: 51565
# Min clear hash value = 0
100.00 26047 0 R 1 root
100.00 26047 0 R1 131567 cellular organisms
75.81 19746 0 D 2 Bacteria
75.81 19746 0 D1 1783272 Terrabacteria group
75.81 19746 0 P 1239 Bacillota
75.81 19746 0 C 91061 Bacilli
75.81 19746 0 O 1385 Bacillales
49.84 12983 0 F 90964 Staphylococcaceae
49.84 12983 0 G 1279 Staphylococcus
25.11 6540 6540 S 1282 Staphylococcus epidermidis
24.74 6443 6443 S 1280 Staphylococcus aureus
25.96 6763 0 F 186817 Bacillaceae
25.96 6763 0 G 1386 Bacillus
25.96 6763 0 G1 86661 Bacillus cereus group
25.96 6763 6763 S 1392 Bacillus anthracis
24.19 6301 0 D 2759 Eukaryota
24.19 6301 0 D1 33154 Opisthokonta
24.19 6301 0 K 33208 Metazoa
24.19 6301 0 K1 6072 Eumetazoa
24.19 6301 0 K2 33213 Bilateria
24.19 6301 0 K3 33511 Deuterostomia
24.19 6301 0 P 7711 Chordata
24.19 6301 0 P1 89593 Craniata
24.19 6301 0 P2 7742 Vertebrata
24.19 6301 0 P3 7776 Gnathostomata
24.19 6301 0 P4 117570 Teleostomi
24.19 6301 0 P5 117571 Euteleostomi
24.19 6301 0 P6 8287 Sarcopterygii
24.19 6301 0 P7 1338369 Dipnotetrapodomorpha
24.19 6301 0 P8 32523 Tetrapoda
24.19 6301 0 P9 32524 Amniota
24.19 6301 0 C 40674 Mammalia
24.19 6301 0 C1 32525 Theria
24.19 6301 0 C2 9347 Eutheria
24.19 6301 0 C3 1437010 Boreoeutheria
24.19 6301 0 C4 314146 Euarchontoglires
24.19 6301 0 C5 314147 Glires
24.19 6301 0 O 9989 Rodentia
24.19 6301 0 O1 1963758 Myomorpha
24.19 6301 0 O2 337687 Muroidea
24.19 6301 0 F 10066 Muridae
24.19 6301 0 F1 39107 Murinae
24.19 6301 0 G 10088 Mus
24.19 6301 0 G1 862507 Mus
24.19 6301 6301 S 10090 Mus musculus
Loading

0 comments on commit 6f8c301

Please sign in to comment.