Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merge triage #48

Merged
merged 6 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions barcode_validator/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ def align_to_hmm(self, sequence):
:param sequence: A BioPython SeqRecord object
:return: A BioPython SeqRecord object containing the aligned sequence
"""

# TODO: inside this method, the HMM file should be inferred from the `marker_code` attribute of the sequence.
# For example, if `marker_code` == 'COI-5P', the HMM file should be 'COI-5P.hmm', which should be set via
# self.hmmalign.set_hmmfile().
self.logger.info("Aligning sequence to HMM")
if len(sequence.seq) == 0:
self.logger.warning("Empty sequence provided for alignment")
Expand Down
5 changes: 5 additions & 0 deletions barcode_validator/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,11 @@ def validate_sequence_quality(self, config: Config, record: SeqRecord, result: D
self.logger.warning(f"Alignment failed for {result.process_id}")
result.error = f"Alignment failed for sequence '{record.seq}'"
else:

# TODO: make it so that the translation table is inferred from the BCDM annotations of the sequence.
# This should be a combination of the taxonomy and the marker code, where the latter should tell us
# if the marker is nuclear or mitochondrial and the former should tell us the translation table.
# https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
amino_acid_sequence = sh.translate_sequence(aligned_sequence, config.get('translation_table'))
result.stop_codons = sh.get_stop_codons(amino_acid_sequence)
result.seq_length = sh.marker_seqlength(aligned_sequence)
Expand Down
250 changes: 250 additions & 0 deletions barcode_validator/triage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
import argparse
import logging
import os
import csv
import sys
from Bio import SeqIO
from pathlib import Path


def setup_logging(verbosity):
    """
    Configure the root logger's format and level from the -v flag count.

    Args:
        verbosity (int): Number of verbose flags (-v) provided:
            0: WARNING level (default)
            1: INFO level
            2+: DEBUG level (counts above 2 are capped at DEBUG)
    """
    # Translate the flag count into a logging level; anything beyond
    # two -v flags still means DEBUG.
    if verbosity <= 0:
        level = logging.WARNING
    elif verbosity == 1:
        level = logging.INFO
    else:
        level = logging.DEBUG

    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    logging.debug(f"Logging level set to: {logging.getLevelName(level)}")


def parse_args():
    """
    Parse command line arguments.

    Returns:
        argparse.Namespace: Object containing the following attributes:
            - tsv (str): Path to input TSV file with sequence metadata
            - fasta (str): Path to input FASTA file with sequences
            - good_dir (str): Path to output directory for good sequences
            - bad_dir (str): Path to output directory for bad sequences
            - verbose (int): Count of verbose flags (0-2)
    """
    parser = argparse.ArgumentParser(
        description='Triage FASTA sequences based on quality criteria.')
    # All four path arguments are mandatory; declare them from one table.
    required_options = (
        ('--tsv', 'Input TSV file with sequence metadata'),
        ('--fasta', 'Input FASTA file with sequences'),
        ('--good_dir', 'Output directory for good sequences'),
        ('--bad_dir', 'Output directory for bad sequences'),
    )
    for flag, description in required_options:
        parser.add_argument(flag, required=True, help=description)
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help='Increase output verbosity (use -v for INFO, -v -v for DEBUG)')
    return parser.parse_args()


def read_tsv_data(tsv_file):
    """
    Read TSV file and return data as dictionary keyed by process_id.

    Args:
        tsv_file (str): Path to the input TSV file containing sequence metadata.

    Returns:
        dict: Dictionary where keys are process_ids and values are dictionaries
            containing the corresponding row data from the TSV file.

    Raises:
        SystemExit: If the TSV file has no 'process_id' column.
    """
    data = {}
    with open(tsv_file, 'r') as f:
        reader = csv.DictReader(f, delimiter='\t')
        logging.debug(f"Reading TSV file: {tsv_file}")
        # Validate the header once up front: DictReader rows always carry
        # every header column, so the per-row KeyError the old code caught
        # could only ever come from a malformed header. Report via logging
        # (not print-to-stdout) but keep the exit(1) contract.
        if reader.fieldnames is None or 'process_id' not in reader.fieldnames:
            logging.error(f"Error: Missing process_id in file {tsv_file}")
            sys.exit(1)
        for row in reader:
            data[row['process_id']] = row
    logging.debug(f"Read {len(data)} records from TSV file")
    return data


def check_sequence(seq_id, metadata):
    """
    Check whether a sequence meets all quality criteria.

    Args:
        seq_id (str): Identifier of the sequence being checked.
        metadata (dict): Dictionary containing sequence metadata with the keys:
            - nuc_basecount (str): Number of nucleotide bases
            - stop_codons (str): Number of stop codons
            - ambig_basecount (str): Number of ambiguous bases
            - obs_taxon (str): Comma-separated list of observed taxa
            - identification (str): Expected taxon identification

    Returns:
        tuple: A pair containing:
            - bool: True if the sequence passes all criteria, False otherwise
            - list: Names of the criteria that failed (empty if all passed)
    """
    failures = []
    logging.debug(f"Checking sequence {seq_id}")

    # Criterion 1: at least 500 nucleotide bases ('None' is treated as 0).
    raw_nuc = metadata['nuc_basecount']
    nuc_count = int(raw_nuc) if raw_nuc != 'None' else 0
    if nuc_count < 500:
        logging.warning(f"Sequence {seq_id} fails nucleotide base count check ({nuc_count} bases)")
        failures.append('nuc_basecount')
    else:
        logging.info(f"Sequence {seq_id} passes nucleotide base count check ({nuc_count} bases)")

    # Criterion 2: no stop codons at all.
    stop_count = int(metadata['stop_codons'])
    if stop_count:
        logging.warning(f"Sequence {seq_id} fails stop codons check ({stop_count} stop codons)")
        failures.append('stop_codons')
    else:
        logging.info(f"Sequence {seq_id} passes stop codons check (no stop codons)")

    # Criterion 3: at most 6 ambiguous bases ('None' is treated as failing).
    raw_ambig = metadata['ambig_basecount']
    ambig_count = int(raw_ambig) if raw_ambig != 'None' else 7
    if ambig_count > 6:
        logging.warning(f"Sequence {seq_id} fails ambiguous base count check ({ambig_count} ambiguous bases)")
        failures.append('ambig_basecount')
    else:
        logging.info(f"Sequence {seq_id} passes ambiguous base count check ({ambig_count} ambiguous bases)")

    # Criterion 4: the expected identification must appear among observed taxa.
    obs_taxa = {taxon.strip() for taxon in metadata['obs_taxon'].split(',')}
    if metadata['identification'] not in obs_taxa:
        logging.warning(
            f"Sequence {seq_id} fails taxonomy check (identification '{metadata['identification']}' "
            f"not found in observed taxa: {', '.join(obs_taxa)})"
        )
        failures.append('taxonomy')
    else:
        logging.info(f"Sequence {seq_id} passes taxonomy check (identification: {metadata['identification']})")

    logging.debug(
        f"Sequence {seq_id} check complete. Failed criteria: {', '.join(failures) if failures else 'none'}")
    return not failures, failures


def process_sequences(args):
    """
    Process sequences and write to appropriate output directories.

    Args:
        args (argparse.Namespace): Command line arguments containing:
            - tsv (str): Path to input TSV file
            - fasta (str): Path to input FASTA file
            - good_dir (str): Path to output directory for good sequences
            - bad_dir (str): Path to output directory for bad sequences

    Side effects:
        - Creates output directories if they don't exist
        - Writes filtered FASTA and TSV files to good_dir and bad_dir
        - Logs processing results using the logging module
    """
    # Create output directories if they don't exist
    Path(args.good_dir).mkdir(parents=True, exist_ok=True)
    Path(args.bad_dir).mkdir(parents=True, exist_ok=True)
    logging.debug(f"Created output directories: {args.good_dir}, {args.bad_dir}")

    # Read TSV metadata keyed by process_id
    metadata = read_tsv_data(args.tsv)

    # Output files reuse the input base names inside each target directory
    good_fasta = os.path.join(args.good_dir, os.path.basename(args.fasta))
    bad_fasta = os.path.join(args.bad_dir, os.path.basename(args.fasta))
    good_tsv = os.path.join(args.good_dir, os.path.basename(args.tsv))
    bad_tsv = os.path.join(args.bad_dir, os.path.basename(args.tsv))

    logging.debug(f"Output files prepared:\n"
                  f"Good FASTA: {good_fasta}\n"
                  f"Bad FASTA: {bad_fasta}\n"
                  f"Good TSV: {good_tsv}\n"
                  f"Bad TSV: {bad_tsv}")

    # Partition sequences and their metadata rows into good/bad sets
    good_records = []
    bad_records = []
    good_metadata = []
    bad_metadata = []

    logging.info(f"Starting sequence processing from {args.fasta}")
    for record in SeqIO.parse(args.fasta, 'fasta'):
        # FASTA IDs look like '<process_id>_<suffix>'; the TSV is keyed by
        # the bare process_id, so strip everything after the first underscore
        seq_id = record.id.split('_')[0]
        logging.debug(f"Processing sequence: {seq_id}")

        if seq_id not in metadata:
            logging.error(f"No metadata found for sequence {seq_id}")
            continue
        try:
            passes_check, failed_criteria = check_sequence(seq_id, metadata[seq_id])
        except KeyError as e:
            # A required metadata column is missing for this row; skip it
            logging.error(f"Error processing sequence {seq_id}: {e} in {args.tsv}")
            continue

        if passes_check:
            good_records.append(record)
            good_metadata.append(metadata[seq_id])
        else:
            bad_records.append(record)
            bad_metadata.append(metadata[seq_id])

    # Write FASTA files
    SeqIO.write(good_records, good_fasta, 'fasta')
    SeqIO.write(bad_records, bad_fasta, 'fasta')
    logging.debug(f"Written FASTA files:\n"
                  f"Good sequences: {good_fasta} ({len(good_records)} records)\n"
                  f"Bad sequences: {bad_fasta} ({len(bad_records)} records)")

    # Take the output column order from the TSV header. The previous approach
    # (next(csv.DictReader(open(...)))) leaked the file handle and consumed the
    # first *data* row, raising StopIteration on a TSV with a header but no rows.
    with open(args.tsv, 'r', newline='') as f:
        fieldnames = csv.DictReader(f, delimiter='\t').fieldnames

    def write_tsv(filename, data):
        # Write one filtered metadata table, preserving the input column order
        with open(filename, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
            writer.writeheader()
            writer.writerows(data)
        logging.debug(f"Written TSV file: {filename} ({len(data)} records)")

    write_tsv(good_tsv, good_metadata)
    write_tsv(bad_tsv, bad_metadata)

    logging.info(
        f"Processing complete. Processed {len(good_records)} good sequences and {len(bad_records)} bad sequences")


def main():
    """
    Script entry point.

    Parses the command line arguments, configures logging according to the
    requested verbosity, then triages the sequences on quality criteria.
    """
    cli_args = parse_args()
    setup_logging(cli_args.verbose)
    process_sequences(cli_args)


# Allow the module to be executed directly as a script
if __name__ == "__main__":
    main()
Loading
Loading