diff --git a/barcode_validator/alignment.py b/barcode_validator/alignment.py
index 3b7be84..9403e28 100644
--- a/barcode_validator/alignment.py
+++ b/barcode_validator/alignment.py
@@ -47,6 +47,10 @@ def align_to_hmm(self, sequence):
         :param sequence: A BioPython SeqRecord object
         :return: A BioPython SeqRecord object containing the aligned sequence
         """
+
+        # TODO: inside this method, the HMM file should be inferred from the `marker_code` attribute of the sequence.
+        # For example, if `marker_code` == 'COI-5P', the HMM file should be 'COI-5P.hmm', which should be set via
+        # self.hmmalign.set_hmmfile().
         self.logger.info("Aligning sequence to HMM")
         if len(sequence.seq) == 0:
             self.logger.warning("Empty sequence provided for alignment")
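A minimal sketch of what this TODO could look like, assuming the marker code travels in the SeqRecord's `annotations` dict and that `self.hmmalign` exposes the `set_hmmfile()` setter named in the comment; the helper name and `hmm_dir` parameter are hypothetical:

```python
import os

def _infer_hmm_file(self, sequence, hmm_dir='hmm'):
    """Derive the HMM file from the record's marker code, e.g. 'COI-5P' -> 'COI-5P.hmm'."""
    marker_code = sequence.annotations['marker_code']  # assumed location of the marker code
    hmm_file = os.path.join(hmm_dir, f'{marker_code}.hmm')
    self.hmmalign.set_hmmfile(hmm_file)  # setter named in the TODO above
    return hmm_file
```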
diff --git a/barcode_validator/core.py b/barcode_validator/core.py
index ac66964..1f97b36 100644
--- a/barcode_validator/core.py
+++ b/barcode_validator/core.py
@@ -161,6 +161,11 @@ def validate_sequence_quality(self, config: Config, record: SeqRecord, result: D
             self.logger.warning(f"Alignment failed for {result.process_id}")
             result.error = f"Alignment failed for sequence '{record.seq}'"
         else:
+
+            # TODO: make it so that the translation table is inferred from the BCDM annotations of the sequence.
+            # This should be a combination of the taxonomy and the marker code, where the latter should tell us
+            # if the marker is nuclear or mitochondrial and the former should tell us the translation table.
+            # https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
             amino_acid_sequence = sh.translate_sequence(aligned_sequence, config.get('translation_table'))
             result.stop_codons = sh.get_stop_codons(amino_acid_sequence)
             result.seq_length = sh.marker_seqlength(aligned_sequence)
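A hedged sketch of the lookup this TODO describes. The table numbers come from the NCBI genetic-code page linked in the comment; the marker set, function name, and `lineage` parameter are illustrative assumptions, not existing code:

```python
MITOCHONDRIAL_MARKERS = {'COI-5P', 'COI-3P', 'CYTB'}  # assumed marker codes

def infer_translation_table(marker_code, lineage):
    """Pick an NCBI translation table from the marker code and a root-to-tip list of taxon names."""
    if marker_code not in MITOCHONDRIAL_MARKERS:
        return 1  # standard code for nuclear markers
    if 'Vertebrata' in lineage:
        return 2  # vertebrate mitochondrial
    if 'Echinodermata' in lineage or 'Platyhelminthes' in lineage:
        return 9  # echinoderm and flatworm mitochondrial
    return 5  # invertebrate mitochondrial, the usual default for COI barcodes
```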
diff --git a/barcode_validator/triage.py b/barcode_validator/triage.py
new file mode 100644
index 0000000..2573f49
--- /dev/null
+++ b/barcode_validator/triage.py
@@ -0,0 +1,250 @@
+import argparse
+import logging
+import os
+import csv
+import sys
+from Bio import SeqIO
+from pathlib import Path
+
+
+def setup_logging(verbosity):
+    """
+    Configure logging format and level based on verbosity flag count.
+
+    Args:
+        verbosity (int): Number of verbose flags (-v) provided:
+            0: WARNING level (default)
+            1: INFO level
+            2: DEBUG level
+    """
+    levels = {
+        0: logging.WARNING,
+        1: logging.INFO,
+        2: logging.DEBUG
+    }
+    # If verbosity is higher than 2, cap it at DEBUG level
+    level = levels.get(min(verbosity, 2), logging.DEBUG)
+
+    logging.basicConfig(
+        level=level,
+        format='%(asctime)s - %(levelname)s - %(message)s',
+        datefmt='%Y-%m-%d %H:%M:%S'
+    )
+    logging.debug(f"Logging level set to: {logging.getLevelName(level)}")
+
+
+def parse_args():
+    """
+    Parse command line arguments.
+
+    Returns:
+        argparse.Namespace: Object containing the following attributes:
+            - tsv (str): Path to input TSV file with sequence metadata
+            - fasta (str): Path to input FASTA file with sequences
+            - good_dir (str): Path to output directory for good sequences
+            - bad_dir (str): Path to output directory for bad sequences
+            - verbose (int): Count of verbose flags (0-2)
+    """
+    parser = argparse.ArgumentParser(description='Triage FASTA sequences based on quality criteria.')
+    parser.add_argument('--tsv', required=True, help='Input TSV file with sequence metadata')
+    parser.add_argument('--fasta', required=True, help='Input FASTA file with sequences')
+    parser.add_argument('--good_dir', required=True, help='Output directory for good sequences')
+    parser.add_argument('--bad_dir', required=True, help='Output directory for bad sequences')
+    parser.add_argument('-v', '--verbose', action='count', default=0,
+                        help='Increase output verbosity (use -v for INFO, -v -v for DEBUG)')
+    return parser.parse_args()
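For reference, a typical invocation of the new script might look like this (file and directory names are hypothetical):

```
python barcode_validator/triage.py \
    --tsv results.tsv \
    --fasta sequences.fasta \
    --good_dir data/good \
    --bad_dir data/bad \
    -v
```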
+
+
+def read_tsv_data(tsv_file):
+    """
+    Read TSV file and return data as dictionary keyed by process_id.
+
+    Args:
+        tsv_file (str): Path to the input TSV file containing sequence metadata.
+
+    Returns:
+        dict: Dictionary where keys are process_ids and values are dictionaries
+            containing the corresponding row data from the TSV file.
+    """
+    data = {}
+    with open(tsv_file, 'r') as f:
+        reader = csv.DictReader(f, delimiter='\t')
+        logging.debug(f"Reading TSV file: {tsv_file}")
+        for row in reader:
+            try:
+                data[row['process_id']] = row
+            except KeyError:
+                print(f"Error: Missing process_id in file {tsv_file}")
+                sys.exit(1)
+    logging.debug(f"Read {len(data)} records from TSV file")
+    return data
+
+
+def check_sequence(seq_id, metadata):
+    """
+    Check if sequence meets quality criteria.
+
+    Args:
+        seq_id (str): Identifier of the sequence being checked.
+        metadata (dict): Dictionary containing sequence metadata with the following keys:
+            - nuc_basecount (str): Number of nucleotide bases
+            - stop_codons (str): Number of stop codons
+            - ambig_basecount (str): Number of ambiguous bases
+            - obs_taxon (str): Comma-separated list of observed taxa
+            - identification (str): Expected taxon identification
+
+    Returns:
+        tuple: A pair containing:
+            - bool: True if sequence passes all criteria, False otherwise
+            - list: List of strings indicating which criteria failed (empty if all passed)
+    """
+    failed_criteria = []
+    logging.debug(f"Checking sequence {seq_id}")
+
+    # Check nuc_basecount >= 500
+    nuc_count = 0
+    if metadata['nuc_basecount'] != 'None':
+        nuc_count = int(metadata['nuc_basecount'])
+    if nuc_count >= 500:
+        logging.info(f"Sequence {seq_id} passes nucleotide base count check ({nuc_count} bases)")
+    else:
+        logging.warning(f"Sequence {seq_id} fails nucleotide base count check ({nuc_count} bases)")
+        failed_criteria.append('nuc_basecount')
+
+    # Check stop_codons == 0
+    stop_count = int(metadata['stop_codons'])
+    if stop_count == 0:
+        logging.info(f"Sequence {seq_id} passes stop codons check (no stop codons)")
+    else:
+        logging.warning(f"Sequence {seq_id} fails stop codons check ({stop_count} stop codons)")
+        failed_criteria.append('stop_codons')
+
+    # Check ambig_basecount <= 6
+    ambig_count = 7
+    if metadata['ambig_basecount'] != 'None':
+        ambig_count = int(metadata['ambig_basecount'])
+    if ambig_count <= 6:
+        logging.info(f"Sequence {seq_id} passes ambiguous base count check ({ambig_count} ambiguous bases)")
+    else:
+        logging.warning(f"Sequence {seq_id} fails ambiguous base count check ({ambig_count} ambiguous bases)")
+        failed_criteria.append('ambig_basecount')
+
+    # Check if identification is in obs_taxon
+    obs_taxa = {taxon.strip() for taxon in metadata['obs_taxon'].split(',')}
+    if metadata['identification'] in obs_taxa:
+        logging.info(f"Sequence {seq_id} passes taxonomy check (identification: {metadata['identification']})")
+    else:
+        logging.warning(
+            f"Sequence {seq_id} fails taxonomy check (identification '{metadata['identification']}' "
+            f"not found in observed taxa: {', '.join(obs_taxa)})"
+        )
+        failed_criteria.append('taxonomy')
+
+    logging.debug(
+        f"Sequence {seq_id} check complete. Failed criteria: {', '.join(failed_criteria) if failed_criteria else 'none'}")
+    return len(failed_criteria) == 0, failed_criteria
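The four checks above encode the triage criteria: at least 500 bases, zero stop codons, at most six ambiguous bases, and an identification that appears among the observed taxa. A hypothetical metadata row, in the all-strings form that `read_tsv_data()` returns, illustrates the return shape:

```python
row = {
    'nuc_basecount': '658',
    'stop_codons': '0',
    'ambig_basecount': '2',
    'obs_taxon': 'Apis mellifera, Apis cerana',
    'identification': 'Apis mellifera',
}
passed, failed = check_sequence('BGE0001', row)  # hypothetical process_id
assert passed is True and failed == []  # all four criteria satisfied
```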
+
+
+def process_sequences(args):
+    """
+    Process sequences and write to appropriate output directories.
+
+    Args:
+        args (argparse.Namespace): Command line arguments containing:
+            - tsv (str): Path to input TSV file
+            - fasta (str): Path to input FASTA file
+            - good_dir (str): Path to output directory for good sequences
+            - bad_dir (str): Path to output directory for bad sequences
+
+    Side effects:
+        - Creates output directories if they don't exist
+        - Writes filtered FASTA and TSV files to good_dir and bad_dir
+        - Logs processing results using the logging module
+    """
+    # Create output directories if they don't exist
+    Path(args.good_dir).mkdir(parents=True, exist_ok=True)
+    Path(args.bad_dir).mkdir(parents=True, exist_ok=True)
+    logging.debug(f"Created output directories: {args.good_dir}, {args.bad_dir}")
+
+    # Read TSV data
+    metadata = read_tsv_data(args.tsv)
+
+    # Prepare output files
+    good_fasta = os.path.join(args.good_dir, os.path.basename(args.fasta))
+    bad_fasta = os.path.join(args.bad_dir, os.path.basename(args.fasta))
+    good_tsv = os.path.join(args.good_dir, os.path.basename(args.tsv))
+    bad_tsv = os.path.join(args.bad_dir, os.path.basename(args.tsv))
+
+    logging.debug(f"Output files prepared:\n"
+                  f"Good FASTA: {good_fasta}\n"
+                  f"Bad FASTA: {bad_fasta}\n"
+                  f"Good TSV: {good_tsv}\n"
+                  f"Bad TSV: {bad_tsv}")
+
+    # Process sequences
+    good_records = []
+    bad_records = []
+    good_metadata = []
+    bad_metadata = []
+
+    logging.info(f"Starting sequence processing from {args.fasta}")
+    for record in SeqIO.parse(args.fasta, 'fasta'):
+        seq_id = record.id.split('_')[0]
+        logging.debug(f"Processing sequence: {seq_id}")
+
+        if seq_id not in metadata:
+            logging.error(f"No metadata found for sequence {seq_id}")
+            continue
+        try:
+            passes_check, failed_criteria = check_sequence(seq_id, metadata[seq_id])
+        except KeyError as e:
+            logging.error(f"Error processing sequence {seq_id}: {e} in {args.tsv}")
+            continue
+
+        if passes_check:
+            good_records.append(record)
+            good_metadata.append(metadata[seq_id])
+        else:
+            bad_records.append(record)
+            bad_metadata.append(metadata[seq_id])
+
+    # Write FASTA files
+    SeqIO.write(good_records, good_fasta, 'fasta')
+    SeqIO.write(bad_records, bad_fasta, 'fasta')
+    logging.debug(f"Written FASTA files:\n"
+                  f"Good sequences: {good_fasta} ({len(good_records)} records)\n"
+                  f"Bad sequences: {bad_fasta} ({len(bad_records)} records)")
+
+    # Write TSV files
+    fieldnames = next(csv.DictReader(open(args.tsv), delimiter='\t')).keys()
+
+    def write_tsv(filename, data):
+        with open(filename, 'w', newline='') as f:
+            writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
+            writer.writeheader()
+            writer.writerows(data)
+        logging.debug(f"Written TSV file: {filename} ({len(data)} records)")
+
+    write_tsv(good_tsv, good_metadata)
+    write_tsv(bad_tsv, bad_metadata)
+
+    logging.info(
+        f"Processing complete. Processed {len(good_records)} good sequences and {len(bad_records)} bad sequences")
+
+
+def main():
+    """
+    Main function to run the script.
+
+    This function:
+    1. Parses command line arguments
+    2. Sets up logging configuration based on verbosity level
+    3. Processes and triages sequences based on quality criteria
+    """
+    args = parse_args()
+    setup_logging(args.verbose)
+    process_sequences(args)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
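One nit on the hunk above: the `fieldnames` line opens `args.tsv` without ever closing it, and `next()` raises `StopIteration` on a TSV that has a header but no data rows. A possible tightening with the same result (a sketch reusing the surrounding `args`):

```python
import csv

# Read only the header row and close the handle promptly;
# DictReader.fieldnames consumes just the first line.
with open(args.tsv, newline='') as f:
    fieldnames = csv.DictReader(f, delimiter='\t').fieldnames
```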
diff --git a/examples/tsv_explorer/app.R b/examples/tsv_explorer/app.R
index f74bd9d..fbfcfea 100644
--- a/examples/tsv_explorer/app.R
+++ b/examples/tsv_explorer/app.R
@@ -5,6 +5,7 @@ library(dplyr)
 library(readr)
 library(httr)
 library(jsonlite)
+library(utils)
 
 # Define a mapping of column names to prettier terms
 pretty_names <- c(
@@ -16,8 +17,8 @@ pretty_names <- c(
 )
 
 # Function to get TSV files from GitHub
-get_github_tsv_files <- function() {
-  url <- "https://api.github.com/repos/naturalis/barcode_validator/contents/data?ref=pr-43"
+get_github_tsv_files <- function(organization) {
+  url <- paste0("https://api.github.com/repos/naturalis/barcode_validator/contents/data?ref=", organization)
   response <- GET(url)
   content <- content(response, "text")
   files <- fromJSON(content)
@@ -31,6 +32,9 @@ ui <- fluidPage(
 
   sidebarLayout(
     sidebarPanel(
+      selectInput("organization", "Select Organization",
+                  choices = c("naturalis", "nhm", "unifi"),
+                  selected = "naturalis"),
       selectizeInput("files", "Select TSV File(s)",
                      choices = NULL,
                      multiple = TRUE),
@@ -51,9 +55,9 @@ ui <- fluidPage(
 
 # Server
 server <- function(input, output, session) {
-  # Populate file list on app start
+  # Update file list when organization changes
   observe({
-    tsv_files <- get_github_tsv_files()
+    tsv_files <- get_github_tsv_files(input$organization)
     updateSelectizeInput(session, "files", choices = tsv_files)
   })
@@ -61,55 +65,89 @@ server <- function(input, output, session) {
     req(input$files)
 
     all_data <- lapply(input$files, function(file) {
-      url <- paste0("https://raw.githubusercontent.com/naturalis/barcode_validator/pr-43/data/", file)
-      df <- read_tsv(url, na = c("", "NA", "None"))
-
-      # Convert relevant columns to numeric, replacing 'None' with NA
-      numeric_cols <- names(pretty_names)
+      encoded_file <- URLencode(file, reserved = TRUE)
+      url <- paste0("https://raw.githubusercontent.com/naturalis/barcode_validator/",
+                    input$organization, "/data/", encoded_file)
+
+      tryCatch({
+        df <- read_tsv(url, na = c("", "NA", "None"))
+
+        # Convert relevant columns to numeric, replacing 'None' with NA
+        numeric_cols <- names(pretty_names)
+        df <- df %>%
+          mutate(across(all_of(numeric_cols), ~as.numeric(as.character(.))))
+
+        return(df)
+      }, error = function(e) {
+        warning(paste("Error reading file:", file, "-", e$message))
+        return(NULL)
+      })
+    })
 
-      df <- df %>%
-        mutate(across(all_of(numeric_cols), ~as.numeric(as.character(.))))
+    # Remove NULL entries from failed reads
+    all_data <- all_data[!sapply(all_data, is.null)]
 
-      return(df)
-    })
+    if (length(all_data) == 0) {
+      return(NULL)
+    }
 
     bind_rows(all_data)
   })
 
   output$histogram <- renderPlot({
     req(data())
+    if (nrow(data()) == 0) return(NULL)
+
     ggplot(data(), aes_string(x = input$field)) +
       geom_histogram(binwidth = function(x) diff(range(x, na.rm = TRUE))/30) +
       theme_minimal() +
       labs(title = paste("Histogram of", pretty_names[input$field]),
            x = pretty_names[input$field],
           y = "Count") +
-      scale_x_continuous(labels = scales::comma)  # Format x-axis labels
+      scale_x_continuous(labels = scales::comma)
   })
 
   output$summary_stats <- renderUI({
     req(data())
-
-    nuc_ambig_stat <- data() %>%
-      summarise(percentage = mean(nuc_basecount >= 500 & ambig_basecount <= 6, na.rm = TRUE) * 100) %>%
-      pull(percentage)
-
-    id_obs_stat <- data() %>%
-      rowwise() %>%
-      mutate(match = !is.na(identification) & !is.na(obs_taxon) &
-               identification %in% strsplit(obs_taxon, ",")[[1]]) %>%
-      ungroup() %>%
-      summarise(percentage = mean(match, na.rm = TRUE) * 100) %>%
-      pull(percentage)
-
-    HTML(paste(
-      "<b>BIN compliance (nuc_basecount >= 500 & ambig_basecount <= 6):</b>", round(nuc_ambig_stat, 2), "%<br>",
-      "<b>Taxonomy match (identification in obs_taxon):</b>", round(id_obs_stat, 2), "%"
-    ))
+    if (nrow(data()) == 0) {
+      return(HTML("<p>No valid data available for analysis.</p>"))
+    }
+
+    tryCatch({
+      # Calculate BIN compliance
+      nuc_ambig_stat <- data() %>%
+        summarise(percentage = mean(nuc_basecount >= 500 & ambig_basecount <= 6, na.rm = TRUE) * 100) %>%
+        pull(percentage)
+
+      # Calculate taxonomy match percentage with safer handling
+      id_obs_stat <- data() %>%
+        rowwise() %>%
+        mutate(
+          match = tryCatch({
+            if (is.na(identification) || is.na(obs_taxon)) {
+              FALSE
+            } else {
+              obs_taxa <- strsplit(obs_taxon, ",")[[1]]
+              identification %in% obs_taxa
+            }
+          }, error = function(e) FALSE)
+        ) %>%
+        ungroup() %>%
+        summarise(percentage = mean(match, na.rm = TRUE) * 100) %>%
+        pull(percentage)
+
+      # Format output
+      HTML(paste(
+        "<b>BIN compliance (nuc_basecount >= 500 & ambig_basecount <= 6):</b>", round(nuc_ambig_stat, 2), "%<br>",
+        "<b>Taxonomy match (identification in obs_taxon):</b>", round(id_obs_stat, 2), "%"
+      ))
+    }, error = function(e) {
+      HTML(paste("<p>Error calculating statistics:", htmlEscape(as.character(e)), "</p>"))
+    })
   })
 }