From 8fe60976148b9e726a8bdce738817855f6b0978a Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 24 Jan 2025 13:47:46 -0600 Subject: [PATCH] Add min_distance_between_ranges parameter to query functions for better range management --- src/impg.rs | 47 +++++++++++++++++++++++++++++++++++++++++------ src/main.rs | 23 +++++++++++++++++------ src/partition.rs | 3 ++- 3 files changed, 60 insertions(+), 13 deletions(-) diff --git a/src/impg.rs b/src/impg.rs index a55ac1e..0d3b212 100644 --- a/src/impg.rs +++ b/src/impg.rs @@ -372,6 +372,7 @@ impl Impg { masked_regions: Option<&FxHashMap>, max_depth: u16, min_interval_size: u32, + min_distance_between_ranges: u32, ) -> Vec { let mut results = Vec::new(); // Add the input range to the results @@ -440,14 +441,48 @@ impl Impg { // Only add non-overlapping portions to the stack for further exploration if metadata.query_id != current_target_id { let ranges = visited_ranges.entry(metadata.query_id) - .or_default(); // Note the closure here + .or_default(); + + let mut should_add = true; + + if min_distance_between_ranges > 0 { + let (new_min, new_max) = if adjusted_query_start <= adjusted_query_end { + (adjusted_query_start, adjusted_query_end) + } else { + (adjusted_query_end, adjusted_query_start) + }; + + // Find insertion point in sorted ranges + let idx = match ranges.ranges.binary_search_by_key(&new_min, |&(start, _)| start) { + Ok(i) => i, + Err(i) => i, + }; + + // Only need to check adjacent ranges due to sorting + if idx > 0 { + // Check previous range + let (_, prev_end) = ranges.ranges[idx - 1]; + if ((new_min - prev_end).abs() as u32) < min_distance_between_ranges { + should_add = false; + } + } + if should_add && idx < ranges.ranges.len() { + // Check next range + let (next_start, _) = ranges.ranges[idx]; + if ((next_start - new_max).abs() as u32) < min_distance_between_ranges { + should_add = false; + } + } + } - let new_ranges = ranges.insert((adjusted_query_start, adjusted_query_end)); + if should_add { + let new_ranges = ranges.insert((adjusted_query_start, adjusted_query_end)); - // Add non-overlapping portions to stack - for (new_start, new_end) in new_ranges { - if (new_end - new_start).abs() as u32 >= min_interval_size { - stack.push((metadata.query_id, new_start, new_end, current_depth + 1)); + // Add non-overlapping portions to stack + for (new_start, new_end) in new_ranges { + if ((new_end - new_start).abs() as u32) >= min_interval_size { + stack.push((metadata.query_id, new_start, new_end, current_depth + 1)); + } } } } diff --git a/src/main.rs b/src/main.rs index 20841bf..ab03e02 100644 --- a/src/main.rs +++ b/src/main.rs @@ -63,6 +63,10 @@ enum Args { /// Minimum size of intervals to consider for transitive queries #[clap(long, value_parser, default_value_t = 0)] min_interval_size: u32, + + /// Minimum distance between transitive ranges to consider on the same sequence + #[clap(long, value_parser, default_value_t = 0)] + min_distance_between_ranges: u32, }, /// Query overlaps in the alignment. Query { @@ -88,6 +92,10 @@ enum Args { /// Minimum size of intervals to consider for transitive queries #[clap(long, value_parser, default_value_t = 0)] min_interval_size: u32, + + /// Minimum distance between transitive ranges to consider on the same sequence + #[clap(long, value_parser, default_value_t = 0)] + min_distance_between_ranges: u32, /// Output results in PAF format. #[clap(short = 'P', long, action)] @@ -116,9 +124,10 @@ fn main() -> io::Result<()> { merge_distance, max_depth, min_interval_size, + min_distance_between_ranges, } => { let impg = initialize_impg(&common)?; - partition_alignments(&impg, window_size, &sequence_prefix, min_length, merge_distance, max_depth, min_interval_size, common.verbose > 1)?; + partition_alignments(&impg, window_size, &sequence_prefix, min_length, merge_distance, max_depth, min_interval_size, min_distance_between_ranges, common.verbose > 1)?; }, Args::Query { common, @@ -128,13 +137,14 @@ fn main() -> io::Result<()> { output_paf, check_intervals, max_depth, - min_interval_size + min_interval_size, + min_distance_between_ranges } => { let impg = initialize_impg(&common)?; if let Some(target_range) = target_range { let (target_name, target_range) = parse_target_range(&target_range)?; - let results = perform_query(&impg, &target_name, target_range, transitive, max_depth, min_interval_size); + let results = perform_query(&impg, &target_name, target_range, transitive, max_depth, min_interval_size, min_distance_between_ranges); if check_intervals { let invalid_cigars = impg::impg::check_intervals(&impg, &results); if !invalid_cigars.is_empty() { @@ -155,7 +165,7 @@ fn main() -> io::Result<()> { let targets = parse_bed_file(&target_bed)?; info!("Parsed {} target ranges from BED file", targets.len()); for (target_name, target_range, name) in targets { - let results = perform_query(&impg, &target_name, target_range, transitive, max_depth, min_interval_size); + let results = perform_query(&impg, &target_name, target_range, transitive, max_depth, min_interval_size, min_distance_between_ranges); if check_intervals { let invalid_cigars = impg::impg::check_intervals(&impg, &results); if !invalid_cigars.is_empty() { @@ -316,7 +326,8 @@ fn perform_query( target_range: (i32, i32), transitive: bool, max_depth: u16, - min_interval_size: u32 + min_interval_size: u32, + min_distance_between_ranges: u32 ) -> Vec { let (target_start, target_end) = target_range; let target_id = impg.seq_index.get_id(target_name).expect("Target name not found in index"); @@ -325,7 +336,7 @@ fn perform_query( panic!("Target range end ({}) exceeds the target sequence length ({})", target_end, target_length); } if transitive { - impg.query_transitive(target_id, target_start, target_end, None, max_depth, min_interval_size) + impg.query_transitive(target_id, target_start, target_end, None, max_depth, min_interval_size, min_distance_between_ranges) } else { impg.query(target_id, target_start, target_end) } diff --git a/src/partition.rs b/src/partition.rs index 637de2f..5bffe35 100644 --- a/src/partition.rs +++ b/src/partition.rs @@ -16,6 +16,7 @@ pub fn partition_alignments( merge_distance: usize, max_depth: u16, min_interval_size: u32, + min_distance_between_ranges: u32, debug: bool, ) -> io::Result<()> { // Get all sequences with the given prefix @@ -115,7 +116,7 @@ pub fn partition_alignments( // Query overlaps for current window //let query_start = Instant::now(); - let mut overlaps = impg.query_transitive(*seq_id, *start, *end, Some(&masked_regions), max_depth, min_interval_size); + let mut overlaps = impg.query_transitive(*seq_id, *start, *end, Some(&masked_regions), max_depth, min_interval_size, min_distance_between_ranges); //let query_time = query_start.elapsed(); debug!(" Collected {} query overlaps", overlaps.len());