diff --git a/Cargo.toml b/Cargo.toml index 2201637..258072d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,3 +13,4 @@ num_cpus = "1.16.0" rayon = "1.9.0" serde = { version = "1.0.197", features = ["derive"] } noodles = { version = "0.66.0", features = ["bgzf"] } +regex = "1.10.4" diff --git a/src/impg.rs b/src/impg.rs index a289ae7..6c23e0c 100644 --- a/src/impg.rs +++ b/src/impg.rs @@ -7,6 +7,7 @@ use std::io::{Read, SeekFrom, Seek}; use std::fs::File; use rayon::prelude::*; use noodles::bgzf; +use regex::Regex; /// Parse a CIGAR string into a vector of CigarOp // Note that the query_delta is negative for reverse strand alignments @@ -100,7 +101,7 @@ impl QueryMetadata { } } -pub type QueryInterval = (Interval, Vec); +pub type AdjustedInterval = (Interval, Vec, Interval); type TreeMap = HashMap>; pub type SerializableImpg = (HashMap>, SequenceIndex); @@ -131,8 +132,8 @@ impl Impg { let mut seq_index = SequenceIndex::new(); for record in records { - seq_index.get_or_insert_id(&record.query_name, Some(record.target_length as u64)); - seq_index.get_or_insert_id(&record.target_name, Some(record.target_length as u64)); + seq_index.get_or_insert_id(&record.query_name, Some(record.target_length)); + seq_index.get_or_insert_id(&record.target_name, Some(record.target_length)); } let intervals: HashMap>> = records.par_iter() @@ -206,9 +207,9 @@ impl Impg { Self { trees, seq_index, paf_file: paf_file.to_string(), paf_gzi_index } } - pub fn query(&self, target_id: u32, range_start: i32, range_end: i32) -> Vec { + pub fn query(&self, target_id: u32, range_start: i32, range_end: i32) -> Vec { let mut results = Vec::new(); - // add the query interval to the results + // add the input range to the results results.push(( Interval { first: range_start, @@ -216,11 +217,17 @@ impl Impg { metadata: target_id, }, vec![CigarOp::new(range_end - range_start, '=')], + Interval { + first: range_start, + last: range_end, + metadata: 0 + } )); if let Some(tree) = self.trees.get(&target_id) { tree.query(range_start, range_end, |interval| { let metadata = &interval.metadata; - let (adjusted_start, adjusted_end, adjusted_cigar) = project_target_range_through_alignment( + let (adjusted_query_start, adjusted_query_end, adjusted_cigar, adjusted_target_start, adjusted_target_end) = + project_target_range_through_alignment( (range_start, range_end), (metadata.target_start, metadata.target_end, metadata.query_start, metadata.query_end, metadata.strand), &metadata.get_cigar_ops(&self.paf_file, self.paf_gzi_index.as_ref()) @@ -228,11 +235,16 @@ impl Impg { let adjusted_interval = ( Interval { - first: adjusted_start, - last: adjusted_end, - metadata: metadata.query_id, + first: adjusted_query_start, + last: adjusted_query_end, + metadata: metadata.query_id }, adjusted_cigar, + Interval { + first: adjusted_target_start, + last: adjusted_target_end, + metadata: 0 + } ); results.push(adjusted_interval); }); @@ -240,16 +252,21 @@ impl Impg { results } - pub fn query_transitive(&self, target_id: u32, range_start: i32, range_end: i32) -> Vec { + pub fn query_transitive(&self, target_id: u32, range_start: i32, range_end: i32) -> Vec { let mut results = Vec::new(); - // add the query interval to the results + // add the input range to the results results.push(( Interval { first: range_start, last: range_end, metadata: target_id, }, - vec![CigarOp::new(range_end - range_start, '=')] + vec![CigarOp::new(range_end - range_start, '=')], + Interval { + first: range_start, + last: range_end, + metadata: 0 + } )); let mut stack = vec![(target_id, range_start, range_end)]; let mut visited = HashSet::new(); @@ -258,24 +275,30 @@ impl Impg { if let Some(tree) = self.trees.get(¤t_target) { tree.query(current_start, current_end, |interval| { let metadata = &interval.metadata; - let (adjusted_start, adjusted_end, adjusted_cigar) = project_target_range_through_alignment( + let (adjusted_query_start, adjusted_query_end, adjusted_cigar, adjusted_target_start, adjusted_target_end) = + project_target_range_through_alignment( (current_start, current_end), (metadata.target_start, metadata.target_end, metadata.query_start, metadata.query_end, metadata.strand), - &metadata.get_cigar_ops(&self.paf_file, self.paf_gzi_index.as_ref()) + &metadata.get_cigar_ops(&self.paf_file, self.paf_gzi_index.as_ref()) ); let adjusted_interval = ( Interval { - first: adjusted_start, - last: adjusted_end, - metadata: metadata.query_id, + first: adjusted_query_start, + last: adjusted_query_end, + metadata: metadata.query_id }, adjusted_cigar, + Interval { + first: adjusted_target_start, + last: adjusted_target_end, + metadata: 0 + } ); results.push(adjusted_interval); if metadata.query_id != current_target { - let todo_range = (metadata.query_id, adjusted_start, adjusted_end); + let todo_range = (metadata.query_id, adjusted_query_start, adjusted_query_end); if !visited.insert(todo_range) { stack.push(todo_range); } @@ -291,9 +314,9 @@ impl Impg { fn project_target_range_through_alignment( target_range: (i32, i32), record: (i32, i32, i32, i32, Strand), - cigar_ops: &[CigarOp], -) -> (i32, i32, Vec) { - let (target_start, _target_end, query_start, query_end, strand) = record; + cigar_ops: &[CigarOp] +) -> (i32, i32, Vec, i32, i32) { + let (target_start, target_end, query_start, query_end, strand) = record; let mut target_pos = target_start; let mut query_pos = if strand == Strand::Forward { query_start } else { query_end }; @@ -302,29 +325,38 @@ fn project_target_range_through_alignment( let mut projected_end: Option = None; let mut projected_cigar = Vec::new(); + let mut new_target_start: Option = None; + let mut new_target_end: Option = None; + for cigar_op in cigar_ops { // If the target position is past the end of the range, we can stop if target_pos > target_range.1 { break; } match (cigar_op.target_delta(), cigar_op.query_delta(strand)) { - (0, query_delta) => { // Insertion in query + (0, query_delta) => { // Insertion in query (deletions in target) if target_pos >= target_range.0 && target_pos <= target_range.1 { projected_start.get_or_insert(query_pos); - projected_end = Some(query_pos + - if target_pos <= target_range.1 { 0 } else { query_delta }); + projected_end = Some(query_pos + query_delta); projected_cigar.push(CigarOp::new(query_delta.abs(), 'I')); + + new_target_start.get_or_insert(target_pos); + new_target_end = Some(target_pos); } query_pos += query_delta; }, - (target_delta, 0) => { // Deletion in target + (target_delta, 0) => { // Deletion in query (insertions in target) let overlap_start = target_pos.max(target_range.0); let overlap_end = (target_pos + target_delta).min(target_range.1); if overlap_start < overlap_end { // There's an overlap projected_start.get_or_insert(query_pos); projected_end = Some(query_pos); // Deletion does not advance query position + projected_cigar.push(CigarOp::new(overlap_end - overlap_start, 'D')); + + new_target_start.get_or_insert(overlap_start); + new_target_end = Some(overlap_end); } target_pos += target_delta; @@ -332,7 +364,6 @@ fn project_target_range_through_alignment( (target_delta, query_delta) => { // Match or mismatch let overlap_start = target_pos.max(target_range.0); let overlap_end = (target_pos + target_delta).min(target_range.1); - if overlap_start < overlap_end { // There's an overlap let overlap_length = overlap_end - overlap_start; let dir = if strand == Strand::Forward { 1 } else { -1 }; @@ -343,6 +374,9 @@ fn project_target_range_through_alignment( projected_end = Some(query_overlap_end); projected_cigar.push(CigarOp::new(overlap_length, cigar_op.op())); + + new_target_start.get_or_insert(overlap_start); + new_target_end = Some(overlap_end); } target_pos += target_delta; @@ -350,11 +384,13 @@ fn project_target_range_through_alignment( }, } } - + ( projected_start.unwrap_or(query_start), - projected_end.unwrap_or(query_pos), - projected_cigar + (projected_end.unwrap_or(query_pos)).min(query_end), + projected_cigar, + new_target_start.unwrap_or(target_start), + (new_target_end.unwrap_or(target_pos)).min(target_end), ) } @@ -377,6 +413,91 @@ fn parse_cigar_to_delta(cigar: &str) -> Result, ParseErr> { Ok(ops) } +fn is_valid_cigar(cigar: &[CigarOp]) -> Result<(), String> { + let cigar_str: String = cigar.iter().map(|op| format!("{}{}", op.len(), op.op())).collect(); + + let re = Regex::new(r"^(\d+[MX=ID])+$").unwrap(); + if !re.is_match(&cigar_str) { + return Err("Invalid format: non-standard or not-yet-supported operations, or formatting errors detected.".to_string()); + } + + let mut last_type = None; + for op in cigar { + let op_type = op.op(); + if let Some(last) = last_type { + if "ISHP".contains(last) && "ISHP".contains(op_type) { + return Err(format!("Consecutive non-reference-consuming operations detected: {} followed by {}.", last, op_type)); + } + } + last_type = Some(op_type); + } + + Ok(()) +} + +fn parse_cigar(cigar: &[CigarOp]) -> (i32, i32) { + let (query_length, target_length) = cigar.iter().fold((0, 0), |(query_len, target_len), op| { + let len = op.len(); + match op.op() { + 'M' | 'X' | '=' | 'E' => (query_len + len, target_len + len), + 'I' | 'S' => (query_len + len, target_len), + 'D' | 'N' => (query_len, target_len + len), + _ => (query_len, target_len), + } + }); + (query_length, target_length) +} + +pub fn check_intervals(impg: &Impg, results: &Vec) -> Vec<(String, String)> { + let mut invalid = Vec::new(); + + for (overlap_query, cigar, overlap_target) in results { + let query_name = impg.seq_index.get_name(overlap_query.metadata).unwrap(); + let query_len = impg.seq_index.get_len_from_id(overlap_query.metadata).unwrap(); + let target_name = impg.seq_index.get_name(overlap_target.metadata).unwrap(); + let target_len = impg.seq_index.get_len_from_id(overlap_target.metadata).unwrap(); + + let (query_start, query_end) = (overlap_query.first, overlap_query.last); + let (target_start, target_end) = (overlap_target.first, overlap_target.last); + + let full_cigar: String = cigar.iter().map(|op| format!("{}{}", op.len(), op.op())).collect(); + let first_chunk_cigar = if full_cigar.len() > 20 { + format!("{}...", &full_cigar[..20]) + } else { + full_cigar + }; + + let (calc_query_len, calc_target_len) = parse_cigar(cigar); + + let mut error_details = Vec::new(); + if calc_query_len != (query_end - query_start).abs() { + error_details.push(format!("Query length mismatch: expected {} from the query range [{}-{}), got {} from the CIGAR string", (query_end - query_start).abs(), query_start, query_end, calc_query_len)); + } + if calc_target_len != (target_end - target_start).abs() { + error_details.push(format!("Target length mismatch: expected {} from the target range [{}-{}), got {} from the CIGAR string", (target_end - target_start).abs(), target_start, target_end, calc_target_len)); + } + + match is_valid_cigar(cigar) { + Ok(()) => { + if !error_details.is_empty() { + let error_reason = error_details.join("; "); + invalid.push((format!("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", query_name, query_len, query_start, query_end, if query_start <= query_end { '+' } else { '-' }, target_name, target_len, target_start, target_end, first_chunk_cigar), error_reason)); + } + } + Err(error_msg) => { + let error_reason = if error_details.is_empty() { + error_msg + } else { + format!("{}; {}", error_msg, error_details.join("; ")) + }; + invalid.push((format!("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", query_name, query_len, query_start, query_end, if query_start <= query_end { '+' } else { '-' }, target_name, target_len, target_start, target_end, first_chunk_cigar), error_reason)); + } + } + } + + invalid +} + #[cfg(test)] mod tests { use super::*; @@ -388,11 +509,9 @@ mod tests { let target_range = (100, 200); let record = (100, 200, 0, 100, Strand::Forward); let cigar_ops = vec![CigarOp::new(100, '=')]; - let (start, end, cigar) = project_target_range_through_alignment(target_range, record, &cigar_ops); + let result = project_target_range_through_alignment(target_range, record, &cigar_ops); - assert_eq!(start, 0); - assert_eq!(end, 100); - assert_eq!(cigar, cigar_ops); + assert_eq!(result, (0, 100, cigar_ops.clone(), 100, 200)); } #[test] @@ -400,12 +519,9 @@ mod tests { let target_range = (100, 200); let record = (100, 200, 0, 100, Strand::Reverse); let cigar_ops = vec![CigarOp::new(100, '=')]; - let (start, end, cigar) = project_target_range_through_alignment(target_range, record, &cigar_ops); - println!("Final result: ({}, {})", start, end); + let result = project_target_range_through_alignment(target_range, record, &cigar_ops); - assert_eq!(start, 100); - assert_eq!(end, 0); - assert_eq!(cigar, cigar_ops); + assert_eq!(result, (100, 0, cigar_ops.clone(), 100, 200)); } #[test] @@ -421,19 +537,19 @@ mod tests { let base = (0, 100, 50, 200, Strand::Forward); { let result = project_target_range_through_alignment((0, 100), base, &cigar_ops); - assert_eq!(result, (50, 200, cigar_ops.clone())); + assert_eq!(result, (50, 200, cigar_ops.clone(), 0, 100)); } { let result = project_target_range_through_alignment((50, 55), base, &cigar_ops); - assert_eq!(result, (100, 105, vec![CigarOp::new(5, '=')])); + assert_eq!(result, (100, 105, vec![CigarOp::new(5, '=')], 50, 55)); } { let result = project_target_range_through_alignment((50, 64), base, &cigar_ops); - assert_eq!(result, (100, 114, vec![CigarOp::new(14, '=')])); + assert_eq!(result, (100, 114, vec![CigarOp::new(14, '=')], 50, 64)); } { let result = project_target_range_through_alignment((65, 65), base, &cigar_ops); - assert_eq!(result, (115, 115, vec![CigarOp::new(50, 'I')])); + assert_eq!(result, (115, 165, vec![CigarOp::new(50, 'I')], 65, 65)); } { let result = project_target_range_through_alignment((50, 65), base, &cigar_ops); @@ -441,7 +557,7 @@ mod tests { CigarOp::new(15, '='), CigarOp::new(50, 'I') ]; - assert_eq!(result, (100, 115, cigar_ops)); + assert_eq!(result, (100, 165, cigar_ops, 50, 65)); } { let result = project_target_range_through_alignment((50, 66), base, &cigar_ops); @@ -450,11 +566,11 @@ mod tests { CigarOp::new(50, 'I'), CigarOp::new(1, '=') ]; - assert_eq!(result, (100, 166, cigar_ops)); + assert_eq!(result, (100, 166, cigar_ops, 50, 66)); } { let result = project_target_range_through_alignment((70, 95), base, &cigar_ops); - assert_eq!(result, (170, 195, vec![CigarOp::new(25, '=')])); + assert_eq!(result, (170, 195, vec![CigarOp::new(25, '=')], 70, 95)); } } @@ -464,8 +580,8 @@ mod tests { let target_range = (100, 200); let record = (100, 200, 100, 200, Strand::Forward); let cigar_ops = vec![CigarOp::new(100, '=')]; - let (start, end, cigar) = project_target_range_through_alignment(target_range, record, &cigar_ops); - assert_eq!((start, end, cigar), (100, 200, vec![CigarOp::new(100, '=')])); + let (query_start, query_end, cigar, target_start, target_end) = project_target_range_through_alignment(target_range, record, &cigar_ops); + assert_eq!((query_start, query_end, cigar, target_start, target_end), (100, 200, vec![CigarOp::new(100, '=')], 100, 200)); } // 2. Simple Reverse Projection @@ -474,8 +590,8 @@ mod tests { let target_range = (100, 200); let record = (100, 200, 100, 200, Strand::Reverse); let cigar_ops = vec![CigarOp::new(100, '=')]; - let (start, end, cigar) = project_target_range_through_alignment(target_range, record, &cigar_ops); - assert_eq!((start, end, cigar), (200, 100, vec![CigarOp::new(100, '=')])); // Adjust for reverse calculation + let (query_start, query_end, cigar, target_start, target_end) = project_target_range_through_alignment(target_range, record, &cigar_ops); + assert_eq!((query_start, query_end, cigar, target_start, target_end), (200, 100, vec![CigarOp::new(100, '=')], 100, 200)); // Adjust for reverse calculation } // 3. Forward Projection with Insertions @@ -488,7 +604,7 @@ mod tests { CigarOp::new(10, 'I'), // Insertion CigarOp::new(50, '='), // Match ]; - let (start, end, cigar) = project_target_range_through_alignment(target_range, record, &cigar_ops); + let (start, end, cigar, _, _) = project_target_range_through_alignment(target_range, record, &cigar_ops); assert_eq!((start, end, cigar), (50, 160, cigar_ops)); } @@ -502,7 +618,7 @@ mod tests { CigarOp::new(10, 'D'), // Deletion CigarOp::new(40, '='), // Match ]; - let (start, end, cigar) = project_target_range_through_alignment(target_range, record, &cigar_ops); + let (start, end, cigar, _, _) = project_target_range_through_alignment(target_range, record, &cigar_ops); assert_eq!((start, end, cigar), (50, 140, cigar_ops)); } @@ -517,7 +633,7 @@ mod tests { CigarOp::new(10, 'I'), // 150, 250 CigarOp::new(40, '='), // 150, 250 ]; - let (start, end, cigar) = project_target_range_through_alignment(target_range, record, &cigar_ops); + let (start, end, cigar, _, _) = project_target_range_through_alignment(target_range, record, &cigar_ops); let cigar_ops = vec![ CigarOp::new(10, 'D'), // 150, 260 CigarOp::new(10, 'I'), // 150, 250 @@ -534,15 +650,14 @@ mod tests { let cigar_ops = vec![ CigarOp::new(10, '='), // Match CigarOp::new(20, 'D'), // Deletion in target - CigarOp::new(8, '='), // Match - CigarOp::new(1, 'X'), // Match - CigarOp::new(1, '='), // Match + CigarOp::new(8, '='), // Match + CigarOp::new(1, 'X'), // Match + CigarOp::new(1, '='), // Match CigarOp::new(10, 'I'), // Insertion in query CigarOp::new(10, '='), // Match ]; - let (start, end, cigar) = project_target_range_through_alignment(target_range, record, &cigar_ops); - println!("{} {}", start, end); - assert_eq!((start, end, cigar), (0, 10, vec![CigarOp::new(10, '=')])); + let (query_start, query_end, cigar, target_start, target_end) = project_target_range_through_alignment(target_range, record, &cigar_ops); + assert_eq!((query_start, query_end, cigar, target_start, target_end), (0, 10, vec![CigarOp::new(10, '=')], 0, 10)); } #[test] diff --git a/src/main.rs b/src/main.rs index 55db12b..4fe8df8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,7 +3,7 @@ use std::fs::File; use std::io::{self, BufReader, BufWriter}; use std::num::NonZeroUsize; use noodles::bgzf; -use impg::impg::{Impg, SerializableImpg, QueryInterval}; +use impg::impg::{Impg, SerializableImpg, AdjustedInterval, check_intervals}; use coitrees::IntervalTree; use impg::paf; use rayon::ThreadPoolBuilder; @@ -44,6 +44,10 @@ struct Args { /// Number of threads for parallel processing. #[clap(short='t', long, value_parser, default_value_t = NonZeroUsize::new(1).unwrap())] num_threads: NonZeroUsize, + + /// Check the projected intervals, reporting the wrong ones (slow, useful for debugging). + #[clap(short='c', long, action)] + check_intervals: bool, } fn main() -> io::Result<()> { @@ -65,8 +69,17 @@ fn main() -> io::Result<()> { if let Some(target_range) = args.target_range { let (target_name, target_range) = parse_target_range(&target_range)?; let results = perform_query(&impg, &target_name, target_range, args.transitive); + if args.check_intervals { + let invalid_cigars = check_intervals(&impg, &results); + if !invalid_cigars.is_empty() { + for (row, error_reason) in invalid_cigars { + eprintln!("{}; {}", error_reason, row); + } + panic!("Invalid intervals encountered."); + } + } if args.output_paf { - output_results_paf(&impg, results, &target_name, target_range, None); + output_results_paf(&impg, results, &target_name, None); } else { output_results_bed(&impg, results); } @@ -74,10 +87,19 @@ fn main() -> io::Result<()> { let targets = parse_bed_file(&target_bed)?; for (target_name, target_range, name) in targets { let results = perform_query(&impg, &target_name, target_range, args.transitive); + if args.check_intervals { + let invalid_cigars = check_intervals(&impg, &results); + if !invalid_cigars.is_empty() { + for (row, error_reason) in invalid_cigars { + eprintln!("{}; {}", error_reason, row); + } + panic!("Invalid intervals encountered."); + } + } if args.output_paf { - output_results_paf(&impg, results, &target_name, target_range, name); + output_results_paf(&impg, results, &target_name, name); } else { - output_results_bedpe(&impg, results, &target_name, target_range, name); + output_results_bedpe(&impg, results, &target_name, name); } } } @@ -87,7 +109,7 @@ fn main() -> io::Result<()> { fn parse_bed_file(bed_file: &str) -> io::Result)>> { let file = File::open(bed_file)?; let reader = BufReader::new(file); - let mut queries = Vec::new(); + let mut ranges = Vec::new(); for line in reader.lines() { let line = line?; @@ -96,20 +118,38 @@ fn parse_bed_file(bed_file: &str) -> io::Result().map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "Invalid start value"))?; - let end = parts[2].parse::().map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "Invalid end value"))?; + let (start, end) = parse_range(&parts[1..=2])?; + let name = parts.get(3).map(|s| s.to_string()); + ranges.push((parts[0].to_string(), (start, end), name)); + } - if start >= end { - return Err(io::Error::new(io::ErrorKind::InvalidInput, "Start value must be less than end value")); - } + Ok(ranges) +} - let name = if parts.len() > 3 { Some(parts[3].to_string()) } else { None }; - queries.push((parts[0].to_string(), (start, end), name)); +fn parse_target_range(target_range: &str) -> io::Result<(String, (i32, i32))> { + let parts: Vec<&str> = target_range.split(':').collect(); + if parts.len() != 2 { + return Err(io::Error::new(io::ErrorKind::InvalidInput, "Target range format should be `seq_name:start-end`")); } - Ok(queries) + let (start, end) = parse_range(&parts[1].split('-').collect::>())?; + Ok((parts[0].to_string(), (start, end))) } +fn parse_range(range_parts: &[&str]) -> io::Result<(i32, i32)> { + if range_parts.len() != 2 { + return Err(io::Error::new(io::ErrorKind::InvalidInput, "Range format should be `start-end`")); + } + + let start = range_parts[0].parse::().map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "Invalid start value"))?; + let end = range_parts[1].parse::().map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "Invalid end value"))?; + + if start >= end { + return Err(io::Error::new(io::ErrorKind::InvalidInput, "Start value must be less than end value")); + } + + Ok((start, end)) +} fn load_or_generate_index(paf_file: &str, num_threads: NonZeroUsize) -> io::Result { let index_file = format!("{}.impg", paf_file); @@ -148,27 +188,7 @@ fn load_index(paf_file: &str) -> io::Result { Ok(Impg::from_paf_and_serializable(paf_file, serializable)) } -fn parse_target_range(target_range: &str) -> io::Result<(String, (i32, i32))> { - let parts: Vec<&str> = target_range.split(':').collect(); - if parts.len() != 2 { - return Err(io::Error::new(io::ErrorKind::InvalidInput, "Target range format should be `seq_name:start-end`")); - } - let range_parts: Vec<&str> = parts[1].split('-').collect(); - if range_parts.len() != 2 { - return Err(io::Error::new(io::ErrorKind::InvalidInput, "Target range format should be `start-end`")); - } - - let start = range_parts[0].parse::().map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "Invalid start value"))?; - let end = range_parts[1].parse::().map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "Invalid end value"))?; - - if start >= end { - return Err(io::Error::new(io::ErrorKind::InvalidInput, "Start value must be less than end value")); - } - - Ok((parts[0].to_string(), (start, end))) -} - -fn perform_query(impg: &Impg, target_name: &str, target_range: (i32, i32), transitive: bool) -> Vec { +fn perform_query(impg: &Impg, target_name: &str, target_range: (i32, i32), transitive: bool) -> Vec { let (target_start, target_end) = target_range; let target_id = impg.seq_index.get_id(target_name).expect("Target name not found in index"); if transitive { @@ -178,8 +198,8 @@ fn perform_query(impg: &Impg, target_name: &str, target_range: (i32, i32), trans } } -fn output_results_bed(impg: &Impg, results: Vec) { - for (overlap, _) in results { +fn output_results_bed(impg: &Impg, results: Vec) { + for (overlap, _, _) in results { let overlap_name = impg.seq_index.get_name(overlap.metadata).unwrap(); let (first, last, strand) = if overlap.first <= overlap.last { (overlap.first, overlap.last, '+') @@ -190,32 +210,32 @@ fn output_results_bed(impg: &Impg, results: Vec) { } } -fn output_results_bedpe(impg: &Impg, results: Vec, target_name: &str, target_range: (i32, i32), name: Option) { - for (overlap, _) in results { - let overlap_name = impg.seq_index.get_name(overlap.metadata).unwrap(); - let (first, last, strand) = if overlap.first <= overlap.last { - (overlap.first, overlap.last, '+') +fn output_results_bedpe(impg: &Impg, results: Vec, target_name: &str, name: Option) { + for (overlap_query, _, overlap_target) in results { + let overlap_name = impg.seq_index.get_name(overlap_query.metadata).unwrap(); + let (first, last, strand) = if overlap_query.first <= overlap_query.last { + (overlap_query.first, overlap_query.last, '+') } else { - (overlap.last, overlap.first, '-') + (overlap_query.last, overlap_query.first, '-') }; println!("{}\t{}\t{}\t{}\t{}\t{}\t{}\t0\t{}\t+", overlap_name, first, last, - target_name, target_range.0, target_range.1, + target_name, overlap_target.first, overlap_target.last, name.as_deref().unwrap_or("."), strand); } } -fn output_results_paf(impg: &Impg, results: Vec, target_name: &str, target_range: (i32, i32), name: Option) { +fn output_results_paf(impg: &Impg, results: Vec, target_name: &str, name: Option) { let target_length = impg.seq_index.get_len_from_id(impg.seq_index.get_id(target_name).unwrap()).unwrap(); - for (overlap, cigar) in results { - let overlap_name = impg.seq_index.get_name(overlap.metadata).unwrap(); - let (first, last, strand) = if overlap.first <= overlap.last { - (overlap.first, overlap.last, '+') + for (overlap_query, cigar, overlap_target) in results { + let overlap_name = impg.seq_index.get_name(overlap_query.metadata).unwrap(); + let (first, last, strand) = if overlap_query.first <= overlap_query.last { + (overlap_query.first, overlap_query.last, '+') } else { - (overlap.last, overlap.first, '-') + (overlap_query.last, overlap_query.first, '-') }; - let query_length = impg.seq_index.get_len_from_id(overlap.metadata).unwrap(); + let query_length = impg.seq_index.get_len_from_id(overlap_query.metadata).unwrap(); let has_m_operation = cigar.iter().any(|op| op.op() == 'M'); let (matches, block_len) = if has_m_operation { @@ -238,23 +258,21 @@ fn output_results_paf(impg: &Impg, results: Vec, target_name: &st } }) }; - let cigar_str : String = cigar.iter().map(|op| format!("{}{}", op.len(), op.op())).collect(); match name { Some(ref name) => println!("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tcg:Z:{}\tan:Z:{}", overlap_name, query_length, first, last, strand, - target_name, target_length, target_range.0, target_range.1, + target_name, target_length, overlap_target.first, overlap_target.last, matches, block_len, 255, cigar_str, name), None => println!("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tcg:Z:{}", - overlap_name, query_length, overlap.first, overlap.last, strand, - target_name, target_length, target_range.0, target_range.1, + overlap_name, query_length, first, last, strand, + target_name, target_length, overlap_target.first, overlap_target.last, matches, block_len, 255, cigar_str), } } } - fn print_stats(impg: &Impg) { println!("Number of sequences: {}", impg.seq_index.len()); println!("Number of overlaps: {}", impg.trees.values().map(|tree| tree.len()).sum::()); diff --git a/src/seqidx.rs b/src/seqidx.rs index 22681b5..b947c5d 100644 --- a/src/seqidx.rs +++ b/src/seqidx.rs @@ -5,7 +5,7 @@ use serde::{Serialize, Deserialize}; pub struct SequenceIndex { name_to_id: HashMap, id_to_name: HashMap, - id_to_len: HashMap, + id_to_len: HashMap, next_id: u32, } @@ -19,7 +19,7 @@ impl SequenceIndex { } } - pub fn get_or_insert_id(&mut self, name: &str, length: Option) -> u32 { + pub fn get_or_insert_id(&mut self, name: &str, length: Option) -> u32 { let id = *self.name_to_id.entry(name.to_owned()).or_insert_with(|| { let id = self.next_id; self.id_to_name.insert(id, name.to_owned()); @@ -42,7 +42,7 @@ impl SequenceIndex { self.id_to_name.get(&id).map(|s| s.as_str()) } - pub fn get_len_from_id(&self, id: u32) -> Option { + pub fn get_len_from_id(&self, id: u32) -> Option { self.id_to_len.get(&id).copied() }