diff --git a/.vscode/settings.json b/.vscode/settings.json index e1748f1..b3d1136 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -116,6 +116,7 @@ "Psicose", "Psimod", "psms", + "psmtsv", "pyclass", "pymethods", "pymodule", diff --git a/Cargo.toml b/Cargo.toml index ea9ecd7..17d00d2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,8 @@ members = [ "examples/*", ] resolver = "2" +package.edition = "2021" +package.version = "0.9.0-alpha.1" [profile.release] debug = true diff --git a/examples/de-novo-align/Cargo.toml b/examples/de-novo-align/Cargo.toml new file mode 100644 index 0000000..619f806 --- /dev/null +++ b/examples/de-novo-align/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "de-novo-align" +version = "0.1.0" +publish = false +edition.workspace = true + +[dependencies] +rustyms = { path = "../../rustyms" } +clap = { workspace = true, features = ["derive", "cargo"] } +itertools = { workspace = true } +rayon = { workspace = true } +serde_json = { workspace = true } diff --git a/examples/de-novo-align/src/main.rs b/examples/de-novo-align/src/main.rs new file mode 100644 index 0000000..e74c98e --- /dev/null +++ b/examples/de-novo-align/src/main.rs @@ -0,0 +1,36 @@ +use std::{ + fs::File, + io::{BufReader, BufWriter}, +}; + +use clap::Parser; +use fragment::FragmentType; +use identification::{open_identified_peptides_file, FastaData}; +use itertools::Itertools; +use rayon::prelude::*; +use rustyms::{ + spectrum::{Score, Scores}, + system::{e, usize::Charge}, + *, +}; +use spectrum::PeakSpectrum; +use std::collections::HashMap; + +#[derive(Parser)] +struct Cli { + /// The input identified peptides file + #[arg(short, long)] + peptides: String, + /// The fasta database of known proteins + #[arg(short, long)] + database: String, + /// Where to store the results + #[arg(long)] + out_path: String, +} + +fn main() { + let args = Cli::parse(); + let peptides = open_identified_peptides_file(args.peptides, None).unwrap(); + let database = FastaData::parse_file(args.database).unwrap(); +} diff --git a/examples/multi-annotator/Cargo.toml b/examples/multi-annotator/Cargo.toml index 8619b13..f78a912 100644 --- a/examples/multi-annotator/Cargo.toml +++ b/examples/multi-annotator/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "multi-annotator" version = "0.1.0" -edition = "2021" +edition.workspace = true publish = false [dependencies] diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 8fb895c..c1d52b2 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -2,7 +2,7 @@ name = "rustyms-fuzz" version = "0.0.0" publish = false -edition = "2021" +edition.workspace = true [package.metadata] cargo-fuzz = true diff --git a/rustyms-imgt-generate/Cargo.toml b/rustyms-imgt-generate/Cargo.toml index 76842b6..a7658de 100644 --- a/rustyms-imgt-generate/Cargo.toml +++ b/rustyms-imgt-generate/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "rustyms-imgt-generate" version = "0.1.0" -edition = "2021" +edition.workspace = true license = "MIT OR Apache-2.0" authors = ["Douwe Schulte "] rust-version = "1.70.0" diff --git a/rustyms-py/Cargo.toml b/rustyms-py/Cargo.toml index 292668d..7fbeceb 100644 --- a/rustyms-py/Cargo.toml +++ b/rustyms-py/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "rustyms-py" -version = "0.9.0-alpha.1" -edition = "2021" +version.workspace = true +edition.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] diff --git a/rustyms/Cargo.toml b/rustyms/Cargo.toml index b6bf0c5..e9f1c9b 100644 --- a/rustyms/Cargo.toml +++ b/rustyms/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "rustyms" -version = "0.9.0-alpha.1" -edition = "2021" +version.workspace = true +edition.workspace = true license = "MIT OR Apache-2.0" authors = ["Douwe Schulte "] description = "A library to handle proteomic mass spectrometry data and match peptides to spectra." diff --git a/rustyms/src/identification/fasta.rs b/rustyms/src/identification/fasta.rs index 0b8c4fe..ff6bbbd 100644 --- a/rustyms/src/identification/fasta.rs +++ b/rustyms/src/identification/fasta.rs @@ -4,7 +4,10 @@ use crate::{ CompoundPeptidoform, LinearPeptide, SequenceElement, }; use serde::{Deserialize, Serialize}; -use std::io::{BufRead, BufReader}; +use std::{ + io::{BufRead, BufReader}, + path::Path, +}; use super::{IdentifiedPeptide, MetaData}; @@ -21,12 +24,13 @@ impl FastaData { /// Parse a single fasta file /// # Errors /// A custom error when it is not a valid fasta file - pub fn parse_file(path: &str) -> Result, CustomError> { + pub fn parse_file(path: impl AsRef) -> Result, CustomError> { + let path = path.as_ref(); let file = std::fs::File::open(path).map_err(|_| { CustomError::error( "Failed reading fasta file", "Error occurred while opening the file", - Context::show(path), + Context::show(path.to_string_lossy()), ) })?; let reader = BufReader::new(file); @@ -39,7 +43,7 @@ impl FastaData { CustomError::error( "Failed reading fasta file", format!("Error occurred while reading line {}", line_index + 1), - Context::show(path), + Context::show(path.to_string_lossy()), ) })?; #[allow(clippy::manual_strip)] diff --git a/rustyms/src/identification/general.rs b/rustyms/src/identification/general.rs new file mode 100644 index 0000000..df2f814 --- /dev/null +++ b/rustyms/src/identification/general.rs @@ -0,0 +1,69 @@ +use std::path::Path; + +use super::{ + error::{Context, CustomError}, + ontologies::CustomDatabase, + FastaData, IdentifiedPeptide, IdentifiedPeptideIter, IdentifiedPeptideSource, MSFraggerData, + MaxQuantData, NovorData, OpairData, PeaksData, SageData, +}; + +/// Open the selected path and automatically determine the file type. +/// # Errors +/// It errors if the file type could not be determined or if opening the file errors. +pub fn open_identified_peptides_file<'a>( + path: impl AsRef, + custom_database: Option<&'a CustomDatabase>, +) -> Result> + 'a>, CustomError> { + let path = path.as_ref(); + let actual_extension = path + .extension() + .map(|ex| { + (ex == "gz") + .then_some(path) + .and_then(|p| p.file_stem()) + .and_then(|p| Path::new(p).extension()) + .unwrap_or(ex) + }) + .map(|ex| ex.to_string_lossy().to_lowercase()); + match actual_extension.as_deref() { + Some("csv") => PeaksData::parse_file(path, custom_database) + .map(IdentifiedPeptideIter::into_box) + .or_else(|_| { + NovorData::parse_file(path, custom_database).map(IdentifiedPeptideIter::into_box) + }) + .map_err(|_| { + CustomError::error( + "Unknown file", + "Could not be recognised as either a Peaks or Novor file", + Context::show(path.to_string_lossy()), + ) + }), + Some("tsv") => MSFraggerData::parse_file(path, custom_database) + .map(IdentifiedPeptideIter::into_box) + .or_else(|_| { + SageData::parse_file(path, custom_database).map(IdentifiedPeptideIter::into_box) + }) + .map_err(|_| { + CustomError::error( + "Unknown file", + "Could not be recognised as either a MSFragger or Sage file", + Context::show(path.to_string_lossy()), + ) + }), + Some("psmtsv") => { + OpairData::parse_file(path, custom_database).map(IdentifiedPeptideIter::into_box) + } + Some("fasta") => FastaData::parse_file(path).map(|peptides| { + Box::new(peptides.into_iter().map(|p| Ok(p.into()))) + as Box> + 'a> + }), + Some("txt") => { + MaxQuantData::parse_file(path, custom_database).map(IdentifiedPeptideIter::into_box) + } + _ => Err(CustomError::error( + "Unknown extension", + "Use CSV, TSV, TXT, PSMTSV, or Fasta, or any of these as a gzipped file (eg csv.gz).", + Context::show(path.to_string_lossy()), + )), + } +} diff --git a/rustyms/src/identification/identified_peptide.rs b/rustyms/src/identification/identified_peptide.rs index c5aeec9..0e7c71d 100644 --- a/rustyms/src/identification/identified_peptide.rs +++ b/rustyms/src/identification/identified_peptide.rs @@ -237,3 +237,19 @@ where } } } + +impl<'lifetime, R, I> IdentifiedPeptideIter<'lifetime, R, I> +where + R: IdentifiedPeptideSource + Into + 'lifetime, + I: Iterator> + 'lifetime, + R::Format: 'static, +{ + pub(super) fn into_box( + self, + ) -> Box> + 'lifetime> { + Box::new(self.map(|p: Result| match p { + Ok(p) => Ok(p.into()), + Err(e) => Err(e), + })) + } +} diff --git a/rustyms/src/identification/mod.rs b/rustyms/src/identification/mod.rs index dcd3d9e..96d8bc2 100644 --- a/rustyms/src/identification/mod.rs +++ b/rustyms/src/identification/mod.rs @@ -4,6 +4,7 @@ mod common_parser; mod fasta; +mod general; mod helper_functions; mod identified_peptide; mod maxquant; @@ -15,6 +16,7 @@ mod sage; use crate::*; pub use fasta::*; +pub use general::*; pub use identified_peptide::*; pub use maxquant::*; pub use msfragger::*;