diff --git a/Cargo.lock b/Cargo.lock index 08e016c..9a0e82a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -672,7 +672,7 @@ checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "rsbio-seq" -version = "0.1.1" +version = "0.1.2" dependencies = [ "bio", "flate2", diff --git a/Cargo.toml b/Cargo.toml index b417e21..c2cb048 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,12 +1,12 @@ [package] name = "rsbio-seq" -version = "0.1.1" +version = "0.1.2" edition = "2021" authors = [ "Anuradha Wickramarachchi ", "Vijini Mallawaarachchi ", ] -description = "RSBio-Seq is a python wrapper for rust bio crate to provide fast sequence reading." +description = "RSBio-Seq is a fast and light-weight sequence reading library (built on top of rust bio crate)." readme = "README.md" license-file = "LICENSE" diff --git a/README.md b/README.md index f437ab1..dcd3216 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,19 @@ [![PyPI - Version](https://img.shields.io/pypi/v/rsbio-seq)](https://pypi.org/project/rsbio-seq/) [![Upload to PyPI](https://github.com/anuradhawick/rsbio-seq/actions/workflows/pypi.yml/badge.svg)](https://github.com/anuradhawick/rsbio-seq/actions/workflows/pypi.yml) -RSBio intends to provide just reading facility on common sequence formats (FASTA/FASTQ) in both raw and compressed formats. +RSBio-Seq intends to provide just reading facility on common sequence formats (FASTA/FASTQ) in both raw and compressed formats. -## Build and install from source +## Installation + +### 1. From PyPI (Recommended) + +Simply use the following command + +```bash +pip install rsbio-seq +``` + +### 2. Build and install from source To build you need to have the following installed. 
@@ -20,26 +30,21 @@ maturin develop # this installs the development version in the env maturin develop --rust # this installs a release version in the env ``` -To build a wheel +To build a wheel for installation ```bash maturin build --release ``` -You will find the `whl` file inside the `target/wheels` directory. - -## Install from PyPI +You will find the `whl` file inside the `target/wheels` directory. Your `whl` file will have a name reflecting your Python environment and CPU architecture. -Simple use the following command - -```bash -pip install rsbio-seq -``` ## Usage Once installed you can import the library and use as follows. +### Reading + ```python from rsbio_seq import SeqReader, SeqWriter, Sequence @@ -51,7 +56,11 @@ for seq in SeqReader("path/to/seq.fasta.gz"): print(seq.qual) # optional description attribute print(seq.desc) +``` +### Writing + +```python # writing fasta seq = Sequence("id", "desc", "ACGT") # id, description, sequence writer = SeqWriter("out.fasta") diff --git a/pyproject.toml b/pyproject.toml index a35792c..fc8ef15 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,12 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dynamic = ["version", "readme", "description", "license", "authors"] +keywords = ["bioinformatics", "genomics"] + +[project.urls] +Documentation = "https://github.com/anuradhawick/rsbio-seq/" +"Bug Tracker" = "https://github.com/anuradhawick/rsbio-seq/issues" +"Source Code" = "https://github.com/anuradhawick/rsbio-seq/" [tool.maturin] features = ["pyo3/extension-module"] diff --git a/src/writer.rs b/src/writer.rs index 6bdce36..2a35621 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -2,6 +2,7 @@ use crate::seq::{SeqFormat, Sequence}; use flate2::{write::GzEncoder, Compression}; use std::{fs::File, io::Write}; +#[inline] fn wrap_string_no_whitespace(s: &str, width: usize) -> String { let mut result = String::with_capacity(s.len() + s.len() / width); let mut i = 0; @@ -47,11 +48,13 
@@ impl Writer { } } + #[inline] pub fn write(&mut self, seq: Sequence, wrap: Option) -> Result<(), String> { let writer = match &mut self.writer { WriterType::Gzip(gz) => gz as &mut dyn Write, WriterType::Plain(file) => file as &mut dyn Write, }; + match self.format { SeqFormat::Fasta => { let seq_str = if let Some(wrap) = wrap { @@ -62,38 +65,33 @@ impl Writer { } else { seq.seq }; - writer.write_all(b">").map_err(|e| e.to_string())?; - writer - .write_all(seq.id.as_bytes()) - .map_err(|e| e.to_string())?; - writer.write_all(b" ").map_err(|e| e.to_string())?; - writer - .write_all(seq.desc.as_bytes()) - .map_err(|e| e.to_string())?; - writer.write_all(b"\n").map_err(|e| e.to_string())?; - writer - .write_all(seq_str.as_bytes()) - .map_err(|e| e.to_string())?; - writer.write_all(b"\n").map_err(|e| e.to_string())?; + let mut buffer = + Vec::with_capacity(4 + seq.id.len() + seq.desc.len() + seq_str.len()); + + buffer.extend_from_slice(b">"); + buffer.extend_from_slice(seq.id.as_bytes()); + buffer.extend_from_slice(b" "); + buffer.extend_from_slice(seq.desc.as_bytes()); + buffer.extend_from_slice(b"\n"); + buffer.extend_from_slice(seq_str.as_bytes()); + buffer.extend_from_slice(b"\n"); + writer.write_all(&buffer).map_err(|e| e.to_string())?; } SeqFormat::Fastq => { - writer.write_all(b"@").map_err(|e| e.to_string())?; - writer - .write_all(seq.id.as_bytes()) - .map_err(|e| e.to_string())?; - writer.write_all(b" ").map_err(|e| e.to_string())?; - writer - .write_all(seq.desc.as_bytes()) - .map_err(|e| e.to_string())?; - writer.write_all(b"\n").map_err(|e| e.to_string())?; - writer - .write_all(seq.seq.as_bytes()) - .map_err(|e| e.to_string())?; - writer.write_all(b"\n+\n").map_err(|e| e.to_string())?; - writer - .write_all(seq.qual.as_bytes()) - .map_err(|e| e.to_string())?; - writer.write_all(b"\n").map_err(|e| e.to_string())?; + let mut buffer = Vec::with_capacity( + 7 + seq.id.len() + seq.desc.len() + seq.seq.len() + seq.qual.len(), + ); + + 
buffer.extend_from_slice(b"@"); + buffer.extend_from_slice(seq.id.as_bytes()); + buffer.extend_from_slice(b" "); + buffer.extend_from_slice(seq.desc.as_bytes()); + buffer.extend_from_slice(b"\n"); + buffer.extend_from_slice(seq.seq.as_bytes()); + buffer.extend_from_slice(b"\n+\n"); + buffer.extend_from_slice(seq.qual.as_bytes()); + buffer.extend_from_slice(b"\n"); + writer.write_all(&buffer).map_err(|e| e.to_string())?; } } Ok(())