Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Develop #139

Merged
merged 15 commits into from
Mar 8, 2024
1,232 changes: 627 additions & 605 deletions Cargo.lock

Large diffs are not rendered by default.

81 changes: 49 additions & 32 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "alevin-fry"
version = "0.8.2"
version = "0.9.0"
authors = [
"Avi Srivastava <avi.srivastava@nyu.edu>",
"Hirak Sarkar <hirak_sarkar@hms.harvard.edu>",
Expand Down Expand Up @@ -32,56 +32,73 @@ keywords = [
]
categories = ["command-line-utilities", "science"]

[workspace]

[dependencies]
# for local development, look in the libradicl git repository
# but when published, pull the specified version
libradicl = { git = "https://github.com/COMBINE-lab/libradicl", branch = "develop", version = "0.6.0" }
anyhow = "1.0.71"
libradicl = { git = "https://github.com/COMBINE-lab/libradicl", branch = "develop", version = "0.8.2" }
anyhow = "1.0.80"
arrayvec = "0.7.4"
ahash = "0.8.3"
ahash = "0.8.11"
bincode = "1.3.3"
bstr = "1.5.0"
crossbeam-channel = "0.5.8"
crossbeam-queue = "0.3.8"
# derive_builder = "0.11.2"
typed-builder = "0.14.0"
indicatif = "0.17.5"
bstr = "1.9.1"
crossbeam-channel = "0.5.12"
crossbeam-queue = "0.3.11"
typed-builder = "0.18.1"
indicatif = "0.17.8"
needletail = "0.5.1"
petgraph = "0.6.3"
flate2 = "1.0.26"
scroll = "0.11.0"
serde = { version = "1.0.164", features = ["derive"] }
serde_json = "1.0.99"
petgraph = "0.6.4"
flate2 = "1.0.28"
scroll = "0.12.0"
serde = { version = "1.0.197", features = ["derive"] }
serde_json = "1.0.114"
sprs = "0.11.1"
slog = "2.7.0"
slog-term = "2.9.0"
slog-async = "2.7.0"
smallvec = "1.10.0"
snap = "1.1.0"
slog-term = "2.9.1"
slog-async = "2.8.0"
smallvec = "1.13.1"
snap = "1.1.1"
rand = "0.8.5"
chrono = "0.4.26"
csv = "1.2.2"
mimalloc = { version = "0.1.37", default-features = false }
chrono = "0.4.35"
csv = "1.3.0"
mimalloc = { version = "0.1.39", default-features = false }
num-format = "0.4.4"
num_cpus = "1.16.0"
bio-types = { version = "1.0.0", default-features = true, features = ["serde"] }
itertools = "0.11.0"
thiserror = "1.0.40"
bio-types = { version = "1.0.1", default-features = true, features = ["serde"] }
itertools = "0.12.1"
thiserror = "1.0.57"
statrs = "0.16.0"
rust-htslib = { version = "0.44.1", default-features = false, features = [
"bzip2",
"lzma",
] }
sce = { git = "https://github.com/parazodiac/SingleCellExperiment", branch = "dev", version = "0.2.0" }

# no shenanigans; clap makes breaking "fixes" too often to allow variability
# in the version different from what we tested with
clap = { version = "=4.3.9", features = ["derive", "wrap_help", "cargo", "help", "usage", "string", "error-context"] }
clap = { version = "=4.5.2", features = ["derive", "wrap_help", "cargo", "help", "usage", "string", "error-context"] }

noodles = { version = "0.65.0", features = ["bam", "bgzf", "sam"] }
noodles-util = { version = "0.37.0", features = ["alignment"] }

[profile.release]
#debug = true
lto = "thin"
#codegen-units=1
opt-level = 3

# The profile that 'cargo dist' will build with
[profile.dist]
inherits = "release"
lto = "thin"

# Config for 'cargo dist'
[workspace.metadata.dist]
# The preferred cargo-dist version to use in CI (Cargo.toml SemVer syntax)
cargo-dist-version = "0.11.1"
# CI backends to support
ci = ["github"]
# The installers to generate for each app
installers = ["shell"]
# Target platforms to build apps for (Rust target-triple syntax)
targets = ["aarch64-apple-darwin", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu"]
# Publish jobs to run in CI
pr-run-mode = "plan"

[workspace.metadata.dist.github-custom-runners]
aarch64-apple-darwin = "macos-14"
2 changes: 1 addition & 1 deletion docs/source/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Overview

`alevin-fry`` is a suite of tools for the rapid, accurate and memory-frugal processing single-cell and single-nucleus sequencing data. It consumes RAD files generated by `salmon alevin`, and performs common operations like generating permit lists, and estimating the number of distinct molecules from each gene within each cell. The focus in `alevin-fry`` is on safety, accuracy and efficiency (in terms of both time and memory usage).

You can read the paper describing alevin fry, "Alevin-fry unlocks rapid, accurate, and memory-frugal quantification of single-cell RNA-seq data" [here](https://www.nature.com/articles/s41592-022-01408-3), and the pre-print [on bioRxiv](https://www.biorxiv.org/content/10.1101/2021.06.29.450377v1).
You can read the paper describing alevin fry, "Alevin-fry unlocks rapid, accurate, and memory-frugal quantification of single-cell RNA-seq data" `here <https://www.nature.com/articles/s41592-022-01408-3>`_, and the pre-print `on bioRxiv <https://www.biorxiv.org/content/10.1101/2021.06.29.450377v1>`_.

Other resources for alevin-fry
==============================
Expand Down
6 changes: 6 additions & 0 deletions scripts/bsd-3.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Copyright (c) ${years} ${owner}.

This file is part of ${projectname}
(see ${projecturl}).

License: 3-clause BSD, see https://opensource.org/licenses/BSD-3-Clause
10 changes: 10 additions & 0 deletions scripts/reheader.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

if ! command -v licenseheaders &> /dev/null
then
echo "licenseheaders could not be found; please install this program to use the script (pip install licenseheaders)."
exit 1
fi


licenseheaders -d ../src -y 2020-2024 -t bsd-3.tmpl -o COMBINE-lab -n alevin-fry -u https://www.github.com/COMBINE-lab/alevin-fry -D -E rs
79 changes: 48 additions & 31 deletions src/cellfilter.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
/*
* Copyright (c) 2020-2022 Rob Patro, Avi Srivastava, Hirak Sarkar, Dongze He, Mohsen Zakeri.
* Copyright (c) 2020-2024 COMBINE-lab.
*
* This file is part of alevin-fry
* (see https://github.com/COMBINE-lab/alevin-fry).
* (see https://www.github.com/COMBINE-lab/alevin-fry).
*
* License: 3-clause BSD, see https://opensource.org/licenses/BSD-3-Clause
*/
Expand All @@ -19,8 +19,13 @@ use bio_types::strand::Strand;
use bstr::io::BufReadExt;
use itertools::Itertools;
use libradicl::exit_codes;
use libradicl::rad_types;
use libradicl::rad_types::{self, RadType};
use libradicl::BarcodeLookupMap;
use libradicl::{
chunk,
header::RadPrelude,
record::{AlevinFryReadRecord, AlevinFryRecordContext},
};
use needletail::bitkmer::*;
use num_format::{Locale, ToFormattedString};
use serde::Serialize;
Expand Down Expand Up @@ -222,7 +227,7 @@ fn populate_unfiltered_barcode_map<T: Read>(
fn process_unfiltered(
mut hm: HashMap<u64, u64, ahash::RandomState>,
mut unmatched_bc: Vec<u64>,
ft_vals: &rad_types::FileTags,
file_tag_map: &rad_types::TagMap,
filter_meth: &CellFilterMethod,
expected_ori: Strand,
output_dir: &PathBuf,
Expand Down Expand Up @@ -279,9 +284,14 @@ fn process_unfiltered(
num_passing.to_formatted_string(&Locale::en)
);

let barcode_tag = file_tag_map
.get("cblen")
.expect("tag map must contain cblen");
let barcode_len: u16 = barcode_tag.try_into()?;

// now, we create a second barcode map with just the barcodes
// for cells we will keep / rescue.
let bcmap2 = BarcodeLookupMap::new(kept_bc, ft_vals.bclen as u32);
let bcmap2 = BarcodeLookupMap::new(kept_bc, barcode_len as u32);
info!(
log,
"found {} cells with non-trivial number of reads by exact barcode match",
Expand Down Expand Up @@ -382,7 +392,7 @@ fn process_unfiltered(
})?;
let o_path = parent.join("permit_freq.bin");

match afutils::write_permit_list_freq(&o_path, ft_vals.bclen, &hm) {
match afutils::write_permit_list_freq(&o_path, barcode_len, &hm) {
Ok(_) => {}
Err(error) => {
panic!("Error: {}", error);
Expand Down Expand Up @@ -444,7 +454,7 @@ fn process_unfiltered(
#[allow(clippy::unnecessary_unwrap, clippy::too_many_arguments)]
fn process_filtered(
hm: &HashMap<u64, u64, ahash::RandomState>,
ft_vals: &rad_types::FileTags,
file_tag_map: &rad_types::TagMap,
filter_meth: &CellFilterMethod,
expected_ori: Strand,
output_dir: &PathBuf,
Expand All @@ -460,6 +470,11 @@ fn process_filtered(
freq.sort_unstable();
freq.reverse();

let barcode_tag = file_tag_map
.get("cblen")
.expect("tag map must contain cblen");
let barcode_len: u16 = barcode_tag.try_into()?;

// select from among supported filter methods
match filter_meth {
CellFilterMethod::KneeFinding => {
Expand Down Expand Up @@ -489,7 +504,7 @@ fn process_filtered(
valid_bc = permit_list_from_threshold(hm, min_freq);
}
CellFilterMethod::ExplicitList(valid_bc_file) => {
valid_bc = permit_list_from_file(valid_bc_file, ft_vals.bclen);
valid_bc = permit_list_from_file(valid_bc_file, barcode_len);
}
CellFilterMethod::ExpectCells(expected_num_cells) => {
let robust_quantile = 0.99f64;
Expand All @@ -509,7 +524,7 @@ fn process_filtered(
// generate the map from each permitted barcode to all barcodes within
// edit distance 1 of it.
let full_permit_list =
afutils::generate_permitlist_map(&valid_bc, ft_vals.bclen as usize).unwrap();
afutils::generate_permitlist_map(&valid_bc, barcode_len as usize).unwrap();

let s2 = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64);
let mut permitted_map = HashMap::with_capacity_and_hasher(valid_bc.len(), s2);
Expand All @@ -532,7 +547,7 @@ fn process_filtered(
})?;
let o_path = parent.join("permit_freq.bin");

match afutils::write_permit_list_freq(&o_path, ft_vals.bclen, &permitted_map) {
match afutils::write_permit_list_freq(&o_path, barcode_len, &permitted_map) {
Ok(_) => {}
Err(error) => {
panic!("Error: {}", error);
Expand All @@ -541,7 +556,7 @@ fn process_filtered(

let o_path = parent.join("all_freq.bin");

match afutils::write_permit_list_freq(&o_path, ft_vals.bclen, hm) {
match afutils::write_permit_list_freq(&o_path, barcode_len, hm) {
Ok(_) => {}
Err(error) => {
panic!("Error: {}", error);
Expand Down Expand Up @@ -630,32 +645,35 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result<u64>

let i_file = File::open(i_dir.join("map.rad")).context("could not open input rad file")?;
let mut br = BufReader::new(i_file);
let hdr = rad_types::RadHeader::from_bytes(&mut br);

let prelude = RadPrelude::from_bytes(&mut br)?;
let hdr = &prelude.hdr;
info!(
log,
"paired : {:?}, ref_count : {}, num_chunks : {}",
hdr.is_paired != 0,
hdr.ref_count.to_formatted_string(&Locale::en),
hdr.num_chunks.to_formatted_string(&Locale::en)
);

// file-level
let fl_tags = rad_types::TagSection::from_bytes(&mut br);
let fl_tags = &prelude.file_tags;
info!(log, "read {:?} file-level tags", fl_tags.tags.len());
// read-level
let rl_tags = rad_types::TagSection::from_bytes(&mut br);
let rl_tags = &prelude.read_tags;
info!(log, "read {:?} read-level tags", rl_tags.tags.len());

// right now, we only handle BC and UMI types of U8—U64, so validate that
const BNAME: &str = "b";
const UNAME: &str = "u";

let mut bct: Option<u8> = None;
let mut umit: Option<u8> = None;
let mut bct: Option<RadType> = None;
let mut umit: Option<RadType> = None;

for rt in &rl_tags.tags {
// if this is one of our tags
if rt.name == BNAME || rt.name == UNAME {
if rad_types::decode_int_type_tag(rt.typeid).is_none() {
if !rt.typeid.is_int_type() {
crit!(
log,
"currently only RAD types 1--4 are supported for 'b' and 'u' tags."
Expand All @@ -671,21 +689,19 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result<u64>
}
}
}
assert!(bct.is_some(), "barcode type tag must be present.");
assert!(umit.is_some(), "umi type tag must be present.");

// alignment-level
let al_tags = rad_types::TagSection::from_bytes(&mut br);
let al_tags = &prelude.aln_tags;
info!(log, "read {:?} alignemnt-level tags", al_tags.tags.len());

let ft_vals = rad_types::FileTags::from_bytes(&mut br);
info!(log, "File-level tag values {:?}", ft_vals);
let file_tag_map = prelude.file_tags.parse_tags_from_bytes(&mut br)?;
info!(log, "File-level tag values {:?}", file_tag_map);

let record_context = prelude.get_record_context::<AlevinFryRecordContext>()?;
let mut num_reads: usize = 0;

let bc_type = rad_types::decode_int_type_tag(bct.expect("no barcode tag description present."))
.context("unknown barcode type id.")?;
let umi_type = rad_types::decode_int_type_tag(umit.expect("no umi tag description present"))
.context("unknown barcode type id.")?;

// if dealing with filtered type
let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64);
let mut hm = HashMap::with_hasher(s);
Expand All @@ -702,7 +718,8 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result<u64>
// the unfiltered_bc_count map must be valid in this branch
if let Some(mut hmu) = unfiltered_bc_counts {
for _ in 0..(hdr.num_chunks as usize) {
let c = rad_types::Chunk::from_bytes(&mut br, &bc_type, &umi_type);
let c =
chunk::Chunk::<AlevinFryReadRecord>::from_bytes(&mut br, &record_context);
num_orientation_compat_reads += update_barcode_hist_unfiltered(
&mut hmu,
&mut unmatched_bc,
Expand All @@ -723,7 +740,7 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result<u64>
process_unfiltered(
hmu,
unmatched_bc,
&ft_vals,
&file_tag_map,
&filter_meth,
expected_ori,
output_dir,
Expand All @@ -740,7 +757,7 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result<u64>
}
_ => {
for _ in 0..(hdr.num_chunks as usize) {
let c = rad_types::Chunk::from_bytes(&mut br, &bc_type, &umi_type);
let c = chunk::Chunk::<AlevinFryReadRecord>::from_bytes(&mut br, &record_context);
update_barcode_hist(&mut hm, &mut max_ambiguity_read, &c, &expected_ori);
num_reads += c.reads.len();
}
Expand All @@ -753,7 +770,7 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result<u64>
);
process_filtered(
&hm,
&ft_vals,
&file_tag_map,
&filter_meth,
expected_ori,
output_dir,
Expand Down Expand Up @@ -900,7 +917,7 @@ pub fn update_barcode_hist_unfiltered(
hist: &mut HashMap<u64, u64, ahash::RandomState>,
unmatched_bc: &mut Vec<u64>,
max_ambiguity_read: &mut usize,
chunk: &rad_types::Chunk,
chunk: &chunk::Chunk<AlevinFryReadRecord>,
expected_ori: &Strand,
) -> usize {
let mut num_strand_compat_reads = 0usize;
Expand Down Expand Up @@ -964,7 +981,7 @@ pub fn update_barcode_hist_unfiltered(
pub fn update_barcode_hist(
hist: &mut HashMap<u64, u64, ahash::RandomState>,
max_ambiguity_read: &mut usize,
chunk: &rad_types::Chunk,
chunk: &chunk::Chunk<AlevinFryReadRecord>,
expected_ori: &Strand,
) {
match expected_ori {
Expand Down
8 changes: 8 additions & 0 deletions src/cmd_parse_utils.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
/*
* Copyright (c) 2020-2024 COMBINE-lab.
*
* This file is part of alevin-fry
* (see https://www.github.com/COMBINE-lab/alevin-fry).
*
* License: 3-clause BSD, see https://opensource.org/licenses/BSD-3-Clause
*/
use crate::quant::{ResolutionStrategy, SplicedAmbiguityModel};
use clap;
use std::path::{Path, PathBuf};
Expand Down
Loading
Loading