Skip to content

Commit

Permalink
merged with main
Browse files Browse the repository at this point in the history
  • Loading branch information
Matiss Ozols committed Feb 2, 2024
2 parents f62de8d + a371283 commit d8b3a6f
Show file tree
Hide file tree
Showing 63 changed files with 1,676 additions and 3,321 deletions.
12 changes: 11 additions & 1 deletion bin/bff.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,21 @@ library(DropletUtils)
library(Seurat)
library(ggplot2)
library(cowplot)
if(!require("cellhashR")){
devtools::install_github(repo = 'bimberlab/cellhashR', ref = 'master', dependencies = TRUE, upgrade = 'always')
library("cellhashR")
}

library(cellhashR)
library(here)
library(dplyr)
library(argparse)
library(tidyverse)

if(!require("tidyverse")){
install.packages("tidyverse")
library("tidyverse")
}


# Create a parser
parser <- ArgumentParser("Parameters for BFF")
Expand Down
151 changes: 151 additions & 0 deletions bin/filter_bam_file_for_popscle_dsc_pileup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#!/bin/bash
#
# Copyright (C): 2020-2021 - Gert Hulselmans
#
# Purpose: Filter BAM file for usage with popscle dsc-pileup by keeping reads:
# - which overlap with SNPs in the VCF file
# - and which have a cell barcode (default: "CB" tag) contained in the cell barcode list
# Keeping only relevant reads for popscle dsc-pileup can speedup it up quite significantly
# (depending on the reduction of the number of reads in the filtered BAM file vs original).



# Function to check if any of the programs in a pipe failed.
check_exit_codes () {
local GET_PIPESTATUS="${PIPESTATUS[@]}";
local exit_code;

for exit_code in ${GET_PIPESTATUS} ; do
if [ ${exit_code} -ne 0 ] ; then
return ${exit_code};
fi
done

return 0;
}



# Check if necessary programs are installed.
check_if_programs_exists () {
local exit_code=0;

# Check if bedtools is installed.
if ! type bedtools > /dev/null 2>&1 ; then
printf 'Error: "bedtools" could not be found in PATH.\n' > /dev/stderr;
exit_code=2;
fi

# Check if samtools is installed.
if ! type samtools > /dev/null 2>&1 ; then
printf 'Error: "samtools" could not be found in PATH.\n' > /dev/stderr;
exit_code=2;
fi

if [ ${exit_code} -eq 2 ] ; then
return ${exit_code};
fi

# Check if samtools 1.10 or higher is installed (needs to have "-D STR:FILE" or "-D, --tag-file STR:FILE" option).
if ! samtools view --help 2>&1 | grep -q -- '-D.*STR:FILE' ; then
printf 'Error: The version of "samtools" (%s) should be 1.10 or higher (%s found).\n' \
"$(type samtools)" \
"$(samtools --version | head -n 1)" \
> /dev/stderr;
exit_code=2;
fi

return ${exit_code};
}



filter_bam_file_for_popscle_dsc_pileup () {
local input_bam_filename="${1}";
local barcodes_tsv_filename="${2}";
local vcf_filename="${3}";
local output_bam_filename="${4}";
local barcode_tag="${5:-CB}";

local exit_code=0;

if [ ${#@} -lt 4 ] ; then
printf 'Usage: filter_bam_file_for_popscle_dsc_pileup input_bam_filename barcodes_tsv_filename vcf_filename output_bam_filename [barcode_tag]\n\n';
printf 'Purpose: Filter BAM file for usage with popscle dsc-pileup by keeping reads:\n';
printf ' - which overlap with SNPs in the VCF file\n';
printf ' - and which have a cell barcode (default: "CB" tag) contained in the cell barcode list\n';
printf ' Keeping only relevant reads for popscle dsc-pileup can speedup it up quite significantly\n';
printf ' (depending on the reduction of the number of reads in the filtered BAM file vs original).\n\n';

return 1;
fi

if [ ! -f "${input_bam_filename}" ] ; then
printf 'Error: Input (CellRanger) BAM file "%s" could not be found.\n' "${input_bam_filename}" > /dev/stderr;
return 2;
fi

if [ ! -f "${barcodes_tsv_filename}" ] ; then
printf 'Error: File with barcodes "%s" could not be found.\n' "${barcodes_tsv_filename}" > /dev/stderr;
return 2;
fi

if [ ! -f "${vcf_filename}" ] ; then
printf 'Error: File with unique SNPs per sample "%s" could not be found.\n' "${vcf_filename}" > /dev/stderr;
return 2;
fi

if [ ${#barcode_tag} -ne 2 ] ; then
printf 'Error: Barcode tag "%s" should be 2 characters.\n' "${barcode_tag}" > /dev/stderr;
return 2;
fi

# Check if bedtools and samtools are in PATH.
if ! check_if_programs_exists ; then
return 2;
fi

# Create much smaller BAM file for dsc-pileup of popscle:
# - Convert VCF file with unique SNPs for each sample
# to a BED file and merge adjacent SNP regions to one.
# - Only include reads that contain a SNP position
# and which contain a cell barcode of interest.
if [ "${barcodes_tsv_filename%.gz}".gz = "${barcodes_tsv_filename}" ] ; then
# Barcodes file is compressed with gzip.
bedtools merge -i "${vcf_filename}" \
| samtools view\
-@ 8 \
--write-index \
-L - \
-D "${barcode_tag}":<(zcat "${barcodes_tsv_filename}") \
-o "${output_bam_filename}" \
"${input_bam_filename}";

# Check if any of the previous commands failed.
check_exit_codes;

exit_code=$?;
else
# Barcodes file is uncompressed.
bedtools merge -i "${vcf_filename}" \
| samtools view\
-@ 8 \
--write-index \
-L - \
-D "${barcode_tag}":"${barcodes_tsv_filename}" \
-o "${output_bam_filename}" \
"${input_bam_filename}";

# Check if any of the previous commands failed.
check_exit_codes;

exit_code=$?;
fi


return ${exit_code};
}



filter_bam_file_for_popscle_dsc_pileup "${@}";
206 changes: 206 additions & 0 deletions bin/sort_vcf_same_as_bam.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
#!/bin/bash
#
# Copyright (C): 2020-2021 - Gert Hulselmans
#
# Purpose: Sort VCF file in the same order as the BAM file, so it can be used with popscle.



# Function to check if any of the programs in a pipe failed.
check_exit_codes () {
local GET_PIPESTATUS="${PIPESTATUS[@]}";
local exit_code;

for exit_code in ${GET_PIPESTATUS} ; do
if [ ${exit_code} -ne 0 ] ; then
return ${exit_code};
fi
done

return 0;
}



# Check if necessary programs are installed.
check_if_programs_exists () {
local exit_code=0;

# Check if awk is installed.
if ! type awk > /dev/null 2>&1 ; then
printf 'Error: "awk" could not be found in PATH.\n' > /dev/stderr;
exit_code=2;
fi

# Check if bcftools is installed.
if ! type bcftools > /dev/null 2>&1 ; then
printf 'Error: "bcftools" could not be found in PATH.\n' > /dev/stderr;
exit_code=2;
fi

# Check if samtools is installed.
if ! type samtools > /dev/null 2>&1 ; then
printf 'Error: "samtools" could not be found in PATH.\n' > /dev/stderr;
exit_code=2;
fi

return ${exit_code};
}



# Get order of the contigs (chromosomes) and their length from the BAM header.
get_contig_order_from_bam () {
local bam_input_file="${1}";
local output_type="${2}";

if [ ${#@} -ne 2 ] ; then
printf 'Usage: get_contig_order_from_bam BAM_file output_type\n\n';
printf 'Arguments:\n';
printf ' - BAM_file: BAM file from which to get the contig order and contig lengths.\n';
printf ' - output_type:\n';
printf ' - "names": Return contig names.\n';
printf ' - "chrom_sizes": Return contig names and contig lengths.\n';
printf ' - "vcf": Return VCF header section for contigs.\n\n';
return 1;
fi

case "${output_type}" in
'names')
;;
'chrom_sizes')
;;
'vcf')
;;
*)
printf 'Error: output_type "%s" is not supported.\n' "${output_type}" > /dev/stderr;
return 1;
;;
esac

check_if_programs_exists || return $?;

# Get the order of the contigs from the BAM header.
samtools view -H "${bam_input_file}" \
| awk \
-F '\t' \
-v output_type="${output_type}" \
'
{
# Only look at sequence header fields.
if ($1 == "@SQ") {
contig_idx += 1;
contig_name = "";
contig_length = "";
# Extract contig (chromosome) name and contig (chromosome) length.
for (i = 2; i <= NF; i++) {
if ($i ~ /^SN:/) {
contig_name = substr($i, 4);
}
if ($i ~ /^LN:/) {
contig_length = substr($i, 4);
}
# Create contig order to name and contig order to length and vcf contig appings.
contig_idx_to_name[contig_idx] = contig_name;
contig_idx_to_length[contig_idx] = contig_length;
contig_idx_to_vcf_contig[contig_idx] = sprintf("##contig=<ID=%s,length=%s>", contig_name, contig_length);
}
}
} END {
if (contig_idx == 0) {
printf "Error: No \"@SQ\" header line found in BAM file.\n" > "/dev/stderr";
exit(1);
} else if (output_type == "names") {
contig_names = "";
for (contig_idx = 1; contig_idx <= length(contig_idx_to_name); contig_idx++) {
contig_names = contig_names " " contig_idx_to_name[contig_idx];
}
# Print all contig names (without leading space).
print substr(contig_names, 2);
} else if (output_type == "chrom_sizes") {
# Print all contig names with their length in a TAB separated fashion.
for (contig_idx = 1; contig_idx <= length(contig_idx_to_name); contig_idx++) {
print contig_idx_to_name[contig_idx] "\t" contig_idx_to_length[contig_idx];
}
} else if (output_type == "vcf") {
# Print VCF header section for contigs.
for (contig_idx = 1; contig_idx <= length(contig_idx_to_vcf_contig); contig_idx++) {
print contig_idx_to_vcf_contig[contig_idx];
}
}
}'

check_exit_codes;

return $?;
}



# Sort VCF file in the same order as the BAM file, so it can be used with popscle.
sort_vcf_same_as_bam () {
local bam_input_file="${1}";
local vcf_input_file="${2}";
local vcf_type="${3:-v}";

if [ ${#@} -lt 2 ] ; then
printf 'Usage: sort_vcf_same_as_bam BAM_file VCF_file [VCF_type]\n\n';
printf 'Arguments:\n';
printf ' - BAM_file: BAM file from which to get the contig order to sort the VCF file.\n';
printf ' - VCF_file: VCF file to sort by contig order as defined in the BAM file.\n';
printf ' - VCF_type: VCF ouput file type (default: same as input VCF file type):\n';
printf ' v: uncompressed VCF, z: compressed VCF,\n';
printf ' u: uncompressed BCF, b: compressed BCF\n\n';
printf 'Purpose:\n';
printf ' Sort VCF file in the same order as the BAM file, so it can be used with popscle.\n\n';
return 1;
fi

check_if_programs_exists || return $?;

# If VCF type is not specified, try to guess it from the filename extension.
if [ ${#@} -eq 2 ] ; then
if [ "${vcf_input_file%.vcf.gz}" != "${vcf_input_file}" ] ; then
vcf_type='z';
elif [ "${vcf_input_file%.bcf}" != "${vcf_input_file}" ] ; then
vcf_type='b';
fi
fi

# Sort VCF file by same chromosome order as BAM file.
cat <(
# Create new VCF header:
# - Get VCF header of VCF input file.
# - Remove all contig header lines and "#CHROM" line from the VCF header.
# - Append contig headers in the order they appear in the input BAM file.
# - Add "#CHROM" line as last line of the new VCF header.
bcftools view -h "${vcf_input_file}" \
| awk \
'
{
if ($1 !~ /^##contig=/ && $1 !~ /^#CHROM/) {
# Remove all contig header lines and "#CHROM" line.
print $0;
}
}' \
| cat \
- \
<(get_contig_order_from_bam "${bam_input_file}" 'vcf') \
<(bcftools view -h "${vcf_input_file}" | tail -n 1) \
) \
<(bcftools view -H -O v "${vcf_input_file}") \
| bcftools sort -O "${vcf_type}";

check_exit_codes;

return $?;
}



sort_vcf_same_as_bam "${@}";
2 changes: 2 additions & 0 deletions conda/bff.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ channels:
- conda-forge
- bioconda
dependencies:
- conda-forge::r-tidyverse
- conda-forge::r-base>=4.2
- r-seurat
- bioconductor-dropletutils
Expand All @@ -17,3 +18,4 @@ dependencies:
- conda-forge::r-here
- conda-forge::r-argparse
- conda-forge::r-dplyr

Loading

0 comments on commit d8b3a6f

Please sign in to comment.