Skip to content
This repository has been archived by the owner on Jun 18, 2024. It is now read-only.

Add realigner tools #4

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions tools/bam2fastq2
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Converts a paired end bam into two fastqs
set -e
bam=$1
fastq1=$2
fastq2=$3
samtools sort -n $bam $bam.qsort
bedtools bamtofastq -i $bam.qsort.bam -fq $fastq1 -fq2 $fastq2
gzip $fastq1
gzip $fastq2
9 changes: 9 additions & 0 deletions tools/bwa.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash

set -e

# Usage: ./bwa.sh <bwa executable location> <reference genome fasta> <forward fastq> <reverse fastq> <bam destination (without the .bam extension)>

bamfile=`mktemp --tmpdir=$(pwd) -t XXXXXX.bam.tmp`
$1 mem -R '@RG\tID:foo\tSM:bar' -M -c 16000 -t 25 $2 $3 $4 | samtools view -uSb /dev/stdin > $bamfile
samtools sort -m 4G $bamfile $5
45 changes: 45 additions & 0 deletions tools/mate_dupe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""If a read fragment spans the boundary of a simulated SV then potentially one mate will be deleted or duplicated and the other will not.

In this state the bam cannot be converted back into 2 fastq files because there's now a mismatch between the mates.

This script is a bit of a hack in that it just fills in the missing mate for all unpaired mates, but it makes it possible to re generate a simulated fastq.
"""
from collections import defaultdict
import copy
import pysam
import sys


def main():
inbam, out_bam = sys.argv[1:]
# First pass, make set of synthetics where only one side is represented
syn_set = set()
inbamf = pysam.AlignmentFile(inbam, 'rb')
for read in inbamf.fetch():
if '-' not in read.qname:
# not synthetic
continue
first = read.is_read1
fid = (read.qname, first)
assert fid not in syn_set, 'i dont get it'
if (read.qname, not first) in syn_set:
syn_set.remove((read.qname, not first))
else:
syn_set.add(fid)
# Make dict of originals to synthetics
qname2synthetics = defaultdict(set)
for qname, first in syn_set:
original_qname = qname.split('-')[0]
qname2synthetics[original_qname].add((qname, first))
# pass through bam and fill in the needed reads
with pysam.AlignmentFile(out_bam, 'wb', template=inbamf) as outbamf:
for read in inbamf.fetch():
if read.qname not in qname2synthetics:
outbamf.write(read)
continue
for alt_qname, first in qname2synthetics[read.qname]:
if read.is_read1 != first:
new_read = copy.deepcopy(read)
new_read.qname = alt_qname
outbamf.write(new_read)
main()
14 changes: 14 additions & 0 deletions tools/realign_bam
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Takes a simulated bam file and clean up a bit at the edges and realign to genome (to get alignment right)

set -e
bam=$1

python mate_dupe.py ${bam} ${bam%.bam}.mateduped.bam

bam2fastq2 ${bam%.bam}.mateduped.bam fq1 fq2

bwa.sh `which bwa` /resources/GRCh37.p12.fa fq1.gz fq2.gz ${bam%.bam}.realigned

samtools index ${bam%.bam}.realigned.bam

rm fq1.gz fq2.gz