make_fraglen_model.R

## get the empirical fragment length distributions from GEUVADIS samples
## (maybe 1 from each lab?)
## save empirical distribution as a dataset in Polyester package
## I worked in a folder with the same root directory as the "polyester" R package

library(ballgown)
load('fpkm.rda') #link: http://files.figshare.com/1625419/fpkm.rda

pd = pData(fpkm)

library(RCurl)
QCurl = getURL('https://raw.githubusercontent.com/alyssafrazee/ballgown_code/master/GEUVADIS_preprocessing/GD667.QCstats.masterfile.txt')
qcstats = read.table(textConnection(QCurl), sep='\t', header=TRUE)
pd$lab = qcstats$SeqLabNumber[match(pd$SampleID, rownames(qcstats))]

# samples to analyze: 1 from each lab
set.seed(14)
lab1id = sample(pd$dirname[pd$lab == 'F1'], size=1)
lab2id = sample(pd$dirname[pd$lab == 'F2'], size=1)
lab3id = sample(pd$dirname[pd$lab == 'F3'], size=1)
lab4id = sample(pd$dirname[pd$lab == 'F4'], size=1)
lab5id = sample(pd$dirname[pd$lab == 'F5'], size=1)
lab6id = sample(pd$dirname[pd$lab == 'F6'], size=1)
lab7id = sample(pd$dirname[pd$lab == 'F7'], size=1)
selected = c(lab1id, lab2id, lab3id, lab4id, lab5id, lab6id, lab7id)
selected
# "NA06985" "NA18858" "NA20772" "NA12144" "NA20815" "NA12776" "NA20542"
# 3 CEU, 1 YRI, 3 TSI

# download the bam files from each of those samples:
for(s in selected){
    system(paste0('wget http://www.ebi.ac.uk/arrayexpress/files/E-GEUV-6/',
        s, '_accepted_hits.bam'), wait=TRUE)
} #UGH this takes so long, about 20-40 minutes per file, depending on connection speed.
# could do in parallel
# (but more difficult to script) 

# run "CollectInsertSizeMetrics", from Picard:
system('sh insert_sizes.sh', wait=TRUE)
## SCRIPT CONTENTS:
# #!/bin/sh
# for sample in NA06985 NA18858 NA20772 NA12144 NA20815 NA12776 NA20542
# do
#     java -Xmx2g -jar picard-tools-1.121/CollectInsertSizeMetrics.jar HISTOGRAM_FILE=${sample}_histogram INPUT=${sample}_accepted_hits.bam OUTPUT=${sample}_metrics
# done
# takes about 25 minutes

# fit a distribution to the insert sizes:
library(IRanges)
library(logspline)
frag_sizes = Rle()
set.seed(212)
for(s in selected){
    ind = which(selected == s)
    metrics = file(paste0(s, '_metrics'))
    info = strsplit(readLines(metrics), '\t')
    close(metrics)
    dat = t(as.data.frame(info[-c(1:11, length(info))])) #extract sizes/counts
    rownames(dat) = NULL
    dat = matrix(as.numeric(dat), nrow=nrow(dat), ncol=ncol(dat))
    dat = as.data.frame(dat)
    names(dat) = c('size', 'count')
    allnums = Rle(values=dat$size, lengths=dat$count)
    samplednums = sort(sample(allnums, 1e5))
    frag_sizes = sort(c(frag_sizes, samplednums))
}
empirical_density = logspline(as.integer(frag_sizes), 
    lbound=min(frag_sizes), ubound=max(frag_sizes))
save(empirical_density, file='../polyester/data/empirical_density.rda', compress='xz')

# we can then draw from the empirical density in the fragmentation function!

# plot for the paper:
d = density(as.integer(frag_sizes))
pdf('empirical_density.pdf')
    x = seq(75, max(frag_sizes), by=1)
    plot(x, dnorm(x, 250, 25), col='blue', 
        type='l', lwd=2, xlab='Fragment Length', ylab='Density')
    lines(d, col='red', lwd=2)
    #with(d, polygon(x=c(min(frag_sizes), d$x, max(frag_sizes)), 
    #                y=c(0, d$y, 0), col="gray"))
    legend('topright', col=c('blue', 'red'), c('N(250, 25)', 'Empirical'), 
        lwd=c(2,2))
    title('Normal and empirical fragment length distributions')
dev.off()