-
Notifications
You must be signed in to change notification settings - Fork 0
/
met1_vs_col0.R
178 lines (141 loc) · 9.71 KB
/
met1_vs_col0.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# Load in met1 and wt methylation data from Choi et al, 2021; compile mean values per gene
pathroot="X:/Daniel-Zilberman"
# Source directory containing tables downloaded from GEO
source_dir = paste0(pathroot,"/Projects/Jay-1001_methylomes/0-reference/supporting/met1")
# Working directory where outputs will be directed to
setwd(dir = paste0(pathroot,"/Projects/Jay-1001_methylomes/0-reference/supporting/met1/"))
reference_fasta = paste0(pathroot,"/Reference_data/Genomes/Arabidopsis/TAIR10/TAIR10_Chr.all.fasta") # TAIR10 genome assembly This never gets used at the moment
reference_CG_sites = paste0(pathroot,"/Reference_data/Genomes/Arabidopsis/TAIR10/TAIR10_Chr.all.CG_sites.tsv") # We prepared this earlier using a perl script to parse the TAIR10 genome assembly
reference_gff = paste0(pathroot,"/Reference_data/Genomes/Arabidopsis/Araport11/Araport11_GFF3_genes_transposons.201606.gff") # AraPort11 annotation
reference_exons = paste0(pathroot,"/Reference_data/Genomes/Arabidopsis/Araport11/GFF/Araport11_GFF3_genes_transposons.201606-exon.gff") # Prepared from AraPort11 annotation by a script in bs_sequel
reference_introns = paste0(pathroot,"/Reference_data/Genomes/Arabidopsis/Araport11/GFF/Araport11_GFF3_genes_transposons.201606-intron.gff") # Prepared from AraPort11 annotation by a script in bs_sequel
reference_5UTR = paste0(pathroot,"/Reference_data/Genomes/Arabidopsis/Araport11/GFF/Araport11_GFF3_genes_transposons.201606-five_prime_UTR.gff") # Prepared from AraPort11 annotation by a script in bs_sequel
reference_3UTR = paste0(pathroot,"/Reference_data/Genomes/Arabidopsis/Araport11/GFF/Araport11_GFF3_genes_transposons.201606-three_prime_UTR.gff") # Prepared from AraPort11 annotation by a script in bs_sequel
library(stringr)
# Load the annotation
annot_gff = read.delim(reference_gff, header=F, comment.char="#")
gff.exons = read.delim(reference_exons, header=F, comment.char="#")
gff.introns = read.delim(reference_introns, header=F, comment.char="#")
gff.5UTR = read.delim(reference_5UTR, header=F, comment.char="#")
gff.3UTR = read.delim(reference_3UTR, header=F, comment.char="#")
# Grab the portion relating to genes
gff.genes = annot_gff[annot_gff[,3]=="gene",]
# Grab the portion relating to transposons
gff.transposons = annot_gff[annot_gff[,3]=="transposable_element",]
rm(annot_gff)
# Convert chromosome names to uppercase to match previous objects
gff.genes$V1=toupper(gff.genes$V1)
gff.transposons$V1=toupper(gff.transposons$V1)
gff.exons$V1=toupper(gff.exons$V1)
gff.introns$V1=toupper(gff.introns$V1)
gff.5UTR$V1=toupper(gff.5UTR$V1)
gff.3UTR$V1=toupper(gff.3UTR$V1)
# The following line sometimes fails if stringi has not been installed correctly. Not sure why. Fix is to install.packages("stringi") and to allow RStudio to restart R.
# Grab gene IDs from descriptive field
gff.genes$gene_ID=str_split_fixed(str_split_fixed(gff.genes$V9, ';',3)[,1],"=",2)[,2]
gff.transposons$gene_ID=str_split_fixed(str_split_fixed(gff.transposons$V9, ';',3)[,1],"=",2)[,2]
gff.exons$gene_ID=str_split_fixed(str_split_fixed(gff.exons$V9, ';',3)[,1],"=",2)[,2]
gff.introns$gene_ID=str_split_fixed(str_split_fixed(gff.introns$V9, ';',3)[,1],"=",2)[,2]
gff.5UTR$gene_ID=str_split_fixed(str_split_fixed(gff.5UTR$V9, ';',3)[,1],"=",2)[,2]
gff.3UTR$gene_ID=str_split_fixed(str_split_fixed(gff.3UTR$V9, ';',3)[,1],"=",2)[,2]
#Gene names are tricky - need to find the element containing "symbol="
gff.genes=within(gff.genes, {gene_name=str_split_fixed(str_split_fixed(V9, ';',3)[,2],"=",2)[,2]})
gff.transposons=within(gff.transposons, {gene_name=str_split_fixed(str_split_fixed(V9, ';',3)[,3],"=",2)[,2]})
# Fix the gene names in exon, intron and UTRs
gff.exons$exon_ID = gff.exons$gene_ID
gff.exons$gene_ID = substr(gff.exons$gene_ID,1,9)
gff.introns$intron_ID = gff.introns$gene_ID
gff.introns$gene_ID = substr(gff.introns$gene_ID,1,9)
gff.5UTR$UTR_ID = gff.5UTR$gene_ID
gff.5UTR$gene_ID = substr(gff.5UTR$gene_ID,1,9)
gff.3UTR$UTR_ID = gff.3UTR$gene_ID
gff.3UTR$gene_ID = substr(gff.3UTR$gene_ID,1,9)
# Add the exon and intron numbers
gff.exons$exon_no = as.numeric(str_split_fixed(gff.exons$exon_ID, ':',3)[,3])
gff.introns$exon_no = as.numeric(str_split_fixed(gff.introns$intron_ID, ':',3)[,3])
# Find out how many exons each gene has
#install.packages("sqldf")
# commented out as sqldf is not installed on cluster R_zg-1.0.0
#library(sqldf)
#gff.genes=merge(gff.genes, sqldf('SELECT genes.gene_id, MAX(exon_no) AS no_exons FROM [gff.exons] exons, [gff.genes] genes WHERE genes.gene_ID=exons.gene_ID GROUP BY genes.gene_id'), by="gene_ID", all=TRUE)
library(GenomicRanges)
# Make a GRanges for gene space
gene_ranges=makeGRangesFromDataFrame(df = gff.genes, start.field = "V4", end.field = "V5",seqnames.field = "V1")
# fix chromosome IDs
levels(gene_ranges@seqnames@values) = substr(as.character(levels(gene_ranges@seqnames@values)),4,4)
gene_ranges@seqinfo@seqnames = levels(gene_ranges@seqnames@values)
# load in 50bp window methylation data
col0_CG = read.table(file="GSE122394_col.bisulfite.CG.w50.gff", sep="\t", header=TRUE)
col0_CHG = read.table(file="GSE122394_col.bisulfite.CHG.w50.gff", sep="\t", header=TRUE)
col0_CHH = read.table(file="GSE122394_col.bisulfite.CHH.w50.gff", sep="\t", header=TRUE)
met1_CG = read.table(file="GSE122394_met1.bisulfite.CG.w50.gff", sep="\t", header=TRUE)
met1_CHG = read.table(file="GSE122394_met1.bisulfite.CHG.w50.gff", sep="\t", header=TRUE)
met1_CHH = read.table(file="GSE122394_met1.bisulfite.CHH.w50.gff", sep="\t", header=TRUE)
colnames(col0_CG) = c("chr","dot","context","start","end","fraction","dot2","dot3","stuff")
colnames(col0_CHG) = c("chr","dot","context","start","end","fraction","dot2","dot3","stuff")
colnames(col0_CHH) = c("chr","dot","context","start","end","fraction","dot2","dot3","stuff")
colnames(met1_CG) = c("chr","dot","context","start","end","fraction","dot2","dot3","stuff")
colnames(met1_CHG) = c("chr","dot","context","start","end","fraction","dot2","dot3","stuff")
colnames(met1_CHH) = c("chr","dot","context","start","end","fraction","dot2","dot3","stuff")
col0_CG = col0_CG[,c(1,4,5,6)]
col0_CHG = col0_CHG[,c(1,4,5,6)]
col0_CHH = col0_CHH[,c(1,4,5,6)]
met1_CG = met1_CG[,c(1,4,5,6)]
met1_CHG = met1_CHG[,c(1,4,5,6)]
met1_CHH = met1_CHH[,c(1,4,5,6)]
col0_CG$fraction[col0_CG$fraction=="."] = NA
col0_CHG$fraction[col0_CHG$fraction=="."] = NA
col0_CHH$fraction[col0_CHH$fraction=="."] = NA
met1_CG$fraction[met1_CG$fraction=="."] = NA
met1_CHG$fraction[met1_CHG$fraction=="."] = NA
met1_CHH$fraction[met1_CHH$fraction=="."] = NA
col0_CG$fraction = as.numeric(col0_CG$fraction)
col0_CHG$fraction = as.numeric(col0_CHG$fraction)
col0_CHH$fraction = as.numeric(col0_CHH$fraction)
met1_CG$fraction = as.numeric(met1_CG$fraction)
met1_CHG$fraction = as.numeric(met1_CHG$fraction)
met1_CHH$fraction = as.numeric(met1_CHH$fraction)
all_samples_meth = col0_CG
#all_samples_meth = merge(col0_CG, col0_CHG, col0_CHH, met1_CG, met1_CHG, met1_CHH, by=c("chr","start","end"))
all_samples_meth = merge(col0_CG, col0_CHG, by=c(1:3))
all_samples_meth = merge(all_samples_meth, col0_CHH, by=c(1:3))
all_samples_meth = merge(all_samples_meth, met1_CG, by=c(1:3))
all_samples_meth = merge(all_samples_meth, met1_CHG, by=c(1:3))
all_samples_meth = merge(all_samples_meth, met1_CHH, by=c(1:3))
colnames(all_samples_meth)[4:9] = c("col0_CG","col0_CHG","col0_CHH","met1_CG","met1_CHG","met1_CHH")
meth_ranges = makeGRangesFromDataFrame(all_samples_meth, start.field = "start", end.field = "end", seqnames.field = "chr")
# fix chromosome IDs
levels(meth_ranges@seqnames@values) = substr(as.character(levels(meth_ranges@seqnames@values)),4,4)
meth_ranges@seqinfo@seqnames = levels(meth_ranges@seqnames@values)
gene_meth_olaps = findOverlaps(gene_ranges, meth_ranges)
for (this_col in 4:9) {
gff.genes = cbind(gff.genes, rep(NA, nrow(gff.genes)))
colnames(gff.genes)[11+this_col-3] = colnames(all_samples_meth)[this_col]
for (this_gene in 1:nrow(gff.genes)) {
gff.genes[this_gene, 11+this_col-3] = mean(all_samples_meth[subjectHits(gene_meth_olaps[queryHits(gene_meth_olaps)==this_gene]),this_col], na.rm=TRUE)
}
}
write.table(gff.genes[,c(10,12:17)], file="gene_col0_met1_mC.txt", sep="\t",col.names=TRUE, row.names=FALSE)
library(rtracklayer)
#[3:53 PM] Jaemyung Choi (JIC)
# log2 ratios of output to input DNA; output from deeptools2 bamcompare
col0_H3K9me2 = rtracklayer::import("GSM3465913_col.H3K9me2.w50.bw", format="BigWig")
met1_H3K9me2 = rtracklayer::import("GSM3465914_met1.H3K9me2.w50.bw", format="BigWig")
# fix chromosome IDs
levels(col0_H3K9me2@seqnames@values) = substr(as.character(levels(col0_H3K9me2@seqnames@values)),4,4)
col0_H3K9me2@seqinfo@seqnames = levels(col0_H3K9me2@seqnames@values)
levels(met1_H3K9me2@seqnames@values) = substr(as.character(levels(met1_H3K9me2@seqnames@values)),4,4)
met1_H3K9me2@seqinfo@seqnames = levels(met1_H3K9me2@seqnames@values)
gene_chip_olaps = findOverlaps(gene_ranges, col0_H3K9me2)
gff.genes = cbind(gff.genes, rep(NA, nrow(gff.genes)))
colnames(gff.genes)[ncol(gff.genes)] = "col0_H3K9me2"
for (this_gene in 1:nrow(gff.genes)) {
gff.genes[this_gene, ncol(gff.genes)] = mean(as.data.frame(col0_H3K9me2)[subjectHits(gene_chip_olaps[queryHits(gene_chip_olaps)==this_gene]),"score"], na.rm=TRUE)
}
gene_chip_olaps = findOverlaps(gene_ranges, met1_H3K9me2)
gff.genes = cbind(gff.genes, rep(NA, nrow(gff.genes)))
colnames(gff.genes)[ncol(gff.genes)] = "met1_H3K9me2"
for (this_gene in 1:nrow(gff.genes)) {
gff.genes[this_gene, ncol(gff.genes)] = mean(as.data.frame(met1_H3K9me2)[subjectHits(gene_chip_olaps[queryHits(gene_chip_olaps)==this_gene]),"score"], na.rm=TRUE)
}
write.table(gff.genes[,c(10,18:19)], file="gene_col0_met1_h3k9ME2.txt", sep="\t",col.names=TRUE, row.names=FALSE)