-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path9.TopGO-analysis.R
149 lines (118 loc) · 6.03 KB
/
9.TopGO-analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
####TopGO-Analysis ####
#Preparation
library(topGO)
library(pgirmess)
library(xlsx)
library(dplyr)
library(tidyr)
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
setwd("~/GO-Analysis")
geneID2GO <- (readMappings("GO_Map"))
K27posGenes<-scan(
"Chrom_FeatureOverlap/H3K27me3+_genes.txt",
quiet=TRUE,
what=""
)
RY=c("cluster_1", "cluster_2", "cluster_3", "cluster_4") #Only contains clusters with high coverage
Telobox=c("cluster_1", "cluster_2", "cluster_3", "cluster_4")
Telolike=c("cluster_1", "cluster_2", "cluster_3", "cluster_4")
Gene <- list(K27posGenes)
Genebackground<-c("K27+_Genes")
TargetGeneClusters=list.files(pattern="\\GenesinClusters\\.txt$", recursive=TRUE) ##filename used from 8. GO_Preparation
#### Calculations ####
for (i in 1:length(TargetGeneClusters)) {
unfilteredTargetGenes <- read.table(TargetGeneClusters[[i]],sep="\t",header=TRUE, col.names = c("gene", "cluster")) ##file used from 8. GO_Preparation
splitfilename=strsplit(TargetGeneClusters[[i]], '[-_|.]')[[1]]
teloclustername=paste0(splitfilename[1],"_",splitfilename[2])
if ( splitfilename[1] == "RY" ) {
TargetGenes <- unfilteredTargetGenes %>% filter(cluster %in% RY)
} else if ( splitfilename[1] == "Telobox" | splitfilename[1] == "telobox") {
TargetGenes <- unfilteredTargetGenes %>% filter(cluster %in% Telobox)
} else if ( splitfilename[1] == "Telolike" | splitfilename[1] == "telolike" ) {
TargetGenes <- unfilteredTargetGenes %>% filter(cluster %in% Telolike)
}
####predefined list of interesting genes
for (g in 1) {
geneNames <- Gene[[g]] #names(geneID2GO)
myInterestingGenes <- c(TargetGenes$gene)
geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
names(geneList) <- geneNames
##store all data in sampleGOdata to faciliate access to identifiers, annotation
##and basic data statistics
sampleMFGOdata <- new("topGOdata", ontology = "MF",
allGenes = geneList, annot = annFUN.gene2GO,
gene2GO=geneID2GO)
sampleCCGOdata <- new("topGOdata", ontology = "CC",
allGenes = geneList, annot = annFUN.gene2GO,
gene2GO=geneID2GO)
sampleBPGOdata <- new("topGOdata", ontology = "BP",
allGenes = geneList, annot = annFUN.gene2GO,
gene2GO=geneID2GO)
###########Fisher-Test
#Selecting 'algorithm=classic' means that the GO hierarchy isn't taken into account,
#so each GO term is tested independently (over-representation enrichment).
#limitation: all genes annotated to a GO term will be automatically annotated to its parents as well
#therefore a GO term might look enriched just because its children are enriched.
#Thus, it is important that GO hierarchy is taken into account (conditional enrichment) to avoid redundancy.
resultFisMF <- runTest(sampleMFGOdata, algorithm='weight01', statistic = "fisher")
resultFisCC <- runTest(sampleCCGOdata, algorithm='weight01', statistic = "fisher")
resultFisBP <- runTest(sampleBPGOdata, algorithm='weight01', statistic = "fisher")
######## CreateTable######
concepts <- list("BP"=resultFisBP, "CC"=resultFisCC, "MF"=resultFisMF)
for (a in 1:length(concepts)) {
Bioconcept = concepts[[a]]
ABR = names(concepts[a])
if ( ABR == "MF" ) {
GOdata = sampleMFGOdata
} else if ( ABR == "CC" ) {
GOdata = sampleCCGOdata
} else if ( ABR == "BP" ) {
GOdata = sampleBPGOdata
}
###Calculate FDR
allGO = usedGO(object = GOdata)
score <- data.frame(GO.ID=names(score(Bioconcept)),
pvalue=score(Bioconcept), row.names=NULL)
allRes <-GenTable(GOdata, weightFisher=Bioconcept, orderBy='weightFisher', topNodes = length(allGO))
allRes <- left_join(allRes, score, by = "GO.ID")
p.adj=p.adjust(allRes$pvalue, method="BH",n=length(allRes$pvalue))
allRes$p.adj=p.adj
#get list of significant GO after multiple testing correction 0.05)
results.BH=allRes[which(allRes$p.adj <= 0.05),]
#get list of significant GO before multiple testing correction
results.p <- allRes[which(allRes$pvalue<=0.05),]
#create columns for table
GO_biological_process=paste0(results.BH$Term,"(",results.BH$GO.ID,")")
GO_biological_process=gsub(" ", "_", GO_biological_process, fixed = TRUE)
Fold_enrichment = results.BH$Significant/results.BH$Expected
FDR=results.BH$p.adj
Gene_number=results.BH$Significant
Group=rep(teloclustername,times=nrow(results.BH))
##export table
write.xlsx(allRes, file = paste0(ABR,"_",teloclustername,"_",Genebackground[g],".xlsx"), sheetName="Full_Results")
print ("write xlsx done")
sink (file = paste0(ABR,"_",teloclustername,"_",Genebackground[g],".txt"))
print(Bioconcept)
sink()
if ( nrow(results.p) == 0) {
if (nrow(results.BH) == 0) {
next
} else {
table=data.frame(GO_biological_process, Gene_number, Fold_enrichment, FDR)
write.xlsx(results.BH, file = paste0(ABR,"_",teloclustername,"_",Genebackground[g],".xlsx"), sheetName="BH_results", append=TRUE)
write.xlsx(table, file = paste0(ABR,"_",teloclustername,"_",Genebackground[g],".xlsx"), sheetName="FDR<0.01", append=TRUE )
}
} else {
write.xlsx(results.p, file = paste0(ABR,"_",teloclustername,"_",Genebackground[g],".xlsx"), sheetName="pvalue_results",append=TRUE)
if (nrow(results.BH) == 0) {
next
} else {
table=data.frame(GO_biological_process, Gene_number, Fold_enrichment, FDR)
write.xlsx(results.BH, file = paste0(ABR,"_",teloclustername,"_",Genebackground[g],".xlsx"), sheetName="BH_results", append=TRUE)
write.xlsx(table, file = paste0(ABR,"_",teloclustername,"_",Genebackground[g],".xlsx"), sheetName="FDR<0.01", append=TRUE )
}
}
}
}
}