-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathGSE94464_raw_counts_DEG_analysis.R
126 lines (104 loc) · 3.97 KB
/
GSE94464_raw_counts_DEG_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Set Working Directory
setwd("C:/Users/comp_/OneDrive/Documents/IOB/tc_samples")
# Install packages if required
install.packages("dplyr")
#load libraries
library(dplyr)
library(dplyr)
library(tidyverse)
library(GEOquery)
#Upload data
data <- read.csv("GSE94464_norm_counts_FPKM_GRCh38.p13_NCBI.tsv", sep = "\t")
dim(data)
head(data)
#get metadata
gse<-getGEO(GEO="GSE94464", GSEMatrix=TRUE)
gse
metadata <- pData(phenoData(gse[[1]]))
head(metadata)
# data processing
# select desired columns
metadata.subset <- select(metadata,c(1,10,11,17))
metadata.modified<-metadata %>%
select(1,10,11,17) %>%
# rename columns
rename(tissue=characteristics_ch1)%>%
rename(metastasis=characteristics_ch1.1)%>%
# remove characters that are not required
mutate(tissue=gsub("cell line: ","", tissue))%>%
mutate(tissue=gsub("Anaplastic thyroid cancer cell line","Anaplastic thyroid cancer", tissue))%>%
mutate(tissue=gsub("papillary thyroid cancer cell line","Papillary thyroid cancer", tissue))%>%
mutate(metastasis=gsub("tissue: ","", metastasis))
# reshaping data
data.long<- data %>%
gather(key="samples",value="FPKM",-GeneID)
# convert metadata index to column so that sample ids can be accessed
metadata.modified$index <- rownames(metadata.modified)
# join data frames = data.long +metadata.modified
data.long <- data.long %>%
left_join(.,metadata.modified,by=c("samples"="index"))
# explore data
head(data.long)
#getting gene names for gene ids
if (!requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
BiocManager::install("org.Hs.eg.db")
}
library(org.Hs.eg.db)
data.long$GeneID <- as.character(data.long$GeneID)
data.long$GeneName <- mapIds(org.Hs.eg.db, keys = data.long$GeneID, column = "SYMBOL", keytype = "ENTREZID", multiVals = "first")
head(data.long)
# data analysis
data.long %>%
filter(GeneName=="BRAF"|GeneName=="TP53") %>%
group_by(GeneName, tissue) %>%
summarize(mean_FPKM = mean(FPKM),median_FPKM=median(FPKM))%>%
arrange(mean_FPKM)
#data visualization
# 1.Bar Plot
ba<-data.long %>%
filter(GeneName == "BRAF") %>%
ggplot(.,aes(x=samples,y=FPKM, fill=tissue)) +
geom_col() + theme(legend.position = "bottom")+
ggtitle("Expression of BRAF Gene Across Samples")
ggsave(ba,filename="barplot.png", width=12,height=8)
# 2.Density plot
d<-data.long %>%
filter(GeneName == "BRAF")%>%
ggplot(.,aes(x=FPKM,fill=tissue))+
ggtitle("Distribution of BRAF Gene Expression Across Different Tissues") +
geom_density(alpha=0.5)
ggsave(d,filename="density.png", width=10,height=8)
# 3. Box Plot
b<-data.long %>%
filter(GeneName =="BRAF")%>%
ggplot(.,aes(x=tissue,y=FPKM))+
geom_boxplot()+ theme(legend.position = "bottom")+
ggtitle("Variation in BRAF Gene Expression by Tissue Type")
ggsave(b,filename="boxplot.png", width=12,height=8)
# 4. violin plot
v<-data.long %>%
filter(GeneName =="BRAF")%>%
ggplot(.,aes(x=tissue,y=FPKM,fill=tissue))+
geom_violin()+ theme(legend.position = "bottom")+
ggtitle("Distribution and Density of BRAF Gene Expression Across Tissues")
ggsave(v,filename="violin.png", width=10,height=8)
# remove Gene Id column to avoid getting null values during scatter plotting
data.long <- data.long[, !(names(data.long) %in% "GeneID")]
# 5. Scatter Plot
s<-data.long %>%
filter(GeneName =="BRAF" | GeneName == "NRAS")%>%
spread(key=GeneName, value=FPKM)%>%
ggplot(., aes(x=BRAF, y=NRAS, color=tissue))+
geom_point()+
geom_smooth(method="lm",se=FALSE)+
ggtitle("Correlation Between BRAF and NRAS Gene Expression Across Tissues")
ggsave(s,filename="scatterplot.png", width=10,height=8)
# 6. HeatMap
genes.of.interest <- c("BRAF","TP53","NRAS","TERT","HRAS")
p <- data.long %>%
filter(GeneName %in% genes.of.interest)%>%
ggplot(.,aes(x=samples,y=GeneName, fill=FPKM))+
geom_tile()+
scale_fill_gradient(low = "white",high="red")+
ggtitle("Heatmap of BRAF,TP53,NRAS,TERT,HRAS")
ggsave(p,filename="heatmap.png", width=12,height=8)