figures_and_tables/Figure_5.Rmd

---
title: "R Notebook"
output: html_notebook
---
# Define the gene of interest
```{r}
# Gene symbol whose transcripts are plotted. Only the genes listed in
# `supported_genes` have an ONT transcript selection and a figure height
# defined further down in this notebook, so anything else is rejected here
# instead of failing (or silently plotting every ONT transcript) later on.
supported_genes <- c("SAMD11", "SLC24A1", "IMPDH1")
gene_of_interest <- "SLC24A1"
if (!(gene_of_interest %in% supported_genes)) {
  stop(
    "Unsupported gene_of_interest '", gene_of_interest, "'; expected one of: ",
    paste(supported_genes, collapse = ", "),
    call. = FALSE
  )
}
```


# 1. Update R and install the necessary packages
```{r}
#install.packages('devtools') #assuming it is not already installed
#install_github('andreacirilloac/updateR')
library(devtools)
library(updateR)
#updateR()()

# Install ggtranscript
# devtools::install_github("dzhang32/ggtranscript")
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("rtracklayer")

library(magrittr)
library(dplyr)
library(ggplot2)
library(ggtranscript)
library(rtracklayer)
library(stringr)
library(ggpubr)
library(tidyr)
```


# Import the gtf 
```{r}
# IsoQuant transcript models, read from GTF and converted to a tibble
gtf <- dplyr::as_tibble(
  rtracklayer::import(
    "../data/transcriptomics/isoquant/isoquant_aln.sorted.transcript_models.gtf"
  )
)

# CPAT GTF carrying the CDS features (only novel ORFs), converted to a tibble
gtf_cds <- dplyr::as_tibble(
  rtracklayer::import("../data/proteomics/isoquant/hnr_50_with_cds_filtered.gtf")
)
```

# Modify the columns of the CDS gtf file
```{r}
# Split the pipe-delimited transcript_id into four fields and store them as
# the columns gene, transcript, orf_class and CPM.
gtf_cds[c("gene", "transcript", "orf_class", "CPM")] <- str_split_fixed(
  gtf_cds$transcript_id,
  "\\|",
  4
)

# Derive transcript_class from the class tag found in transcript_id; IDs that
# contain none of the tags get NA.
# NOTE(review): every ID containing "NNIC" also contains "NIC", and "NIC" is
# tested first, so the "NNIC" branch is never reached. The order is kept as is
# to preserve the current output; reordering changes the classes the plots
# receive, so confirm the fill palette before fixing it.
gtf_cds <- dplyr::mutate(
  gtf_cds,
  transcript_class = dplyr::case_when(
    grepl("NIC", transcript_id, fixed = TRUE) ~ "NIC",
    grepl("NNIC", transcript_id, fixed = TRUE) ~ "NNIC",
    grepl("FSM", transcript_id, fixed = TRUE) ~ "FSM"
  )
)
```

# Filter for the gene of interest
```{r}
# Keep the CDS-GTF rows whose gene_id equals the gene of interest, then reduce
# them to the coordinate and classification columns used for plotting.
goi_annotation_from_gtf <- gtf_cds %>%
  dplyr::filter(!is.na(gene_id) & gene_id == gene_of_interest) %>%
  dplyr::select(
    seqnames, start, end, strand, type,
    gene_id, transcript, orf_class, transcript_class
  )

```

# Read gencode gtf
```{r}
# Import the gene-specific GENCODE GTF
gencode_path <- paste0("../data/ref_genome/", gene_of_interest, "_gencode.gtf")

# Keep the annotation columns needed for plotting, renaming gene_name to
# gene_id and transcript_id to transcript so the table lines up with the
# other annotation tables. Then keep only rows of the gene of interest and
# tag them as GENCODE.
gencode <- rtracklayer::import(gencode_path) %>%
  dplyr::as_tibble() %>%
  dplyr::select(
    seqnames, start, end, strand, type,
    gene_id = gene_name,
    transcript = transcript_id,
    transcript_type
  ) %>%
  dplyr::filter(gene_id == gene_of_interest) %>%
  dplyr::mutate(orf_class = "GENCODE", transcript_class = "GENCODE")

# Every row of the CDS-GTF table is labelled protein_coding
goi_annotation_from_gtf <- dplyr::mutate(
  goi_annotation_from_gtf,
  transcript_type = "protein_coding"
)

head(gencode)
```

# Load ONT ORFs
```{r}

# Import the ONT GTF, keep the coordinate columns (transcript_id becomes
# transcript) and tag every row as ONT / protein_coding.
ont_cds <- rtracklayer::import("../data/transcriptomics/nanopore/ont.gtf") %>%
  dplyr::as_tibble() %>%
  dplyr::select(
    seqnames, start, end, strand, type,
    gene_id,
    transcript = transcript_id
  ) %>%
  dplyr::mutate(
    orf_class = "ONT",
    transcript_class = "ONT",
    transcript_type = "protein_coding"
  )

# Keep only the ONT entries hand-picked for the gene of interest. They are
# selected through the gene_id column of the ONT GTF; for any other gene the
# table is left unfiltered.
if (gene_of_interest == "SLC24A1") {
  ont_cds <- dplyr::filter(
    ont_cds,
    gene_id %in% c("TRANSCRIPT24.CHR15.NIC_ORF_1")
  )
} else if (gene_of_interest == "IMPDH1") {
  ont_cds <- dplyr::filter(
    ont_cds,
    gene_id %in% c("TRANSCRIPT115.CHR7.NNIC_ORF_1")
  )
} else if (gene_of_interest == "SAMD11") {
  ont_cds <- dplyr::filter(
    ont_cds,
    gene_id %in% c("TRANSCRIPT79.CHR1.NNIC_ORF_1", "TRANSCRIPT30.CHR1.NNIC_ORF_1")
  )
}

head(ont_cds)


```
 
# Combine the GENCODE, PacBio and ONT transcripts
```{r}

# Stack the GENCODE, PacBio (CDS GTF) and ONT annotation tables
combined <- rbind(gencode, goi_annotation_from_gtf, ont_cds)

# Rows without a gene_id are dropped for the downstream plotting steps
goi_combined <- combined[!is.na(combined$gene_id), ]

# Show the stacked table (before the gene_id filter)
combined
```

# Prepare the data for the ggtranscript plot
```{r}
# Split the combined annotation into exon and CDS ranges
goi_exons <- dplyr::filter(goi_combined, type == "exon")
goi_cds <- dplyr::filter(goi_combined, type == "CDS")

# Exons of transcripts labelled protein_coding
goi_exons_prot_cod <- dplyr::filter(goi_exons, transcript_type == "protein_coding")

# CDS variant in which the range with the largest end coordinate of each
# transcript is extended by 3 bp, so that the CDS covers the stop codon.
# NOTE(review): this table is not passed to add_utr() below, which receives
# goi_cds -- confirm whether that is intended.
goi_cds_w_stop <- goi_cds %>%
  dplyr::group_by(transcript) %>%
  dplyr::mutate(end = ifelse(end == max(end), end + 3, end)) %>%
  dplyr::ungroup()

# add_utr() adds the ranges that represent the UTRs
goi_cds_utr <- add_utr(goi_exons, goi_cds, group_var = "transcript")

# Shorten the gaps between exons so the exon structure is easier to compare
goi_cds_utr_rescaled <- goi_cds_utr %>%
  shorten_gaps(
    introns = to_intron(goi_cds_utr, "transcript"),
    group_var = "transcript"
  )
```

# Format the plot
```{r}
# Shared ggplot2 theme for both figure panels: no legend, white panel with a
# thin black border and grey major grid lines, 10 pt black text, and no y-axis
# title.
# Fixes relative to the previous version:
#   * legend.position uses the documented value "none" (was "None").
#   * margin() receives its four sides as separate arguments; previously one
#     length-4 vector was passed as `t`, which only worked by accident.
# NOTE(review): the `size` argument of element_rect()/element_line() is
# reported as deprecated in favour of `linewidth` by recent ggplot2 releases;
# it is kept so older installations still work -- confirm the target version.
formatting <- theme(
  legend.position = "none",
  panel.border = element_rect(colour = "black", fill = NA, size = 0.5),
  panel.background = element_rect(fill = "white"),
  panel.grid.major.x = element_line(size = 0.25, color = "grey"),
  panel.grid.major.y = element_line(size = 0.25, color = "grey"),
  axis.text.x = element_text(color = "black", size = 10),
  axis.text.y = element_text(color = "black", size = 10),
  axis.title.x = element_text(color = "black", size = 10, face = "bold"),
  axis.title.y = element_blank(),
  plot.title = element_text(color = "black", size = 10),
  # top, right, bottom, left
  plot.margin = margin(t = 0.05, r = 0.2, b = 0.05, l = 0.05, unit = "cm"),
  plot.tag = element_text(color = "black", size = 10, face = "bold")
)

```

# Prepare the data for the transcript plot
```{r}
# Exon and CDS ranges of the combined annotation
goi_exons <- goi_combined[goi_combined$type == "exon", ]
goi_cds <- goi_combined[goi_combined$type == "CDS", ]

# Exons of transcripts labelled protein_coding
goi_exons_prot_cod <- goi_exons %>%
  dplyr::filter(transcript_type == "protein_coding")

# CDS variant with 3 bp added to the range with the largest end coordinate of
# each transcript, so that the CDS includes the stop codon.
# NOTE(review): add_utr() below is given goi_cds, not this table -- confirm.
goi_cds_w_stop <- goi_cds %>%
  dplyr::group_by(transcript) %>%
  dplyr::mutate(end = ifelse(end == max(end), end + 3, end)) %>%
  dplyr::ungroup()

# UTR ranges are added by add_utr()
goi_cds_utr <- add_utr(goi_exons, goi_cds, group_var = "transcript")

# Shorten the gaps between exons for a compact plot
goi_cds_utr_rescaled <- shorten_gaps(
  exons = goi_cds_utr,
  introns = to_intron(goi_cds_utr, "transcript"),
  group_var = "transcript"
)

# Fill in transcript_class on the UTR rows (used for colouring), based on the
# transcript name. Precedence when several patterns match: ENST, then ORF,
# then TRANSCRIPT. All other rows keep their existing transcript_class.
goi_cds_utr_rescaled <- goi_cds_utr_rescaled %>%
  dplyr::mutate(
    transcript_class = dplyr::case_when(
      type == "UTR" & grepl("ENST", transcript) ~ "GENCODE",
      type == "UTR" & grepl("ORF", transcript) ~ "ONT",
      type == "UTR" & grepl("TRANSCRIPT", transcript) ~ "NIC",
      TRUE ~ transcript_class
    )
  )
```

# Create the transcript plot
```{r}

# Transcript structure panel: one row per transcript, CDS drawn at full
# height, UTRs at reduced height, introns as lines carrying strand arrows.
transcript_plot <- goi_cds_utr_rescaled %>%
  dplyr::filter(type == "CDS") %>%
  ggplot(aes(
    xstart = start,
    xend = end,
    y = transcript
  )) +
  # CDS ranges with the default fill; the coloured CDS layer below is drawn
  # on top of them
  geom_range() +
  # UTRs: thinner boxes, coloured by transcript class
  geom_range(
    data = goi_cds_utr_rescaled %>% dplyr::filter(type == "UTR"),
    height = 0.25,
    aes(fill = transcript_class)
  ) +
  # CDS: coloured by transcript class
  geom_range(
    data = goi_cds_utr_rescaled %>% dplyr::filter(type == "CDS"),
    aes(fill = transcript_class)
  ) +
  # Introns are recomputed from the rescaled non-intron ranges
  geom_intron(
    data = to_intron(
      goi_cds_utr_rescaled %>%
        dplyr::filter(type != "intron"),
      "transcript"
    ),
    arrow.min.intron.length = 200,
    aes(strand = strand)
  ) +
  # Named values tie each colour to its class. The previous unnamed vector was
  # matched by level position, so the colours shifted whenever the set of
  # classes present in the data changed. The mapping is the same one used by
  # the TPM panel. Classes not listed here get ggplot2's NA colour.
  scale_fill_manual(
    values = c(
      "GENCODE" = "#07004D",
      "NIC" = "#E76F51",
      "NNIC" = "#E76F51",
      "ONT" = "#567568"
    )
  )

theme_set(formatting)

# Invert the x-axis direction for IMPDH1.
# NOTE(review): the upper limit of 5000 is hard-coded in rescaled coordinates.
if (gene_of_interest == "IMPDH1") {
  transcript_plot <- transcript_plot + coord_cartesian(xlim = c(5000, 0))
}


```


# Add TPM column to the dataframe
```{r}
# Per-sample TPM table from IsoQuant: first column is the transcript,
# followed by three sample columns.
counts <- read.table(
  "../data/transcriptomics/isoquant/isoquant_aln.sorted.transcript_model_grouped_tpm.tsv",
  header = FALSE,
  sep = "\t"
)
colnames(counts) <- c("transcript", "sample1", "sample2", "sample3")

# Upper-case the transcript names before joining on them
counts$transcript <- toupper(counts$transcript)

# Columns kept after the join
selected_columns <- c("transcript", "sample1", "sample2", "sample3")

# Attach the sample values to the plotted ranges, keep one row per distinct
# transcript/value combination, and keep rows with at least one non-missing
# value in any column (the transcript column included).
cpm <- goi_cds_utr_rescaled %>%
  dplyr::left_join(counts, by = "transcript") %>%
  dplyr::select(dplyr::all_of(selected_columns)) %>%
  dplyr::distinct() %>%
  dplyr::filter(rowSums(!is.na(.)) > 0)

# Class label derived from the transcript name. Precedence when several
# patterns match: ENST, then ONT, then NNIC, then NIC; names matching none of
# them stay NA.
cpm <- cpm %>%
  dplyr::mutate(
    transcript_class = dplyr::case_when(
      grepl("ENST", transcript) ~ "GENCODE",
      grepl("ONT", transcript) ~ "ONT",
      grepl("NNIC", transcript) ~ "NNIC",
      grepl("NIC", transcript) ~ "NIC"
    )
  )
```

# Create the CPM jitter plot
```{r}
# Reshape to one row per transcript and sample. pivot_longer() replaces the
# superseded gather(); it returns the same rows in a different order, which
# the jittered points below do not depend on.
long_data <- cpm %>%
  tidyr::pivot_longer(
    cols = c(sample1, sample2, sample3),
    names_to = "sample",
    values_to = "value"
  )

# One jittered point per transcript and sample, coloured by transcript class
cpm_plot <- ggplot() +
  geom_jitter(
    data = long_data,
    aes(x = value, y = transcript, color = transcript_class),
    size = 4,
    alpha = 0.5
  ) +
  scale_color_manual(
    values = c(
      "GENCODE" = "#07004D",
      "NIC" = "#E76F51",
      "NNIC" = "#E76F51",
      "ONT" = "#567568"
    )
  )

theme_set(formatting)
```

# Combine the two plots
```{r}
# Place the transcript panel and the TPM panel side by side. The TPM panel
# drops its y-axis text, ticks and both axis titles.
plot <- ggarrange(
  transcript_plot,
  cpm_plot + theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank(),
    axis.title.x = element_blank()
  ),
  ncol = 2, nrow = 1, widths = c(1, 0.2)
)

# Figure height in inches per gene. A gene without a defined height stops
# with a clear message instead of reaching ggsave() with `h` undefined.
h <- switch(gene_of_interest,
  "SAMD11" = ,
  "IMPDH1" = 3.75,
  "SLC24A1" = 2,
  stop(
    "No figure height defined for gene '", gene_of_interest, "'",
    call. = FALSE
  )
)

# Save the plot; the output directory is created when it does not exist yet
ggsave_path <- paste0("plots/", gene_of_interest, "_cpm.svg")
dir.create(dirname(ggsave_path), showWarnings = FALSE, recursive = TRUE)
ggsave(
  filename = ggsave_path,
  plot = plot,
  width = 7.25,
  height = h,
  units = "in",
  dpi = 300
)
plot
```