1_Data_import_exploration.Rmd

---
title: "Compound Discoverer data analysis"
author: "Christian Ayala"
output:
  html_document:
    df_print: paged
  html_notebook: default
  pdf_document: default
editor_options:
  chunk_output_type: console
---

# 1. Importing Libraries

```{r libraries, message=FALSE, warning=FALSE}
library(tidyverse)
library(readxl)
library(ggpubr)
library(ggsci)
library(ggpolypath)
library(venn)
source('functions_cdis_exploration.R')
```

# 2. Import data

```{r set_path, message=FALSE}
# The project root is the current working directory; every input and output
# path below is resolved relative to it
project_dir <- getwd()

# Project label: it prefixes the output directories so that all results of one
# project end up grouped together
project_name <- 'Bog_1e5_label'

# Output locations for figures and tables
figures_dir <- file.path(project_dir, paste0(project_name, '_output_figures'))
tables_dir <- file.path(project_dir, paste0(project_name, '_output_tables'))

# Create both output directories; warnings (for example when a directory
# already exists) are silenced
for (output_dir in c(figures_dir, tables_dir)) {
  dir.create(output_dir, showWarnings = FALSE)
}
```

Set whether the data to be analyzed is labeled or unlabeled

```{r}
# Flag for labeled / unlabeled data: set to TRUE for a labeled experiment or
# FALSE for an unlabeled one. Later chunks test this flag: the labeling status
# table is only built when it is TRUE, and gap filtering only runs when it is
# FALSE.

label <- TRUE
```


The data used in this script is the **Compounds table** exported from **Compound Discoverer**. For this script to work properly, please add the **Area** and **Gap Status** columns to this table.
The **metadata table** can be exported directly from the **Samples tab** in **Compound Discoverer** by copying to a blank text file.


```{r import_data, message=FALSE}
# Import the Compounds table exported from Compound Discoverer
cd_results_file <- file.path(project_dir, 'data', 'Bog_lbl_unlbl_1e5_formula.xlsx')
cd_results_table <- read_xlsx(cd_results_file)

# Assign feature identifiers and keep only the columns needed downstream

cd_results_table <- cd_results_table %>%
  # Sort by decreasing molecular weight so that the numbering follows mass
  arrange(desc(`Molecular Weight`)) %>%
  # Number the features from n() down to 1 (the heaviest feature gets the
  # highest number), zero-padded to a width of four. rev(seq_len(n())) is the
  # safe spelling of n():1, which would produce c(0, 1) for an empty table
  mutate(FeatureID = paste0('Feature',
                            formatC(rev(seq_len(n())), width = 4, flag = '0'))) %>%
  select(FeatureID, Name, Formula, `Molecular Weight`,
         contains('Annotation source'), contains('Results'), contains('Pathways'),
         contains('Area:'), contains('Gap Status:'), contains('Labeling Status:')) %>%
  # Count how many features share each name (stored temporarily in column `n`)
  add_count(Name) %>%
  group_by(Name) %>%
  # Create variable with names for plotting (useful in following scripts):
  #  - unnamed features fall back to their FeatureID
  #  - names used by a single feature are kept as they are
  #  - names shared by several features get a "-peak#" suffix, numbered within
  #    the name group from the group size down to 1
  mutate(name4plot = ifelse(is.na(Name),
                            FeatureID,
                            ifelse(n == 1,
                                   Name,
                                   paste0(Name, '-peak', rev(seq_len(n())))))) %>%
  # The helper count column is no longer needed
  select(-n) %>%
  ungroup()

# Import the sample metadata, clean the sample identifiers and keep only the
# useful columns under shorter names
metadata_file <- file.path(project_dir, 'data', 'metadata.tsv')

metadata <- read_tsv(metadata_file) %>%
  # Drop the 'Saleska_' prefix and everything from '_23' onwards so that the
  # identifiers match the sample names derived from the Area columns
  mutate(`Sample Identifier` = `Sample Identifier` %>%
           str_remove('Saleska_') %>%
           str_remove('_23.*')) %>%
  select(-Error, -Sample, -File) %>%
  rename(SampleID = `Sample Identifier`, Label = `Sample Type`)

# Save the cleaned metadata next to the other output tables
table_file <- file.path(tables_dir, 'fixed_metadata.csv')
write_csv(metadata, table_file)
```

Get the label status of each feature (labeled analysis only)

```{r}
# Build and save the per-sample labeling status of every feature (labeled
# analysis only)
if (label == TRUE) {
  # One row per feature, one 'Labeling Status:' column per sample
  label_status <- select(cd_results_table, FeatureID, contains('Labeling Status:'))

  # Shorten the column names: drop the 'Saleska_' prefix and everything from
  # '_23' onwards
  names(label_status) <- names(label_status) %>%
    str_remove('Saleska_') %>%
    str_remove('_23.*')

  # Collapse the Compound Discoverer statuses into two categories. The
  # 'Unlabeled' replacement is applied first, then the 'Labeled' one
  is_unlabeled <- label_status == 'Contaminating mass' |
    label_status == 'Low pattern fit' |
    label_status == 'Not detected'
  label_status[is_unlabeled] <- 'Unlabeled'

  is_labeled <- label_status == 'No warnings' |
    label_status == 'Irregular exchange'
  label_status[is_labeled] <- 'Labeled'

  table_file <- file.path(tables_dir, 'label_status.csv')
  write_csv(label_status, table_file)
}
```



# 3. Data Manipulation and thermodynamic indices calculations

```{r data, message=FALSE, warning=FALSE}

# Enrich the results table step by step with the helpers sourced from
# functions_cdis_exploration.R:
#   1. split the formula column into elemental counts
#   2. calculate ratios and thermodynamic indices
#   3. calculate compound classes
cd_results_table <- cd_results_table %>%
  separate_formula() %>%
  calc_ratios_n_idxs() %>%
  calc_classes()

# Reshape to long format: one row per feature and sample holding the area
# under the curve (AUC). Rows whose AUC is not greater than zero (including
# missing values) are dropped, and the sample names are shortened by removing
# the 'Area: Saleska_' prefix and everything from '_23' onwards
compounds_table <- cd_results_table %>%
  select(-contains('Gap Status:'), -contains('Labeling Status:')) %>%
  gather(key = 'SampleID', value = 'AUC', contains('Area:')) %>%
  filter(AUC > 0) %>%
  mutate(SampleID = str_remove(str_remove(SampleID, 'Area: Saleska_'), '_23.*'))

# Save gap-filled table to be used in Statistical Analysis
table_file <- file.path(tables_dir, 'gap_filled_compounds_table.csv')
write_csv(compounds_table, table_file)

```

### Gap filtering (only for unlabeled data)

```{r}

# Gather "Gap Status" for filtering 
if (label == FALSE) {
  # Long table with one gap status per feature and sample; the sample names
  # are shortened the same way as for the AUC values so both tables can be
  # joined
  gap_status <- cd_results_table %>%
    select(FeatureID, contains('Gap Status:')) %>%
    gather(key = 'SampleID', value = 'gap_status', contains('Gap Status:')) %>%
    mutate(SampleID = str_remove(str_remove(SampleID, 'Gap Status: Saleska_'),
                                 '_23.*'))

  ## Filtering

  # Attach the gap status to every measurement and keep only the rows whose
  # status is known and different from 'Full gap'
  compounds_table <- compounds_table %>%
    left_join(gap_status, by = c('FeatureID', 'SampleID')) %>%
    filter(gap_status != 'Full gap')
}

```


``` {r}
# Attach the sample metadata to every measurement (matched on SampleID)
compounds_table <- compounds_table %>%
  left_join(metadata, by = 'SampleID')

# Save the annotated table
table_file <- file.path(tables_dir, 'compounds_table.csv')
write_csv(compounds_table, table_file)

```

# 4. Summary plots

## 4.1. Number of identified compounds

This figure shows the number of compounds that were detected in each file, and how many of them were assigned a structure through database searches

```{r n_identified_masses, message=FALSE}
# Count, for every sample, how many compounds have or lack an assigned
# structure. Compounds with a name are the ones that have an assigned
# structure. The counts are then joined with the sample metadata
summary_table <- compounds_table %>%
  select(SampleID, Name) %>%
  mutate(Name = ifelse(is.na(Name), 'No structure', 'With structure')) %>%
  group_by(SampleID) %>%
  count(Name) %>%
  rename(Status = Name) %>%
  left_join(metadata, by = c('SampleID')) %>%
  # Fix the order of the two categories for plotting
  mutate(Status = factor(Status, levels = c('No structure', 'With structure')))

# Column plot of the counts per sample, split by structure status
id_summary_plot <- plot_col(summary_table, SampleID, n, Status) +
  labs(title = 'Number of masses found', x = 'Sample name', y = 'Count') +
  theme(plot.title = element_text(face = 'bold', hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1))

id_summary_plot

# Save the figure
figure_file <- file.path(figures_dir, 'id_summary.png')
ggsave(filename = figure_file, plot = id_summary_plot, dpi = 300)

```

## 4.2. Venn Diagrams of number of detected compounds

Only by time

```{r venn_time, message=FALSE}

# Venn diagram of the compounds present in each time

# Returns a one-column tibble with the distinct FeatureIDs of the compounds
# detected at the given time point. `!!` injects the argument's value so the
# comparison can never be captured by a column of compounds_table
features_at_time <- function(time_point) {
  compounds_table %>%
    filter(Time == !!time_point) %>%
    select(FeatureID) %>%
    distinct()
}

T0_list <- features_at_time('T0')
T1_list <- features_at_time('T1')
T2_list <- features_at_time('T2')
T3_list <- features_at_time('T3')

# Named list of feature sets (one per time point) and the colour of each set
my_list <- list(T0 = T0_list$FeatureID,
                T1 = T1_list$FeatureID,
                T2 = T2_list$FeatureID,
                T3 = T3_list$FeatureID)

my_colors <- c('blue', 'yellow', 'gray', 'red')

# Draw the diagram into a PNG file. The device is closed in `finally` so that
# it does not stay open (and capture later plots) if plot_venn() fails
figure_file <- file.path(figures_dir, 'venn_time.png')
png(figure_file)
venn_time <- tryCatch(
  plot_venn(my_list, my_colors),
  finally = invisible(dev.off())
)

# Draw the diagram again on the active device so it also shows up in the
# console / rendered document
venn_time <- plot_venn(my_list, my_colors)
```

## 4.3 KEGG Pathways

```{r}
# Number of compounds assigned to each KEGG pathway per time point and label.
# A compound can list several pathways separated by ';', so every pathway is
# moved to its own row first; entries without a pathway are discarded
kegg <- compounds_table %>%
  select(colnames(metadata), `KEGG Pathways`) %>%
  separate_rows(`KEGG Pathways`, sep = ';') %>%
  drop_na(`KEGG Pathways`) %>%
  group_by(Time, Label) %>%
  count(`KEGG Pathways`)

# Dodged column plot of the counts coloured by time, one panel per label
kegg_plot <- plot_col(kegg, n, `KEGG Pathways`, Time, dodge = TRUE)
kegg_plot <- kegg_plot + facet_wrap(~ Label)
kegg_plot

# Save the figure
figure_file <- file.path(figures_dir, 'kegg_pathways.jpg')
ggsave(filename = figure_file, plot = kegg_plot, dpi = 300)

```

## 4.4 Metabolika Pathways

```{r}
# Number of compounds assigned to each Metabolika pathway per time point and
# label. A compound can list several pathways separated by ';', so every
# pathway is moved to its own row first; entries without a pathway are
# discarded. After counting, the first parenthesised part of each pathway
# name is removed to shorten the labels
metabolika <- compounds_table %>%
  select(colnames(metadata), `Metabolika Pathways`) %>%
  separate_rows(`Metabolika Pathways`, sep = ';') %>%
  drop_na(`Metabolika Pathways`) %>%
  group_by(Time, Label) %>%
  count(`Metabolika Pathways`) %>%
  mutate(`Metabolika Pathways` = str_remove(`Metabolika Pathways`, '\\(.*\\)'))

# Dodged column plot of the counts coloured by time, one panel per label
metabolika_plot <- plot_col(metabolika, n, `Metabolika Pathways`, Time, dodge = TRUE)
metabolika_plot <- metabolika_plot + facet_wrap(~ Label)
metabolika_plot

# Save the figure
figure_file <- file.path(figures_dir, 'metabolika_pathways.jpg')
ggsave(filename = figure_file, plot = metabolika_plot, dpi = 300)

```

# 5. Exploratory plots

## 5.1. Van Krevelen Diagrams

Van Krevelen diagrams faceted by different parameters


```{r vank_plot_time, message=FALSE}

# Van Krevelen diagram built from the compound classes and the time points
vank_time <- plot_vank(compounds_table, Class, Time)

# Show the diagram, then save it to the figures directory
vank_time

figure_file <- file.path(figures_dir, 'Venk_diagram_time.jpg')
ggsave(filename = figure_file, plot = vank_time, dpi = 300)

```

```{r vank_material, message=FALSE}

# Van Krevelen diagram built from the compound classes and the type of
# organic material
vank_material <- plot_vank(compounds_table, Class, Material)

# Show the diagram, then save it to the figures directory
vank_material

figure_file <- file.path(figures_dir, 'Venk_diagram_material.jpg')
ggsave(filename = figure_file, plot = vank_material, dpi = 300)

```

## 5.2. Gibbs Free Energy (GFE) exploratory plots

Boxplot of GFE at different timepoints

```{r gfe_box_time, message=FALSE}

# Boxplot of Gibbs Free Energy per time point, one panel per label.
# Every later time point is compared against the initial one (T0)
my_comparisons <- lapply(c('T1', 'T2', 'T3'), function(later_time) {
  c('T0', later_time)
})

gfe_time <- plot_boxplot(compounds_table, Time, GFE, Time, my_comparisons)
gfe_time <- gfe_time +
  labs(title = 'Gibbs Free Energy') +
  facet_wrap(~Label)
gfe_time

# Save the figure
figure_file <- file.path(figures_dir, 'gfe_time.png')
ggsave(filename = figure_file, plot = gfe_time, dpi = 300)
```

Density plot of Gibbs Free Energy per timepoint among labeled and unlabeled samples

```{r gfe_density, message=FALSE}
# Density plot of Gibbs Free Energy built from the Time and Label columns
gfe_density <- plot_density(compounds_table, GFE, Time, Time, Label)
gfe_density <- gfe_density + labs(title = 'Gibbs Free Energy')

gfe_density

# Save the figure
figure_file <- file.path(figures_dir, 'gfe_density.png')
ggsave(filename = figure_file, plot = gfe_density, dpi = 300)

```

## 5.3. Aromaticity Index (AI) exploratory plots

```{r AI_density, message=FALSE}

# Density plot of the Aromaticity Index built from the Time and Label columns
ai_density <- plot_density(compounds_table, AI, Time, Time, Label)
ai_density <- ai_density + labs(title = 'Aromaticity Index')

ai_density

# Save the figure
figure_file <- file.path(figures_dir, 'ai_density.png')
ggsave(filename = figure_file, plot = ai_density, dpi = 300)
```