This notebook looks at cell type annotations and gene set expression across all samples in SCPCP00015 and assigns “final” annotations. To do this we are using the merged, but not batch-corrected, object containing the gene expression data for all samples.

The following input is used:

Setup

suppressPackageStartupMessages({
  # load required packages
  library(SingleCellExperiment)
  library(ggplot2)
})

# Set default ggplot theme
theme_set(
  theme_classic()
)

# set seed
set.seed(2024)

# quiet messages
options(readr.show_col_types = FALSE)
ComplexHeatmap::ht_opt(message = FALSE)
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)

# The current data directory, found within the repository base directory
data_dir <- file.path(repository_base, "data", "current", "results", "merge-sce", "SCPCP000015")

# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-ewings") 
# path to sce 
sce_file <- file.path(data_dir, "SCPCP000015_merged.rds")

# path to workflow results
workflow_results_dir <- file.path(module_base, "results")

singler_results_dir <- file.path(workflow_results_dir, "aucell_singler_annotation")
singler_results_files <- list.files(singler_results_dir, pattern = "*singler-classifications.tsv", full.names = TRUE, recursive = TRUE)
library_ids <- stringr::str_remove(basename(singler_results_files), "_singler-classifications.tsv")


aucell_results_dir <- file.path(workflow_results_dir, "aucell-ews-signatures")
aucell_results_file <- file.path(aucell_results_dir, "SCPCP000015_auc-ews-gene-signatures.tsv")

consensus_results_dir <- file.path(repository_base, "analyses", "cell-type-consensus", "results", "cell-type-consensus", "SCPCP000015")
consensus_results_files <- list.files(consensus_results_dir, pattern = "*_consensus-cell-type-assignments.tsv.gz", full.names = TRUE, recursive = TRUE)

# small gene sets
visser_marker_genes_file <- file.path(module_base, "references", "visser-all-marker-genes.tsv")
cell_state_genes_file <- file.path(module_base, "references", "tumor-cell-state-markers.tsv")

# marker genes to be used for validating assignments 
validation_markers_file <- file.path(module_base, "references", "combined-validation-markers.tsv")
# output file to save final annotations 
results_dir <- file.path(module_base, "results", "final-annotations")
fs::dir_create(results_dir)
output_file <- file.path(results_dir, "SCPCP000015_celltype-annotations.tsv.gz")
# source in setup functions prep_results()
setup_functions <- file.path(module_base, "template_notebooks", "utils", "setup-functions.R")
source(setup_functions)

# source in validation functions 
# calculate_mean_markers(), plot_faceted_umap()
validation_functions <- file.path(module_base, "scripts", "utils", "tumor-validation-helpers.R")
source(validation_functions)

# source in plotting functions 
# expression_umap(), cluster_density_plot(), and annotated_exp_heatmap()
plotting_functions <- file.path(module_base, "template_notebooks", "utils", "plotting-functions.R")
source(plotting_functions)

# source jaccard functions plot_jaccard()
jaccard_functions <- file.path(module_base, "scripts", "utils", "jaccard-functions.R")
source(jaccard_functions)
stopifnot(
  "sce file does not exist" = file.exists(sce_file),
  "aucell results file does not exist" = file.exists(aucell_results_file), 
  "at least one singler file is missing" = all(file.exists(singler_results_files)),
  "at least one consensus file is missing" = all(file.exists(consensus_results_files))
)
# read in sce
sce <- readr::read_rds(sce_file)

# read in workflow results
singler_df <- singler_results_files |> 
  purrr::set_names(library_ids) |>
  purrr::map(readr::read_tsv) |> 
  dplyr::bind_rows(.id = "library_id")

aucell_df <- readr::read_tsv(aucell_results_file) |> 
  tidyr::separate(barcodes, "-", into = c("library_id", "barcodes"))

# consensus cell types 
consensus_df <- consensus_results_files |> 
  purrr::map(readr::read_tsv) |> 
  dplyr::bind_rows()

# read in marker genes and combine into one list 
visser_markers_df <- readr::read_tsv(visser_marker_genes_file) |> 
  dplyr::select(cell_type, ensembl_gene_id, gene_symbol) |> 
  unique()
  
cell_state_markers_df <- readr::read_tsv(cell_state_genes_file) |> 
  dplyr::select(cell_type = cell_state, ensembl_gene_id, gene_symbol)

all_markers_df <- dplyr::bind_rows(list(visser_markers_df, cell_state_markers_df))

Prepare data for plotting

all_results_df <- prep_results(
  sce, 
  singler_df = singler_df, 
  cluster_df = NULL, 
  aucell_df = aucell_df,
  consensus_df = consensus_df,
  cluster_nn = params$cluster_nn,
  cluster_res = params$cluster_res,
  join_columns = c("barcodes", "library_id")
  ) |>
  dplyr::mutate(barcodes = glue::glue("{library_id}-{barcodes}"))
  
cell_types <- unique(all_markers_df$cell_type)

# get the mean expression of all genes for each cell state
gene_exp_df <- cell_types |>
  purrr::map(\(type){
    calculate_mean_markers(all_markers_df, sce, type, cell_type)
  }) |>
  purrr::reduce(dplyr::inner_join, by = "barcodes")

all_info_df <- all_results_df |> 
  dplyr::left_join(gene_exp_df, by = "barcodes") |> 
  dplyr::mutate(
    singler_annotation = dplyr::if_else(is.na(singler_annotation), "unknown", singler_annotation)
  )

Combine classifications from SingleR and consensus cell types

The first thing we will do is compare cell type annotations obtained by each method, SingleR with tumor cells as a reference (output by aucell-singler-annotation.sh) and consensus annotations (output from assign-consensus-celltypes.sh in cell-type-consensus.

Let’s see how similar these annotations are and see if we can create a combined annotation. Note that SingleR will label tumor cells but the consensus cell types only label normal cells. Consensus cell types are observed when cell types from SingleR and CellAssign (using only normal tissue references) share a common ancestor. If no consensus is found, the cells are labeled with “Unknown”.

Any cells that are unable to be labeled via consensus cell types are labeled as “Unknown” and I expect these will line up with cells labeled as tumor cells by SingleR.

# get cell types that have at least 50 cells
singler_to_keep <- all_info_df |> 
  dplyr::count(singler_annotation) |>
  dplyr::filter(n >= 50) |> 
  dplyr::pull(singler_annotation)

# first get jaccard index between consensus and singler
jaccard_mtx <- all_info_df |> 
  dplyr::filter(singler_annotation %in% singler_to_keep) |> 
  make_jaccard_matrix(
    "consensus_annotation",
    "singler_annotation"
  )

ComplexHeatmap::Heatmap(
  t(jaccard_mtx),
  col = circlize::colorRamp2(c(0, 1), colors = c("white", "darkslateblue")),
  border = TRUE,
  ## Row parameters
  cluster_rows = TRUE,
  row_title = "SingleR with tumor reference",
  row_title_gp = grid::gpar(fontsize = 12),
  row_title_side = "left",
  row_names_side = "left",
  row_dend_side = "right",
  row_names_gp = grid::gpar(fontsize = 12),
  ## Column parameters
  cluster_columns = TRUE,
  column_title = "Consensus",
  column_title_gp = grid::gpar(fontsize = 12),
  column_names_side = "bottom",
  column_names_gp = grid::gpar(fontsize = 12),
  column_names_rot = 90,
  ## Legend parameters
  heatmap_legend_param = list(
    title = "Jaccard index",
    direction = "vertical",
    legend_width = unit(1.5, "in")
  ))

It looks like generally these two cell type annotations are in agreement. We also see that most of the “Unknown” cells as denoted by consensus annotations are in fact lining up with tumor cells. It looks like some tumor cells labeled by SingleR are labeled as “smooth muscle cells” in the consensus cell types, so that’s something to keep in mind.

Let’s create a combined annotation that labels cells as tumor if the cell type label is tumor in SingleR and Unknown in consensus labels. All other labels will be taken from the consensus cell types.

# classify based on combined consensus and singler annotation 
all_info_df <- all_info_df |> 
  dplyr::mutate(combined_annotation = dplyr::if_else(
    consensus_annotation == "Unknown" & singler_annotation == "tumor",
    "tumor",
    consensus_annotation
  ),
  # lump for plotting later 
  combined_lumped = combined_annotation |>
    forcats::fct_lump_n(9, other_level = "All remaining cell types", ties.method = "first") |>
    forcats::fct_infreq() |> 
    forcats::fct_relevel("All remaining cell types", after = Inf)
  )

all_info_df |> 
  dplyr::count(combined_annotation) |> 
  dplyr::arrange(desc(n))

Use AUCell and mean gene set scores to refine tumor cells and identify tumor cell states

We used AUCell with gene sets that we expect to be up and down regulated in tumor cells. We can look at the AUC values for these gene sets across all the cell types and see if there are any cells that should be classified as tumor that are not. Additionally, we have gene sets that can help us distinguish EWS high and EWS low tumor cells.

First we’ll look at the mean AUC value for each gene set across all cell types.

Below is some information for the gene sets:

# reformat auc data for plots
auc_df <- all_info_df |> 
  tidyr::pivot_longer(starts_with("auc_"), names_to = "geneset", values_to = "auc") |> 
  dplyr::mutate(
    geneset = stringr::str_remove(geneset, "auc_")
  ) 

# create mtx with mean auc for heatmap 
auc_mtx <- auc_df |> 
  dplyr::group_by(combined_annotation, geneset) |> 
  dplyr::summarize(
    mean_auc = mean(auc)
  ) |> 
  tidyr::pivot_wider(names_from = geneset, 
                     values_from = mean_auc) |> 
  tidyr::drop_na() |> 
  tibble::column_to_rownames("combined_annotation") |>
  as.matrix()
`summarise()` has grouped output by 'combined_annotation'. You can override using the `.groups` argument.
ComplexHeatmap::Heatmap(
    auc_mtx,
    # set the color scale based on min and max values
    col = circlize::colorRamp2(seq(min(auc_mtx), max(auc_mtx), length = 2), colors = c("white", "#00274C")),
    border = TRUE,
    ## Row parameters
    cluster_rows = TRUE,
    row_title = "Combined cell types",
    row_title_side = "left",
    row_names_side = "left",
    row_dend_side = "right",
    row_names_gp = grid::gpar(fontsize = 10),
    ## Column parameters
    column_title = "MSigDB gene set",
    cluster_columns = TRUE,
    show_column_names = TRUE,
    column_names_gp = grid::gpar(fontsize = 8),
    heatmap_legend_param = list(
      title = "Mean expression"
    )
  )

Because we have some normal cell types that are showing gene signatures expected to be specific to tumor cells, we should use the AUC values to help pull out any remaining cells that should be tumor cells. We can also use the up and down gene sets to distinguish EWS-high and EWS-low tumor cells.

Let’s pick some gene sets where we see variation across cell types that we can use to define cutoffs:

genesets_of_interest <- c(
  "auc_aynaud-ews-targets",
  "auc_staege",
  "auc_wrenn-nt5e-genes",
  "auc_hallmark_EMT"
)

cluster_density_plot(all_info_df, genesets_of_interest, "combined_lumped", "AUC")

We can use these plots to help define cutoffs of AUC values to use for re-labeling any normal cells that should be tumor cells. We expect that the endothelial cells and macrophages have low AUC values so we can use these cell types as the background. Any AUC values above that background should be classified as tumor.

With this in mind, let’s use these cutoffs to define tumor cells:

Let’s re-plot the density plots adding in a line for the AUC cutoffs we have chosen.

cutoffs = c(
  "auc_aynaud-ews-targets" = 0.04,
  "auc_staege" = 0.01,
  "auc_wrenn-nt5e-genes" = 0.1,
  "auc_hallmark_EMT" = 0.05
)


cutoffs |>
    purrr::imap(\(cutoff, column){
      plot_density(
        all_info_df,
        column,
        "combined_lumped"
      ) +
        theme(text = element_text(size = 8)) +
        geom_vline(xintercept = cutoff)
    }) |>
    patchwork::wrap_plots(ncol = 2)

I don’t think these cutoffs are perfect, but I think they are reasonable. Any cells that are to the right of these cutoffs will be classified as tumor.

We can also use these to further stratify tumor cells into EWS high and EWS low classes.

# top cell type order 
density_plot_order <- c(
  "tumor EWS-high",
  "tumor EWS-low", 
  "Unknown",
  "smooth muscle cell",
  "endothelial cell",
  "macrophage",
  "fibroblast",
  "muscle cell",
  "mature T cell",
  "All remaining cell types"
)

# define "final.final" cell types 
all_info_df <- all_info_df |> 
  dplyr::mutate(
    final_annotation = dplyr::case_when(
      `auc_wrenn-nt5e-genes` > 0.1 & auc_hallmark_EMT > 0.05 ~ "tumor EWS-low",
      `auc_aynaud-ews-targets` > 0.04 & auc_staege > 0.01 ~ "tumor EWS-high",
      .default = consensus_annotation
    ),
    # lump together for easier plotting
    final_lumped = final_annotation |>
      forcats::fct_lump_n(9, other_level = "All remaining cell types", ties.method = "first") |>
      forcats::fct_infreq() |>
      forcats::fct_relevel(density_plot_order)
  )

all_info_df |> 
  dplyr::count(final_annotation) |> 
  dplyr::arrange(desc(n))

Let’s remake the density plot to see if we have pulled out the tumor cells from the normal cell types like muscle cells.

cluster_density_plot(all_info_df, genesets_of_interest, "final_lumped", "AUC")

It looks like generally the tumor EWS-high cells show high expression of both upregulated gene sets and tumor EWS-low cells show high expression of both downregulated gene sets, so I feel good about this!

I will also note that there are some muscle cells that show slight expression of aynaud-ews-targets, but show little to no expression of staege targets. I’m still a little worried these are actually tumor cells, but because they only show expression of one of the genesets, we are probably okay to keep them as is noting that the expression profile of muscle cells is very similar to those of tumor cells, this being sarcoma.

Identifying proliferative cells

The last tumor cell state that we want to identify is if any cells can be considered proliferative using the set of proliferative markers in tumor-cell-state-markers.tsv. Let’s look at the mean expression of these markers across all cells. Since we are plotting the proliferative markers, let’s also look at our other custom marker gene sets.

# get columns with mean expression of custom gene sets 
mean_exp_columns <- colnames(all_info_df)[which(endsWith(colnames(all_info_df), "_mean"))]
# make sure order is defined by cell type order
all_info_df <- all_info_df |> 
  dplyr::arrange(final_lumped)

single_annotation_heatmap(
  all_info_df, 
  exp_columns = mean_exp_columns, 
  cell_type_column = "final_lumped", 
  legend_title = "AUC"
)

It looks like we do have a set of cells that are tumor EWS-high and also have high expression of proliferative markers. Because of this, let’s label any cells that are tumor EWS-high and have mean expression of proliferative markers > 0 as tumor EWS-high proliferative.

# set desired order for plotting final annotations later
cell_type_order <- c(
  "tumor EWS-high",
  "tumor EWS-high proliferative",
  "tumor EWS-low",
  "muscle cell", 
  "smooth muscle cell",
  "fibroblast",
  "endothelial cell",
  "macrophage",
  "mature T cell",
  "Unknown",
  "All remaining cell types"
)

all_info_df <- all_info_df |> 
  dplyr::mutate(
    final_annotation = dplyr::if_else(
      final_annotation == "tumor EWS-high" & proliferative_mean > 0,
      "tumor EWS-high proliferative",
      final_annotation
    ), 
    final_lumped = final_annotation |>
      forcats::fct_lump_n(10, other_level = "All remaining cell types", ties.method = "first") |>
      forcats::fct_infreq() |>
      forcats::fct_relevel(cell_type_order)
  )

# first get tally for how many libraries each cell type is present in 
total_libraries <- all_info_df |> 
  dplyr::select(final_annotation, library_id) |> 
  unique() |> # get rid of duplicate cells
  dplyr::group_by(final_annotation) |> 
  dplyr::summarize(
    total_libraries = length(library_id)) |> 
  dplyr::arrange(desc(total_libraries))

# print out both total cells and total libraries 
all_info_df |> 
  dplyr::count(final_annotation, name = "total_cells") |>
  dplyr::left_join(total_libraries, by = "final_annotation") |> 
  dplyr::arrange(desc(total_cells))

For plotting purposes, we will only show the top 10 cell types with muscle cell being the last cell type. It looks like all other cell types are only present in 1-3 libraries. For any cells where there’s only 1 cell we may want to confirm that those cells express the expected targets, but that is outside the scope of this notebook.

Validation of cell type assignments

In the following section we’ll show some plots to help validate that we are content with our cell type assignments. These plots will look specifically at expression of a set of key marker genes that we expect to up in the assigned cell types. These markers are found in references/combined-markers.tsv and include a smaller set of markers for each of the cell types we have identified.

# read in file with validation markers 
validation_markers_df <- readr::read_tsv(validation_markers_file)

# pull out list of genes 
genes <- validation_markers_df |>
  dplyr::pull(ensembl_gene_id)

# get individual gene counts for all marker genes 
gene_cts <- logcounts(sce[genes, ]) |>
  as.matrix() |>
  t() |> 
  as.data.frame()
gene_cts$barcodes <- rownames(gene_cts)

# get all unique cell types from the final lumped group 
celltypes_df <- all_info_df |> 
  dplyr::select(barcodes, final_lumped)

# create a df that has gene expression column and column indicating whether or not the gene is detected in that cell 
genes_df <- gene_cts |> 
  tidyr::pivot_longer(!barcodes, names_to = "ensembl_gene_id", values_to = "gene_exp") |> 
  dplyr::left_join(celltypes_df, by = c("barcodes")) |> 
  dplyr::left_join(validation_markers_df, by = c("ensembl_gene_id")) |> 
  dplyr::rowwise() |> 
  dplyr::mutate(
    detected = gene_exp > 0 # column indicating if gene is present or not
  )

# get total number of cells per final annotation group 
total_cells_df <- genes_df |> 
  dplyr::select(barcodes, final_lumped) |> 
  unique() |> 
  dplyr::count(final_lumped, name = "total_cells")

# get total number of cells each gene is detected in per group 
# and mean gene expression per group 
group_stats_df <- genes_df |> 
  dplyr::group_by(final_lumped, ensembl_gene_id) |>
  dplyr::summarize(
    detected_count = sum(detected),
    mean_exp = mean(gene_exp)
  )
`summarise()` has grouped output by 'final_lumped'. You can override using the `.groups` argument.
gene_summary_df <- genes_df |> 
  # add total cells
  dplyr::left_join(total_cells_df, by = c("final_lumped")) |> 
  # add per gene stats
  dplyr::left_join(group_stats_df, by = c("ensembl_gene_id", "final_lumped")) |> 
  dplyr::select(gene_symbol, final_lumped, cell_type, total_cells, detected_count, mean_exp) |> 
  unique() |>
  dplyr::rowwise() |> 
  dplyr::mutate(
    # get total percent
    percent_exp = (detected_count/total_cells) * 100,
    # order genes based on cell type they indicate
    gene_symbol = factor(gene_symbol, levels = validation_markers_df$gene_symbol),
    final_lumped = forcats::fct_relevel(final_lumped, cell_type_order)
    
  ) 
# filter out low expressed genes
dotplot_df <- gene_summary_df |> 
  dplyr::filter(mean_exp > 0, percent_exp > 10) |>
  dplyr::arrange(final_lumped) |> 
  dplyr::mutate(y_label = as.factor(glue::glue("{final_lumped} ({total_cells})")))


dotplot <- ggplot(dotplot_df, aes(y = forcats::fct_rev(y_label), x = gene_symbol, color = mean_exp, size = percent_exp)) +
  geom_point() +
  scale_color_viridis_c(option = "magma", limits = c(0,2.5), oob = scales::squish) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 0.5)
  ) +
  labs(
    x = "",
    y = "Final cell type annotation"
  )

# specify legend order 
color_order <- dotplot_df |> 
  dplyr::pull(cell_type) |> 
  unique()

color_bar <- ggplot(dotplot_df, aes(x = gene_symbol, y = 1, fill = cell_type)) + 
  geom_tile() + 
  scale_fill_brewer(palette = 'Set1', breaks = color_order) +
  ggmap::theme_nothing() +
  theme(legend.position = "bottom") +
  labs(fill = "")

dotplot + color_bar +
  patchwork::plot_layout(ncol = 1, heights = c(4, 0.1)) 

A few notes from this plot:

Let’s make a heatmap version of the same plot.

# note that we can't use the same heatmap function as before since we are looking at cell types as rows rather than gene sets
# we also want to specify the annotation name

# first make a mtx to use for the heatmap with rows as cell types and genes as columns 
heatmap_mtx <- gene_summary_df |> 
  dplyr::select(gene_symbol, final_lumped, mean_exp) |> 
  tidyr::pivot_wider(
    names_from = gene_symbol,
    values_from = mean_exp
  ) |> 
  tibble::column_to_rownames("final_lumped") |>
  as.matrix()
# make sure cell types are present in the right order
heatmap_mtx <- heatmap_mtx[cell_type_order,]

# get annotation colors for each of the marker gene types 
marker_gene_categories <- unique(validation_markers_df$cell_type)
num_categories <- length(marker_gene_categories)
category_colors <- palette.colors(palette = "Dark2") |>
  head(n = num_categories) |>
  purrr::set_names(marker_gene_categories)


# create annotation for heatmap
annotation <- ComplexHeatmap::columnAnnotation(
  category = validation_markers_df$cell_type,
  col = list(
    category = category_colors
  ),
  annotation_legend_param = list(
      title = "Marker gene category",
      labels = unique(validation_markers_df$cell_type)
    )
)

ComplexHeatmap::Heatmap(
    heatmap_mtx,
    # set the color scale based on min and max values
    col = circlize::colorRamp2(seq(min(heatmap_mtx), max(heatmap_mtx), length = 2), colors = c("white", "#00274C")),
    border = TRUE,
    ## Row parameters
    cluster_rows = FALSE,
    row_title = "",
    row_title_side = "left",
    row_names_side = "left",
    row_dend_side = "right",
    row_names_gp = grid::gpar(fontsize = 10),
    ## Column parameters
    cluster_columns = FALSE,
    show_column_names = TRUE,
    column_names_gp = grid::gpar(fontsize = 8),
    top_annotation = annotation,
    heatmap_legend_param = list(
      title = "Mean expression"
    )
  )

And finally we’ll look at our annotations on a UMAP! Because, why not.

ggplot(all_info_df, aes(x = UMAP1, y = UMAP2, color = final_lumped)) +
  geom_point(alpha = 0.5, size = 0.01) +
  # set color for all remaining cell types since there are more colors than in the palette
  scale_color_manual(values = c(palette.colors(palette = "Dark2"), "black", "grey65", "grey90")) +
  labs(color = "") +
  guides(color = guide_legend(override.aes = list(alpha = 1, size = 1.5)))

plot_faceted_umap(all_info_df, final_lumped, legend_title = "Final cell type annotations") +
  theme(strip.text = element_text(size = 6))

Export cell types

Now that we have our cell types let’s export them to a TSV to save for future use!

annotation_df <- all_info_df |> 
  dplyr::mutate(
    # remove library id from barcodes since we have a library id column already 
    barcodes = stringr::word(barcodes, -1, sep = "-"),
    # assign ontology ID using consensus ontology ID
    # anything without an ID is either unknown or tumor 
    final_ontology = dplyr::if_else(
      final_annotation == consensus_annotation,
      consensus_ontology,
      final_annotation
    )
  ) |> 
  dplyr::select(
    barcodes,
    library_id,
    sample_id,
    sample_type,
    singler_ontology,
    singler_annotation,
    consensus_annotation,
    consensus_ontology,
    final_annotation,
    final_ontology
  )

readr::write_tsv(annotation_df, output_file)

Session info

# record the versions of the packages used in this analysis and other environment information
sessionInfo()
R version 4.4.2 (2024-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Sequoia 15.3

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/Chicago
tzcode source: internal

attached base packages:
[1] stats4    stats     graphics  grDevices datasets  utils     methods   base     

other attached packages:
 [1] ggplot2_3.5.1               SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0 Biobase_2.64.0             
 [5] GenomicRanges_1.56.1        GenomeInfoDb_1.40.1         IRanges_2.38.1              S4Vectors_0.42.1           
 [9] BiocGenerics_0.50.0         MatrixGenerics_1.16.0       matrixStats_1.3.0          

loaded via a namespace (and not attached):
 [1] bitops_1.0-8              rlang_1.1.4               magrittr_2.0.3            clue_0.3-65              
 [5] GetoptLong_1.0.5          compiler_4.4.2            DelayedMatrixStats_1.26.0 png_0.1-8                
 [9] vctrs_0.6.5               stringr_1.5.1             pkgconfig_2.0.3           shape_1.4.6.1            
[13] crayon_1.5.3              fastmap_1.2.0             XVector_0.44.0            scuttle_1.14.0           
[17] labeling_0.4.3            utf8_1.2.4                rmarkdown_2.27            tzdb_0.4.0               
[21] UCSC.utils_1.0.0          purrr_1.0.2               bit_4.0.5                 xfun_0.46                
[25] zlibbioc_1.50.0           cachem_1.1.0              beachmat_2.20.0           jsonlite_1.8.8           
[29] highr_0.11                DelayedArray_0.30.1       BiocParallel_1.38.0       jpeg_0.1-10              
[33] parallel_4.4.2            cluster_2.1.6             R6_2.5.1                  bslib_0.8.0              
[37] stringi_1.8.4             RColorBrewer_1.1-3        jquerylib_0.1.4           Rcpp_1.0.14              
[41] iterators_1.0.14          knitr_1.48                readr_2.1.5               Matrix_1.7-0             
[45] tidyselect_1.2.1          abind_1.4-5               yaml_2.3.10               doParallel_1.0.17        
[49] codetools_0.2-20          plyr_1.8.9                lattice_0.22-6            tibble_3.2.1             
[53] withr_3.0.0               evaluate_0.24.0           circlize_0.4.16           pillar_1.9.0             
[57] BiocManager_1.30.23       renv_1.0.7                foreach_1.5.2             generics_0.1.3           
[61] vroom_1.6.5               rprojroot_2.0.4           hms_1.1.3                 sparseMatrixStats_1.16.0 
[65] munsell_0.5.1             scales_1.3.0              glue_1.7.0                tools_4.4.2              
[69] forcats_1.0.0             fs_1.6.4                  grid_4.4.2                tidyr_1.3.1              
[73] colorspace_2.1-1          GenomeInfoDbData_1.2.12   patchwork_1.2.0           cli_3.6.3                
[77] ggmap_4.0.0               fansi_1.0.6               S4Arrays_1.4.1            viridisLite_0.4.2        
[81] ComplexHeatmap_2.20.0     dplyr_1.1.4               gtable_0.3.5              sass_0.4.9               
[85] digest_0.6.36             SparseArray_1.4.8         rjson_0.2.21              farver_2.1.2             
[89] htmltools_0.5.8.1         lifecycle_1.0.4           httr_1.4.7                GlobalOptions_0.1.2      
[93] bit64_4.0.5              
---
title: "Cell type assignments for SCPCP000015"
author: Ally Hawkins
date: "`r Sys.Date()`"
output:
  html_notebook:
    toc: true
    toc_depth: 3
    code_folding: "hide"
---

This notebook looks at cell type annotations and gene set expression across all samples in `SCPCP00015` and assigns "final" annotations. 
To do this we are using the merged, but not batch-corrected, object containing the gene expression data for all samples. 

The following input is used: 

- Annotations obtained by running `SingleR` with tumor cells as a reference output by [`aucell-singler-annotation.sh`](https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/main/analyses/cell-type-ewings/aucell-singler-annotation.sh). 
- Consensus cell type annotations output by running [`assign-consensus-celltypes.sh` in `cell-type-consensus`](https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/main/analyses/cell-type-consensus/assign-consensus-celltypes.sh). 
- AUC values as calculated by `AUCell` for a set of Ewing sarcoma specific gene sets in MSigDB output by `run-aucell-ews-signatures.sh`. 

## Setup

```{r packages}
suppressPackageStartupMessages({
  # load required packages
  library(SingleCellExperiment)
  library(ggplot2)
})

# Set default ggplot theme
theme_set(
  theme_classic()
)

# set seed
set.seed(2024)

# quiet messages
options(readr.show_col_types = FALSE)
ComplexHeatmap::ht_opt(message = FALSE)
```


```{r base paths}
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)

# The current data directory, found within the repository base directory
data_dir <- file.path(repository_base, "data", "current", "results", "merge-sce", "SCPCP000015")

# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-ewings") 
```

```{r}
# path to sce 
sce_file <- file.path(data_dir, "SCPCP000015_merged.rds")

# path to workflow results
workflow_results_dir <- file.path(module_base, "results")

singler_results_dir <- file.path(workflow_results_dir, "aucell_singler_annotation")
singler_results_files <- list.files(singler_results_dir, pattern = "*singler-classifications.tsv", full.names = TRUE, recursive = TRUE)
library_ids <- stringr::str_remove(basename(singler_results_files), "_singler-classifications.tsv")


aucell_results_dir <- file.path(workflow_results_dir, "aucell-ews-signatures")
aucell_results_file <- file.path(aucell_results_dir, "SCPCP000015_auc-ews-gene-signatures.tsv")

consensus_results_dir <- file.path(repository_base, "analyses", "cell-type-consensus", "results", "cell-type-consensus", "SCPCP000015")
consensus_results_files <- list.files(consensus_results_dir, pattern = "*_consensus-cell-type-assignments.tsv.gz", full.names = TRUE, recursive = TRUE)

# small gene sets
visser_marker_genes_file <- file.path(module_base, "references", "visser-all-marker-genes.tsv")
cell_state_genes_file <- file.path(module_base, "references", "tumor-cell-state-markers.tsv")

# marker genes to be used for validating assignments 
validation_markers_file <- file.path(module_base, "references", "combined-validation-markers.tsv")
```

```{r}
# output file to save final annotations 
results_dir <- file.path(module_base, "results", "final-annotations")
fs::dir_create(results_dir)
output_file <- file.path(results_dir, "SCPCP000015_celltype-annotations.tsv.gz")
```


```{r}
# source in setup functions prep_results()
setup_functions <- file.path(module_base, "template_notebooks", "utils", "setup-functions.R")
source(setup_functions)

# source in validation functions 
# calculate_mean_markers(), plot_faceted_umap()
validation_functions <- file.path(module_base, "scripts", "utils", "tumor-validation-helpers.R")
source(validation_functions)

# source in plotting functions 
# expression_umap(), cluster_density_plot(), and annotated_exp_heatmap()
plotting_functions <- file.path(module_base, "template_notebooks", "utils", "plotting-functions.R")
source(plotting_functions)

# source jaccard functions plot_jaccard()
jaccard_functions <- file.path(module_base, "scripts", "utils", "jaccard-functions.R")
source(jaccard_functions)
```

```{r}
stopifnot(
  "sce file does not exist" = file.exists(sce_file),
  "aucell results file does not exist" = file.exists(aucell_results_file), 
  "at least one singler file is missing" = all(file.exists(singler_results_files)),
  "at least one consensus file is missing" = all(file.exists(consensus_results_files))
)
```


```{r, message=FALSE}
# read in sce
sce <- readr::read_rds(sce_file)

# read in workflow results
singler_df <- singler_results_files |> 
  purrr::set_names(library_ids) |>
  purrr::map(readr::read_tsv) |> 
  dplyr::bind_rows(.id = "library_id")

aucell_df <- readr::read_tsv(aucell_results_file) |> 
  tidyr::separate(barcodes, "-", into = c("library_id", "barcodes"))

# consensus cell types 
consensus_df <- consensus_results_files |> 
  purrr::map(readr::read_tsv) |> 
  dplyr::bind_rows()

# read in marker genes and combine into one list 
visser_markers_df <- readr::read_tsv(visser_marker_genes_file) |> 
  dplyr::select(cell_type, ensembl_gene_id, gene_symbol) |> 
  unique()
  
cell_state_markers_df <- readr::read_tsv(cell_state_genes_file) |> 
  dplyr::select(cell_type = cell_state, ensembl_gene_id, gene_symbol)

all_markers_df <- dplyr::bind_rows(list(visser_markers_df, cell_state_markers_df))
```

## Prepare data for plotting

```{r}
all_results_df <- prep_results(
  sce, 
  singler_df = singler_df, 
  cluster_df = NULL, 
  aucell_df = aucell_df,
  consensus_df = consensus_df,
  cluster_nn = params$cluster_nn,
  cluster_res = params$cluster_res,
  join_columns = c("barcodes", "library_id")
  ) |>
  dplyr::mutate(barcodes = glue::glue("{library_id}-{barcodes}"))
  
cell_types <- unique(all_markers_df$cell_type)

# get the mean expression of all genes for each cell state
gene_exp_df <- cell_types |>
  purrr::map(\(type){
    calculate_mean_markers(all_markers_df, sce, type, cell_type)
  }) |>
  purrr::reduce(dplyr::inner_join, by = "barcodes")

all_info_df <- all_results_df |> 
  dplyr::left_join(gene_exp_df, by = "barcodes") |> 
  dplyr::mutate(
    singler_annotation = dplyr::if_else(is.na(singler_annotation), "unknown", singler_annotation)
  )
```

## Combine classifications from `SingleR` and consensus cell types 

The first thing we will do is compare cell type annotations obtained by each method, `SingleR` with tumor cells as a reference (output by [`aucell-singler-annotation.sh`](https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/main/analyses/cell-type-ewings/aucell-singler-annotation.sh)) and consensus annotations (output from [`assign-consensus-celltypes.sh` in `cell-type-consensus`](https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/main/analyses/cell-type-consensus/assign-consensus-celltypes.sh).  

Let's see how similar these annotations are and see if we can create a combined annotation. 
Note that `SingleR` will label tumor cells but the consensus cell types only label normal cells. 
Consensus cell types are observed when cell types from `SingleR` and `CellAssign` (using only normal tissue references) share a common ancestor. 
If no consensus is found, the cells are labeled with "Unknown". 

Any cells that are unable to be labeled via consensus cell types are labeled as "Unknown" and I expect these will line up with cells labeled as tumor cells by `SingleR`.  

```{r, fig.height=5}
# get cell types that have at least 50 cells
singler_to_keep <- all_info_df |> 
  dplyr::count(singler_annotation) |>
  dplyr::filter(n >= 50) |> 
  dplyr::pull(singler_annotation)

# first get jaccard index between consensus and singler
jaccard_mtx <- all_info_df |> 
  dplyr::filter(singler_annotation %in% singler_to_keep) |> 
  make_jaccard_matrix(
    "consensus_annotation",
    "singler_annotation"
  )

ComplexHeatmap::Heatmap(
  t(jaccard_mtx),
  col = circlize::colorRamp2(c(0, 1), colors = c("white", "darkslateblue")),
  border = TRUE,
  ## Row parameters
  cluster_rows = TRUE,
  row_title = "SingleR with tumor reference",
  row_title_gp = grid::gpar(fontsize = 12),
  row_title_side = "left",
  row_names_side = "left",
  row_dend_side = "right",
  row_names_gp = grid::gpar(fontsize = 12),
  ## Column parameters
  cluster_columns = TRUE,
  column_title = "Consensus",
  column_title_gp = grid::gpar(fontsize = 12),
  column_names_side = "bottom",
  column_names_gp = grid::gpar(fontsize = 12),
  column_names_rot = 90,
  ## Legend parameters
  heatmap_legend_param = list(
    title = "Jaccard index",
    direction = "vertical",
    legend_width = unit(1.5, "in")
  ))
```

It looks like generally these two cell type annotations are in agreement. 
We also see that most of the "Unknown" cells as denoted by consensus annotations are in fact lining up with tumor cells. It looks like some tumor cells labeled by `SingleR` are labeled as "smooth muscle cells" in the consensus cell types, so that's something to keep in mind. 

Let's create a combined annotation that labels cells as tumor if the cell type label is tumor in `SingleR` and Unknown in consensus labels. 
All other labels will be taken from the consensus cell types. 

```{r}
# classify based on combined consensus and singler annotation 
all_info_df <- all_info_df |> 
  dplyr::mutate(combined_annotation = dplyr::if_else(
    consensus_annotation == "Unknown" & singler_annotation == "tumor",
    "tumor",
    consensus_annotation
  ),
  # lump for plotting later 
  combined_lumped = combined_annotation |>
    forcats::fct_lump_n(9, other_level = "All remaining cell types", ties.method = "first") |>
    forcats::fct_infreq() |> 
    forcats::fct_relevel("All remaining cell types", after = Inf)
  )

all_info_df |> 
  dplyr::count(combined_annotation) |> 
  dplyr::arrange(desc(n))
```


## Use `AUCell` and mean gene set scores to refine tumor cells and identify tumor cell states 

We used `AUCell` with gene sets that we expect to be up and down regulated in tumor cells. 
We can look at the AUC values for these gene sets across all the cell types and see if there are any cells that should be classified as tumor that are not. 
Additionally, we have gene sets that can help us distinguish EWS high and EWS low tumor cells. 

First we'll look at the mean AUC value for each gene set across all cell types. 

Below is some information for the gene sets:  

- Any gene sets labeled with `up` represent `EWS-FLI1` target genes that we expect to be upregulated in `EWS-FLI1` high tumor cells. 
- Any gene sets labeled with `down` represent `EWS-FLI1` repressed genes that we expect to be downregulated in `EWS-FLI1` high tumor cells and upregulated in `EWS-FLI1` low tumor cells. 
- The `aynaud-ews-targets` is a list of genes found to be upregulated in `EWS-FLI1` high tumor cells. 
- The `wrenn-nt5e-genes` is a list of genes found to be upregulated in `EWS-FLI1` low tumor cells/ cancer associated fibroblasts.
- The `gobp_ECM` and `hallmark_EMT` are gene sets that are hypothesized to be upregulated in `EWS-FLI1` low tumor cells/ cancer associated fibroblasts.

```{r}
# reformat auc data for plots
auc_df <- all_info_df |> 
  tidyr::pivot_longer(starts_with("auc_"), names_to = "geneset", values_to = "auc") |> 
  dplyr::mutate(
    geneset = stringr::str_remove(geneset, "auc_")
  ) 

# create mtx with mean auc for heatmap 
auc_mtx <- auc_df |> 
  dplyr::group_by(combined_annotation, geneset) |> 
  dplyr::summarize(
    mean_auc = mean(auc)
  ) |> 
  tidyr::pivot_wider(names_from = geneset, 
                     values_from = mean_auc) |> 
  tidyr::drop_na() |> 
  tibble::column_to_rownames("combined_annotation") |>
  as.matrix()

ComplexHeatmap::Heatmap(
    auc_mtx,
    # set the color scale based on min and max values
    col = circlize::colorRamp2(seq(min(auc_mtx), max(auc_mtx), length = 2), colors = c("white", "#00274C")),
    border = TRUE,
    ## Row parameters
    cluster_rows = TRUE,
    row_title = "Combined cell types",
    row_title_side = "left",
    row_names_side = "left",
    row_dend_side = "right",
    row_names_gp = grid::gpar(fontsize = 10),
    ## Column parameters
    column_title = "MSigDB gene set",
    cluster_columns = TRUE,
    show_column_names = TRUE,
    column_names_gp = grid::gpar(fontsize = 8),
    heatmap_legend_param = list(
      title = "Mean expression"
    )
  )

```

- Here it looks like the tumor cells have high expression of `EWS-FLI1` upregulated gene sets and low expression of `EWS-FLI1` repressed targets. 
This suggests these cells are `EWS-high`. 
- This same pattern is seen in cells labeled as "smooth muscle cell" and "stromal cell". 
There may even be some slight expression of `EWS-FLI1` upregulated gene sets in "muscle cell", "Unknown", and some of the T cell populations. 
Some of these cells are probably mistakenly labeled and should be tumor cells that are "EWS-high".
- Fibroblasts and chondroctyes have low expression of `EWS-FLI1` upregulated gene sets and high expression of `EWS-FLI1` repressed targets. 
Many of the genes in the `EWS-FLI1` repressed targets are also genes upregulated in fibroblasts so I'm sure we have a mix of tumor and fibroblasts in there.

Because we have some normal cell types that are showing gene signatures expected to be specific to tumor cells, we should use the AUC values to help pull out any remaining cells that should be tumor cells. 
We can also use the up and down gene sets to distinguish EWS-high and EWS-low tumor cells. 

Let's pick some gene sets where we see variation across cell types that we can use to define cutoffs: 

- EWS-high gene sets: `aynaud-ews-targets` and `staege`
- EWS-low gene sets: `wrenn-nt5e-genes` and `hallmark_EMT`

```{r}
genesets_of_interest <- c(
  "auc_aynaud-ews-targets",
  "auc_staege",
  "auc_wrenn-nt5e-genes",
  "auc_hallmark_EMT"
)

cluster_density_plot(all_info_df, genesets_of_interest, "combined_lumped", "AUC")
```

We can use these plots to help define cutoffs of AUC values to use for re-labeling any normal cells that should be tumor cells. 
We expect that the endothelial cells and macrophages have low AUC values so we can use these cell types as the background. 
Any AUC values above that background should be classified as tumor. 

With this in mind, let's use these cutoffs to define tumor cells:

- `aynaud-ews-targets` > 0.04 AUC
- `staege` > 0.01 AUC
- `wrenn-nt5e-genes` > 0.1 AUC
- `hallmark_EMT` > 0.05 AUC 

Let's re-plot the density plots adding in a line for the AUC cutoffs we have chosen. 

```{r}
cutoffs = c(
  "auc_aynaud-ews-targets" = 0.04,
  "auc_staege" = 0.01,
  "auc_wrenn-nt5e-genes" = 0.1,
  "auc_hallmark_EMT" = 0.05
)


cutoffs |>
    purrr::imap(\(cutoff, column){
      plot_density(
        all_info_df,
        column,
        "combined_lumped"
      ) +
        theme(text = element_text(size = 8)) +
        geom_vline(xintercept = cutoff)
    }) |>
    patchwork::wrap_plots(ncol = 2)
```

I don't think these cutoffs are perfect, but I think they are reasonable. 
Any cells that are to the right of these cutoffs will be classified as tumor. 

We can also use these to further stratify tumor cells into EWS high and EWS low classes. 

- `tumor EWS-low`: High expression of `wrenn-nt5e-genes` (> 0.1 AUC) and EMT markers `hallmark_EMT` (> 0.05 AUC)
- `tumor EWS-high`: High expression of `aynaud-ews-targets` (>0.04 AUC) and `staege` (>0.01 AUC)

```{r}
# top cell type order 
density_plot_order <- c(
  "tumor EWS-high",
  "tumor EWS-low", 
  "Unknown",
  "smooth muscle cell",
  "endothelial cell",
  "macrophage",
  "fibroblast",
  "muscle cell",
  "mature T cell",
  "All remaining cell types"
)

# define "final.final" cell types 
all_info_df <- all_info_df |> 
  dplyr::mutate(
    final_annotation = dplyr::case_when(
      `auc_wrenn-nt5e-genes` > 0.1 & auc_hallmark_EMT > 0.05 ~ "tumor EWS-low",
      `auc_aynaud-ews-targets` > 0.04 & auc_staege > 0.01 ~ "tumor EWS-high",
      .default = consensus_annotation
    ),
    # lump together for easier plotting
    final_lumped = final_annotation |>
      forcats::fct_lump_n(9, other_level = "All remaining cell types", ties.method = "first") |>
      forcats::fct_infreq() |>
      forcats::fct_relevel(density_plot_order)
  )

all_info_df |> 
  dplyr::count(final_annotation) |> 
  dplyr::arrange(desc(n))
```

Let's remake the density plot to see if we have pulled out the tumor cells from the normal cell types like muscle cells. 

```{r}
cluster_density_plot(all_info_df, genesets_of_interest, "final_lumped", "AUC")
```

It looks like generally the `tumor EWS-high` cells show high expression of both upregulated gene sets and `tumor EWS-low` cells show high expression of both downregulated gene sets, so I feel good about this! 

I will also note that there are some muscle cells that show slight expression of `aynaud-ews-targets`, but show little to no expression of `staege` targets. 
I'm still a little worried these are actually tumor cells, but because they only show expression of one of the genesets, we are probably okay to keep them as is noting that the expression profile of muscle cells is very similar to those of tumor cells, this being sarcoma. 

## Identifying proliferative cells

The last tumor cell state that we want to identify is if any cells can be considered proliferative using the set of proliferative markers in `tumor-cell-state-markers.tsv`. 
Let's look at the mean expression of these markers across all cells. 
Since we are plotting the proliferative markers, let's also look at our other custom marker gene sets. 

```{r}
# get columns with mean expression of custom gene sets 
mean_exp_columns <- colnames(all_info_df)[which(endsWith(colnames(all_info_df), "_mean"))]
# make sure order is defined by cell type order
all_info_df <- all_info_df |> 
  dplyr::arrange(final_lumped)

single_annotation_heatmap(
  all_info_df, 
  exp_columns = mean_exp_columns, 
  cell_type_column = "final_lumped", 
  legend_title = "AUC"
)
```

It looks like we do have a set of cells that are `tumor EWS-high` and also have high expression of proliferative markers. 
Because of this, let's label any cells that are `tumor EWS-high` and have mean expression of proliferative markers > 0 as `tumor EWS-high proliferative`. 

```{r}
# set desired order for plotting final annotations later
cell_type_order <- c(
  "tumor EWS-high",
  "tumor EWS-high proliferative",
  "tumor EWS-low",
  "muscle cell", 
  "smooth muscle cell",
  "fibroblast",
  "endothelial cell",
  "macrophage",
  "mature T cell",
  "Unknown",
  "All remaining cell types"
)

all_info_df <- all_info_df |> 
  dplyr::mutate(
    final_annotation = dplyr::if_else(
      final_annotation == "tumor EWS-high" & proliferative_mean > 0,
      "tumor EWS-high proliferative",
      final_annotation
    ), 
    final_lumped = final_annotation |>
      forcats::fct_lump_n(10, other_level = "All remaining cell types", ties.method = "first") |>
      forcats::fct_infreq() |>
      forcats::fct_relevel(cell_type_order)
  )

# first get tally for how many libraries each cell type is present in 
total_libraries <- all_info_df |> 
  dplyr::select(final_annotation, library_id) |> 
  unique() |> # get rid of duplicate cells
  dplyr::group_by(final_annotation) |> 
  dplyr::summarize(
    total_libraries = length(library_id)) |> 
  dplyr::arrange(desc(total_libraries))

# print out both total cells and total libraries 
all_info_df |> 
  dplyr::count(final_annotation, name = "total_cells") |>
  dplyr::left_join(total_libraries, by = "final_annotation") |> 
  dplyr::arrange(desc(total_cells))
```


For plotting purposes, we will only show the top 10 cell types with muscle cell being the last cell type. 
It looks like all other cell types are only present in 1-3 libraries. 
For any cells where there's only 1 cell we may want to confirm that those cells express the expected targets, but that is outside the scope of this notebook. 

## Validation of cell type assignments

In the following section we'll show some plots to help validate that we are content with our cell type assignments. 
These plots will look specifically at expression of a set of key marker genes that we expect to up in the assigned cell types. 
These markers are found in `references/combined-markers.tsv` and include a smaller set of markers for each of the cell types we have identified. 

```{r}
# read in file with validation markers 
validation_markers_df <- readr::read_tsv(validation_markers_file)

# pull out list of genes 
genes <- validation_markers_df |>
  dplyr::pull(ensembl_gene_id)

# get individual gene counts for all marker genes 
gene_cts <- logcounts(sce[genes, ]) |>
  as.matrix() |>
  t() |> 
  as.data.frame()
gene_cts$barcodes <- rownames(gene_cts)

# get all unique cell types from the final lumped group 
celltypes_df <- all_info_df |> 
  dplyr::select(barcodes, final_lumped)

# create a df that has gene expression column and column indicating whether or not the gene is detected in that cell 
genes_df <- gene_cts |> 
  tidyr::pivot_longer(!barcodes, names_to = "ensembl_gene_id", values_to = "gene_exp") |> 
  dplyr::left_join(celltypes_df, by = c("barcodes")) |> 
  dplyr::left_join(validation_markers_df, by = c("ensembl_gene_id")) |> 
  dplyr::rowwise() |> 
  dplyr::mutate(
    detected = gene_exp > 0 # column indicating if gene is present or not
  )

# get total number of cells per final annotation group 
total_cells_df <- genes_df |> 
  dplyr::select(barcodes, final_lumped) |> 
  unique() |> 
  dplyr::count(final_lumped, name = "total_cells")

# get total number of cells each gene is detected in per group 
# and mean gene expression per group 
group_stats_df <- genes_df |> 
  dplyr::group_by(final_lumped, ensembl_gene_id) |>
  dplyr::summarize(
    detected_count = sum(detected),
    mean_exp = mean(gene_exp)
  )

gene_summary_df <- genes_df |> 
  # add total cells
  dplyr::left_join(total_cells_df, by = c("final_lumped")) |> 
  # add per gene stats
  dplyr::left_join(group_stats_df, by = c("ensembl_gene_id", "final_lumped")) |> 
  dplyr::select(gene_symbol, final_lumped, cell_type, total_cells, detected_count, mean_exp) |> 
  unique() |>
  dplyr::rowwise() |> 
  dplyr::mutate(
    # get total percent
    percent_exp = (detected_count/total_cells) * 100,
    # order genes based on cell type they indicate
    gene_symbol = factor(gene_symbol, levels = validation_markers_df$gene_symbol),
    final_lumped = forcats::fct_relevel(final_lumped, cell_type_order)
    
  ) 
```

```{r, fig.height=7, fig.width=10}
# filter out low expressed genes
dotplot_df <- gene_summary_df |> 
  dplyr::filter(mean_exp > 0, percent_exp > 10) |>
  dplyr::arrange(final_lumped) |> 
  dplyr::mutate(y_label = as.factor(glue::glue("{final_lumped} ({total_cells})")))


dotplot <- ggplot(dotplot_df, aes(y = forcats::fct_rev(y_label), x = gene_symbol, color = mean_exp, size = percent_exp)) +
  geom_point() +
  scale_color_viridis_c(option = "magma", limits = c(0,2.5), oob = scales::squish) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 0.5)
  ) +
  labs(
    x = "",
    y = "Final cell type annotation"
  )

# specify legend order 
color_order <- dotplot_df |> 
  dplyr::pull(cell_type) |> 
  unique()

color_bar <- ggplot(dotplot_df, aes(x = gene_symbol, y = 1, fill = cell_type)) + 
  geom_tile() + 
  scale_fill_brewer(palette = 'Set1', breaks = color_order) +
  ggmap::theme_nothing() +
  theme(legend.position = "bottom") +
  labs(fill = "")

dotplot + color_bar +
  patchwork::plot_layout(ncol = 1, heights = c(4, 0.1)) 
```

A few notes from this plot: 

- Generally tumor markers are present throughout, although they are highest in tumor cells. 
- EWS-high proliferative cells show strong expression of proliferation markers. 
- Tumor EWS-low cells show pretty strong expression of the `EWS-low` markers. 
These markers are also present in fibroblasts, but to a lesser degree. 
Additionally, `TNC`, which has shown to be secreted by Ewing cells and important in the metastatic phenotype of Ewing sarcoma (https://doi.org/10.1016/j.neo.2019.08.007), is specific to the low cells and not found in the fibroblasts, which makes me more confident that those are indeed tumor cells. 
- Endothelial cells show strong expression of `PECAM1` and `VWF`, while that is not seen in most of the other cells. 
- All immune cell types show `PTPRC` as expected with macrophages also showing `MRC1` and T cells showing expression of `CD3G`

Let's make a heatmap version of the same plot. 

```{r, fig.width=10}
# note that we can't use the same heatmap function as before since we are looking at cell types as rows rather than gene sets
# we also want to specify the annotation name

# first make a mtx to use for the heatmap with rows as cell types and genes as columns 
heatmap_mtx <- gene_summary_df |> 
  dplyr::select(gene_symbol, final_lumped, mean_exp) |> 
  tidyr::pivot_wider(
    names_from = gene_symbol,
    values_from = mean_exp
  ) |> 
  tibble::column_to_rownames("final_lumped") |>
  as.matrix()
# make sure cell types are present in the right order
heatmap_mtx <- heatmap_mtx[cell_type_order,]

# get annotation colors for each of the marker gene types 
marker_gene_categories <- unique(validation_markers_df$cell_type)
num_categories <- length(marker_gene_categories)
category_colors <- palette.colors(palette = "Dark2") |>
  head(n = num_categories) |>
  purrr::set_names(marker_gene_categories)


# create annotation for heatmap
annotation <- ComplexHeatmap::columnAnnotation(
  category = validation_markers_df$cell_type,
  col = list(
    category = category_colors
  ),
  annotation_legend_param = list(
      title = "Marker gene category",
      labels = unique(validation_markers_df$cell_type)
    )
)

ComplexHeatmap::Heatmap(
    heatmap_mtx,
    # set the color scale based on min and max values
    col = circlize::colorRamp2(seq(min(heatmap_mtx), max(heatmap_mtx), length = 2), colors = c("white", "#00274C")),
    border = TRUE,
    ## Row parameters
    cluster_rows = FALSE,
    row_title = "",
    row_title_side = "left",
    row_names_side = "left",
    row_dend_side = "right",
    row_names_gp = grid::gpar(fontsize = 10),
    ## Column parameters
    cluster_columns = FALSE,
    show_column_names = TRUE,
    column_names_gp = grid::gpar(fontsize = 8),
    top_annotation = annotation,
    heatmap_legend_param = list(
      title = "Mean expression"
    )
  )
```


And finally we'll look at our annotations on a UMAP! 
Because, why not. 

```{r}
ggplot(all_info_df, aes(x = UMAP1, y = UMAP2, color = final_lumped)) +
  geom_point(alpha = 0.5, size = 0.01) +
  # set color for all remaining cell types since there are more colors than in the palette
  scale_color_manual(values = c(palette.colors(palette = "Dark2"), "black", "grey65", "grey90")) +
  labs(color = "") +
  guides(color = guide_legend(override.aes = list(alpha = 1, size = 1.5)))
```

```{r, fig.height=7}
plot_faceted_umap(all_info_df, final_lumped, legend_title = "Final cell type annotations") +
  theme(strip.text = element_text(size = 6))
```


## Export cell types 

Now that we have our cell types let's export them to a TSV to save for future use! 

```{r}
annotation_df <- all_info_df |> 
  dplyr::mutate(
    # remove library id from barcodes since we have a library id column already 
    barcodes = stringr::word(barcodes, -1, sep = "-"),
    # assign ontology ID using consensus ontology ID
    # anything without an ID is either unknown or tumor 
    final_ontology = dplyr::if_else(
      final_annotation == consensus_annotation,
      consensus_ontology,
      final_annotation
    )
  ) |> 
  dplyr::select(
    barcodes,
    library_id,
    sample_id,
    sample_type,
    singler_ontology,
    singler_annotation,
    consensus_annotation,
    consensus_ontology,
    final_annotation,
    final_ontology
  )

readr::write_tsv(annotation_df, output_file)
```


## Session info 

```{r session info}
# record the versions of the packages used in this analysis and other environment information
sessionInfo()
```

