CODEX data analysis


Andre Forjaz
I'm a third year PhD candidate in the Wirtz Lab. My research focuses on generating 3D reconstructions of anatomical tissues with single-cell resolution to study disease progression.

CODEX data analysis
Genes extracted from cluster analysis

In the spleen data analyzed, the upregulated gene expressed in cluster 4 was CD8. On the other k-means cluster 6 analyzed, the upregulated genes were colagenIV, ECAD, CD21, and Podoplanin.

Options listed as potential tissues
  • Arteries/Veins
  • White pulp
  • Red pulp
  • Capsule/Trabeculae
Clusters data matched with respective tissue component

In summary, the CD8+ T-cell cluster 4 suggests white pulp.[1]

On the other hand, cluster 6 expresses the genes collagen IV, E-cadherin, CD21, and Podoplanin, which indicate cell types found in the blood vessels.[3]

So the data is most consistent with analysis of the white pulp and blood vessels.

References

1.Cesta MF. Normal Structure, Function, and Histology of the Spleen. Toxicologic Pathology. 2006;34(5):455-465. <doi:10.1080/01926230600867743>

2.https://teachmephysiology.com/gastrointestinal-system/other/function-of-spleen/

3.Chi, Jen-Tsan Ashley et al. “Endothelial cell diversity revealed by global expression profiling.” Proceedings of the National Academy of Sciences of the United States of America 100 (2003): 10623 - 10628. https://biostatsquid.com/volcano-plots-r-tutorial/ https://cran.r-project.org/web/packages/gridExtra/vignettes/arrangeGrob.html

#### author #### 
# Andre Forjaz
# jhed: aperei13

#### load packages #### 
library(Rtsne)
library(ggplot2)
library(patchwork)
library(gridExtra)
library(umap)
library(ggrepel)
library(tidyverse)
library(RColorBrewer)

#### Input ####
outpth <-'~/genomic-data-visualization-2024/homework/hw6/'
pth <-'~/genomic-data-visualization-2024/data/codex_spleen_subset.csv.gz'
data <- read.csv(pth, row.names=1)
# pick color palette
colpal <- c("#365359", "#91D2D9", "#c8d5e1","#D4F5DD","#a8a8a8","#b784a7","#967bb6","#8e4585")


# data preview
data[1:5,1:8]

# xy data 
pos <- data[, 1:2]
# area 
area <- data[, 3]
# gene data 
pexp <- data[4:ncol(data)]
head(pexp)

# normalize the data with log10 mean
pexpnorm <- log10(pexp/rowSums(pexp) * mean(rowSums(pexp))+1)

rownames(pos) <- rownames(pexpnorm) <- data$cell_id
head(pos)
head(pexpnorm)

# PCA
pcs <- prcomp(pexpnorm)
plot(pcs$sdev[1:30])

## PANEL A
## scree plot pca
df_a <- data.frame(standard_deviation = pcs$sdev[1:30],
                   PCA_index =1:30)

pa <- ggplot(df_a) + 
  geom_point(aes(x=PCA_index,y= standard_deviation),size = 2, color = colpal[3]) +
  labs(x = "PCA Index",y = "Standard deviation")+ 
  ggtitle("PCA scree plot")+
  theme_classic()
pa

# tsne on pcs
emb <- Rtsne::Rtsne(pcs$x)$Y
plot(emb)

## kmeans clustering
tw <- sapply(1:20, function(i) {
  kmeans(emb, centers=i)$tot.withinss
})
plot(tw, type='l')

## PANEL B
## total withiness kmeans clustering
df_b <- data.frame(total_withiness = tw,
                   cluster_index =1:20)

pb <- ggplot(df_b) + 
  geom_point(aes(x=cluster_index,y= total_withiness),size = 2, color = colpal[3]) +
  labs(x = "Cluster index",y = "Total withiness")+ 
  ggtitle("K-means total withiness")+
  theme_classic()
pb

# identify clusters kmeans
com <- as.factor(kmeans(emb, centers = 8)$cluster)

## Panel C
# tsne visualization 
df_c <- data.frame(V1 = emb[,1],
                   V2 = emb[,2],
                   cluster = com)

pc <- ggplot(df_c) + 
  geom_point(aes(V1, V2, color=cluster),size = 2) +
  scale_color_manual(values = colpal) +
  labs(x = "tSNE1",y = "tSNE2")+  
  ggtitle("t-SNE with k-means clustering of gene expression")+
  theme_classic()

pc

## Panel D
##################### spatial resolved clusters
df_d <- data.frame(x = pos$x,
                   y = pos$y,
                   cluster = com)

pd <- ggplot(df_d) + 
  geom_point(aes(x, y,
                 color= factor(cluster)),
             size =0.5,
             alpha = 0.8) +
  scale_color_manual(values = colpal, guide = guide_legend(title = NULL))+ 
  ggtitle("Spatial distribution of all clusters")+
  theme_classic()
pd

## Panel E
# spatial resolved cluster 6
cluster_kmeans <-6
pe <- ggplot(df_d) +
  geom_point(aes(x, y,
                 color= ifelse(cluster == cluster_kmeans, paste("Cluster ",cluster_kmeans), "Other")),
             size =0.5,
             alpha = 0.8) +
  scale_color_manual(values = colpal, guide = guide_legend(title = NULL))+
  ggtitle(paste("Spatial distribution of cluster", cluster_kmeans))+
  theme_classic()
pe

## Panel F
# differentially expressed genes for your cluster of interest
pv <- sapply(colnames(pexpnorm), function(i) {
  # print(i) ## print out gene name
  wilcox.test(pexpnorm[com == cluster_kmeans, i], pexpnorm[com != cluster_kmeans, i])$p.val
})

logfc <- sapply(colnames(pexpnorm), function(i) {
  # print(i) ## print out gene name
  log2(mean(pexpnorm[com == cluster_kmeans, i])/mean(pexpnorm[com != cluster_kmeans, i]))
})

# volcano plot
df_f <- data.frame(pv=pv, logfc, gene_symbol=colnames(pexpnorm))
# Credit: https://biostatsquid.com/volcano-plots-r-tutorial/
df_f$diffexpressed <- "NO"
# Set as "UP" if log2Foldchange > 0.6 and pvalue < 0.05
df_f$diffexpressed[df_f$logfc > 0.6 & df_f$pv < 0.05] <- "UP"
# Set as "DOWN" if log2Foldchange < -0.6 and pvalue < 0.05
df_f$diffexpressed[df_f$logfc < -0.6 & df_f$pv <0.05] <- "DOWN"
# df_f$delabel <- NA
df_f$delabel <- ifelse(df_f$gene_symbol %in% head(df_f[order(df_f$pv),"gene_symbol"],50),df_f$gene_symbol, NA)

# Explore the data
head(df_f[order(df_f$pv) & df_f$diffexpressed == 'UP', ])


pf <- ggplot(df_f, aes(x = logfc, y = -log10(pv+1e-20), col = diffexpressed, label = delabel)) +
  geom_point(aes(x = logfc, y = -log10(pv+1e-20))) +
  geom_vline(xintercept = c(-0.6, 0.6), col = "gray", linetype = 'dashed') +
  geom_hline(yintercept = -log10(0.05), col = "gray", linetype = 'dashed') +
  # scale_color_manual(values = c( "grey", "#FFDB6D"),
  #                    labels = c( "Not significant", "Upregulated"))+
  scale_color_manual(values = c("#00AFBB", "grey", "#FFDB6D"),
                     labels = c("Downregulated", "Not significant", "Upregulated")) +
  labs(color = 'Gene expression',
       x = expression("log"[2]*"FC"), y = expression("-log"[10]*"p-value")) + 
  scale_x_continuous(breaks = seq(-10, 10, 2)) +
  theme_classic() +
  geom_label_repel(data = subset(df_f, diffexpressed != "NO"), 
                  aes(label = delabel),
                  box.padding = 0.5,
                  point.padding = 0.5,
                  show.legend = FALSE,
                  max.overlaps = 40)
pf

## Panel G
# spatial resolved cluster 4
cluster_kmeansg <-4
pg <- ggplot(df_d) +
  geom_point(aes(x, y,
                 color= ifelse(cluster == cluster_kmeansg, paste("Cluster ",cluster_kmeansg), "Other")),
             size =0.5,
             alpha = 0.8) +
  scale_color_manual(values = colpal, guide = guide_legend(title = NULL))+
  ggtitle(paste("Spatial distribution of cluster", cluster_kmeansg))+
  theme_classic()
pg

## Panel H
# differentially expressed genes for your cluster of interest
pv2 <- sapply(colnames(pexpnorm), function(i) {
  # print(i) ## print out gene name
  wilcox.test(pexpnorm[com == cluster_kmeansg, i], pexpnorm[com != cluster_kmeansg, i])$p.val
})

logfc2 <- sapply(colnames(pexpnorm), function(i) {
  # print(i) ## print out gene name
  log2(mean(pexpnorm[com == cluster_kmeansg, i])/mean(pexpnorm[com != cluster_kmeansg, i]))
})

# volcano plot
df_h <- data.frame(pv2=pv2, logfc2, gene_symbol=colnames(pexpnorm))
# Credit: https://biostatsquid.com/volcano-plots-r-tutorial/
df_h$diffexpressed <- "NO"
# Set as "UP" if log2Foldchange > 0.6 and pvalue < 0.05
df_h$diffexpressed[df_h$logfc2 > 0.6 & df_h$pv2 < 0.05] <- "UP"
# Set as "DOWN" if log2Foldchange < -0.6 and pvalue < 0.05
df_h$diffexpressed[df_h$logfc2 < -0.6 & df_h$pv2 <0.05] <- "DOWN"
# df_h$delabel <- NA
df_h$delabel <- ifelse(df_h$gene_symbol %in% head(df_h[order(df_h$pv2),"gene_symbol"],50),df_h$gene_symbol, NA)

# Explore the data
head(df_h[order(df_h$pv2) & df_h$diffexpressed == 'UP', ])


ph <- ggplot(df_h, aes(x = logfc2, y = -log10(pv2+1e-20), col = diffexpressed, label = delabel)) +
  geom_point(aes(x = logfc2, y = -log10(pv2+1e-20))) +
  geom_vline(xintercept = c(-0.6, 0.6), col = "gray", linetype = 'dashed') +
  geom_hline(yintercept = -log10(0.05), col = "gray", linetype = 'dashed') +
  scale_color_manual(values = c( "grey", "#FFDB6D"),
                     labels = c( "Not significant", "Upregulated"))+
  # scale_color_manual(values = c("#00AFBB", "grey", "#FFDB6D"),
  #                    labels = c("Downregulated", "Not significant", "Upregulated")) +
  labs(color = 'Gene expression',
       x = expression("log"[2]*"FC"), y = expression("-log"[10]*"p-value")) + 
  scale_x_continuous(breaks = seq(-10, 10, 2)) +
  theme_classic() +
  geom_label_repel(data = subset(df_h, diffexpressed != "NO"), 
                   aes(label = delabel),
                   box.padding = 0.25,
                   point.padding = 0.25,
                   show.legend = FALSE,
                   max.overlaps = 40)
ph


#### Plot results ####

# combined_plot <-grid.arrange( pa , pb , pc, pd , pe , pf , pg, ph , layout_matrix = lay)
# combined_plot <-(pa + pb + pc+ pd) / ( pe + pf+pg+ ph)  + plot_annotation(tag_levels = 'A')
combined_plot <-(pa + pb + pc) / ( pd+ pe + pf)/(pg+ ph)  + plot_annotation(tag_levels = 'A')

#### Save results ####
ggsave(paste0(outpth, "hw6_aperei13.png"), 
       plot = combined_plot,
       width = 12,
       height = 12,
       units = "in")