CODEX - Spleen White Pulp

Hello, this is Harriet! I’m a first-year BME master's student. Above are my friend’s cat (a Ragdoll on the left) and my cat, Ananas (a Maine Coon on the right). Ananas means 'pineapple' in French—he’s an 18lb cat who loves to eat and sleep. I am currently conducting bioinformatics and spatial transcriptomics research in both the Adamo Lab and Umapathi Lab, focusing on Myocarditis and OGT/OGA.

### I think the CODEX dataset represents splenic white pulp. With the available dataset, I have performed log transform, removal of extra data point, K-mean clustering. With those preliminary steps done, I moved on to k=5 clusters and visualize these cluster in PCA, TSNE and spatial physical space. These plots shown distinctive localized cluster, but there are also some areas that are relatively separate out. I was done the wilcoxon two sided test on 5 different clusters and calculate the Log2FC and p-value for genes inside of each cluster to see their significance. With the result got, I visualized them with volcano plot and saw what ever is upregulated and downregulated. Looking at Cluster 2 with key markers of CD3e, CD4 and CD45RO, these are T cells which is the characteristic of the lymphoid tissue in white pulp (Van Krieken, J.H., & te Velde, J.,1986). Looking at cluster 4, the key gene markers are CD21, CD35, and CD20. “These are markers for follicular dendritic cells and B-cells. In mice and rats, the white pulp is composed of Tcell zones, the periarterial lymphatic sheaths (PALSs), and B-cell zones, the follicles and the marginal zone (MZ).” (Steiniger, B.S., 2015). Moreover, Cluster 3 was found to express CD68 and CD163, a marker for marginal zone macrophages that constitute the border of white pulp (Steiniger, B.S., 2015). The distinct cellular populations of the clearly spatially organized tissue in this sample strongly implicate that it is splenic white pulp.

Reference: van Krieken, J. H., & te Velde, J. (1986). Immunohistology of the human spleen: an inventory of the localization of lymphocyte subpopulations. Histopathology, 10(3), 285–294. https://doi.org/10.1111/j.1365-2559.1986.tb02482.x Steiniger B. S. (2015). Human spleen microanatomy: why mice do not suffice. Immunology, 145(3), 334–346. https://doi.org/10.1111/imm.12469  Steiniger B. S. (2015). Human spleen microanatomy: why mice do not suffice. Immunology, 145(3), 334–346. https://doi.org/10.1111/imm.12469

Code (paste your code in between the ``` symbols)

library(ggplot2)
library(Rtsne)
library(dplyr)
library(patchwork)
set.seed(42)
data <- read.csv("/Users/harriethe/GenomicDataVisualization/genomic-data-visualization-2025/data/codex_spleen_3.csv.gz", row.names=1)
head(data)
pos <- data[, c("x", "y")]
exp <- data[, -(1:3)]
exp_norm <- log10(exp + 1)
total_expression <- rowSums(exp_norm)
hist(total_expression, breaks=50, main="Total Expression Distribution", xlab="Total Expression", col="skyblue")
lc_threshold <- quantile(rowSums(data[, -c(1:3)]), 0.25)
data_filtered <- data[rowSums(data[, -c(1:3)]) >= lc_threshold, ]
dim(data)    
dim(data_filtered)
#pca
expression_data <- data_filtered[, -c(1:3)]  
pcs <- prcomp(expression_data, center = TRUE, scale. = TRUE)
#scree plot
scree_df <- data.frame(PC = 1:length(pcs$sdev), Variance = (pcs$sdev)^2)
scree_plot <- ggplot(scree_df[1:10,], aes(x = PC, y = Variance)) +
  geom_line() +
  geom_point() +
  ggtitle("Scree Plot of PCA") +
  xlab("Principal Components") +
  ylab("Variance Explained") +
  theme_minimal()
print(scree_plot)

ks <- 2:10
wcss <- sapply(ks, function(k) {
  kmeans(pcs$x[, 1:10], centers = k, nstart = 25)$tot.withinss
})
elbow_df <- data.frame(k = ks, WCSS = wcss)
elbow_plot<- ggplot(elbow_df, aes(x = k, y = WCSS)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  ggtitle("Elbow Method for Optimal k") +
  xlab("Number of Clusters (k)") +
  ylab("Within Cluster Sum of Squares") 
# Eh, maybe k=5?
print(elbow_plot)
#scatter plot
pca_df <- data.frame(PC1 = pcs$x[,1], PC2 = pcs$x[,2])
pca_scatter <- ggplot(pca_df, aes(x = PC1, y = PC2)) +
  geom_point(alpha = 0.5) +
  ggtitle("PCA Projection (PC1 vs PC2)") +
  xlab("Principal Component 1") +
  ylab("Principal Component 2") +
  theme_minimal()
print(pca_scatter)

tsne_input <- pcs$x[, 1:10]  
set.seed(42)
tsne_result <- Rtsne(tsne_input, perplexity = 30, verbose = TRUE, max_iter = 1000)
tsne_df <- data.frame(tSNE1 = tsne_result$Y[,1], tSNE2 = tsne_result$Y[,2])
library(ggplot2)
ggplot(tsne_df, aes(x = tSNE1, y = tSNE2)) +
  geom_point(alpha = 0.6, color = "blue") +
  theme_minimal() +
  ggtitle("t-SNE Projection of CODEx Data") +
  xlab("t-SNE 1") +
  ylab("t-SNE 2")

#if k = 5
k <- 5
kmeans_result <- kmeans(pcs$x[, 1:10], centers = k, nstart = 25)

# Add cluster labels to the dataset
clusters <- as.factor(kmeans_result$cluster)

# Create PCA scatter plot colored by cluster
pca_cluster_df <- data.frame(PC1 = pcs$x[, 1], PC2 = pcs$x[, 2], Cluster = clusters)

pc_plot <- ggplot(pca_cluster_df, aes(x = PC1, y = PC2, col = Cluster)) +
  geom_point(alpha = 0.6) +
  theme_minimal() +
  ggtitle("PCA Projection Colored by K-means Clusters") +
  xlab("Principal Component 1") +
  ylab("Principal Component 2")
print(pc_plot)

spatial_df <- data.frame(x = data_filtered$x, y = data_filtered$y, Cluster = clusters)

spatial_plot <- ggplot(spatial_df, aes(x = x, y = y, col = Cluster)) +
  geom_point(alpha = 0.6) +
  theme_minimal() +
  ggtitle("Spatial Distribution of Clusters") +
  xlab("X Coordinate") +
  ylab("Y Coordinate")
print(spatial_plot)

# tsne cluster

tsne_df$Cluster <- as.factor(clusters) 
tsne_plot <- ggplot(tsne_df, aes(x = tSNE1, y = tSNE2, color = Cluster)) +
  geom_point(alpha = 0.6) +
  theme_minimal() +
  ggtitle("t-SNE Projection of CODEx Data (Colored by Clusters)") +
  xlab("t-SNE 1") +
  ylab("t-SNE 2") 

print(tsne_plot)
#calculate wilcoxon test
de_results <- list()
for (i in levels(clusters)) {
  cluster_cells <- names(clusters)[clusters == i]
  other_cells <- names(clusters)[clusters != i]
  
  p_values <- apply(data_filtered[, -c(1:3)], 2, function(gene) {
    wilcox.test(gene[cluster_cells], gene[other_cells], alternative = 'two.sided')$p.value
  })
  
  log2FC <- apply(data_filtered[, -c(1:3)], 2, function(gene) {
    mean_cluster <- mean(gene[cluster_cells])
    mean_other <- mean(gene[other_cells])
    log2(mean_cluster / mean_other)
  })
  
  de_results[[i]] <- data.frame(Gene = names(p_values), Log2FC = log2FC, P_Value = p_values) %>%
    arrange(P_Value) %>% 
    mutate(Cluster = i)
}

de_markers <- bind_rows(de_results)
de_markers <- de_markers %>%
  mutate(Significant = ifelse(P_Value < 0.01 & abs(Log2FC) > 1, "Yes", "No"))
top_markers <- de_markers %>%
  filter(Significant == "Yes") %>%
  group_by(Cluster) %>%
  slice_min(P_Value, n = 10)
volcano_plots <- list()
unique_clusters <- unique(de_markers$Cluster)
for (cl in unique_clusters) {
  cluster_data <- subset(de_markers, Cluster == cl) 
  cluster_data$Regulation <- ifelse(cluster_data$Log2FC > 1 & cluster_data$P_Value < 0.01, "Upregulated",
                             ifelse(cluster_data$Log2FC < -1 & cluster_data$P_Value < 0.01, "Downregulated", "Not Significant"))
  top_genes <- cluster_data[order(cluster_data$P_Value), ][1:10, ]  

  
  volcano_plot <- ggplot(cluster_data, aes(x = Log2FC, y = -log10(P_Value), col = Regulation)) +  
    geom_point(alpha = 0.6) +
    scale_color_manual(values = c("Upregulated" = "red", "Downregulated" = "blue", "Not Significant" = "gray")) +  
    theme_minimal() +
    ggtitle(paste("Volcano Plot of Cluster", cl)) +
    xlab("Log2 Fold Change") +
    ylab("-log10(P-Value)") +
    geom_hline(yintercept = -log10(0.01), linetype = "dashed", color = "blue") +
    geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "red") +
    geom_text_repel(data = top_genes, aes(label = Gene, col = Regulation), size = 3, box.padding = 0.5, max.overlaps = 15)
  volcano_plots[[paste0("volcano_cluster", cl)]] <- volcano_plot
}
names(volcano_plots)

final_plot <- (scree_plot / pc_plot) | (spatial_plot / tsne_plot) | 
              (volcano_plots$volcano_cluster1 / 
               volcano_plots$volcano_cluster2  / 
               volcano_plots$volcano_cluster3 / 
               volcano_plots$volcano_cluster4  /  
               volcano_plots$volcano_cluster5)
png("hw5_jhe46.png", width = 5000, height = 4000, res = 300)
print(final_plot)
dev.off()

27 Feb 2025

« Identifying the Tissue Cluster characterization in sequencing-based spatial transcriptomics »