Characterizing Tissue Structure Identity of CODEX Dataset using K-means Clustering and Differential Expression Analysis

I'm a BME Freshman. I love to run, workout, and play piano!

Characterizing Tissue Structure Identity of CODEX Dataset using K-means Clustering and Differential Expression Analysis

Description of Data Visualization:

The CODEX dataset was normalized (log10(CPM+1)) by library size and its dimensionality was reduced using tSNE (seed(123)). The normalized dataset was also reduced using PCA, from which the top 10 PCs were used to conduct k-means clustering (k=4). K=4 was chosen because the scree plot showed an elbow around k=4-6, and a silhouette plot showed k=4 had the highest average silhouette between those three numbers.

A one-sided Wilcox rank-sum test was performed for all proteins in all clusters, and the top 4 differentially expressed proteins for each cluster was identified. The two clusters that were chosen for further analysis were clusters 2 and 3, which were chosen to analyze data from the central and peripheral regions of the tissue sample, based on the distribution of the clusters when plotted in xy space.

The most notable protein expressed by cluster 2 is CD8, which is a hallmark sign of CD8+ T cells (Shieh et al., 2014). The process of discerning the identity of cluster 3 was less deterministic and more holistic, where the thermofisher immune cell guide was used to find that all top three differentially expressed proteins (CD20, CD21, and CD35) in cluster 3 were characteristic of B cells.

Based on the cell types of clusters 2 and 3 (CD8+ T cells and B cells) the researcher determined that the tissue structure type of the CODEX dataset was white pulp, which contains these essential immune cells in abundance (Elmore S. A., 2006).

Sources:

Shieh, S. J., Varkey, P., Chen, P. Y., Chang, S. Y., & Huang, L. L. (2014). Counting CD4(+) and CD8(+) T cells in the spleen: a novel in vivo method for assessing biomaterial immunotoxicity. Regenerative biomaterials, 1(1), 11–16. https://doi.org/10.1093/rb/rbu003

Thermofisher immune cell guide: https://documents.thermofisher.com/TFS-Assets/LSG/brochures/immune-cell-guide.pdf

Elmore S. A. (2006). Enhanced histopathology of the spleen. Toxicologic pathology, 34(5), 648–655. https://doi.org/10.1080/01926230600865523

Code:

## PROMPT: move the cluster color legend to become a column to the right of the spatial plot, 
## change the point size to 2, and add a legend for the density plots to the bottom

## read in data
data <- read.csv('/Users/henryaceves/Desktop/JHU/S2/GDV/GDV datasets/codex_spleen2.csv.gz')
dim(data)

## ggplot
library(ggplot2)
library(patchwork)
library(cowplot)

## create data frame
pos <- data[,c('x','y')]
pexp <- data[,5:ncol(data)]
rownames(pos) <- rownames(pexp) <- data[,1]
area <- data[,4]
names(area) <- data[,1]
head(area)
head(pos)
head(pexp)
totpexp <- rowSums(pexp)
mat <- log10(pexp/totpexp*1e6+1)
pcs <- prcomp(mat, center = TRUE, scale = FALSE)
plot(pcs$sdev)
toppcs <- pcs$x[, 1:10]

## tSNE function (pulled from quiz review)
runTSNE <- function(mat) { ## you can write functions to help, it's ok to runTSNE on raw data but you can run it on linear dim
  ## reduced data so that it runs faster
  emb <- Rtsne::Rtsne(mat)
  embedding <- emb$Y
  colnames(embedding) <- c('tSNE1','tSNE2')
  rownames(embedding) <- rownames(mat)
  return(embedding)
}

embedding <- runTSNE(toppcs)
df <- data.frame(embedding)
ggplot(df, aes(x= tSNE1, y=tSNE2)) + geom_point(size=1, alpha=0.2)
dim(embedding)

## k means

# Define range of k values to test
k_values <- 1:15

# Calculate total within-cluster sum of squares for each k
withinness <- sapply(k_values, function(k) {
  kmeans(toppcs, centers = k, nstart = 10)$tot.withinss
})

# Create data frame for plotting
elbow_df <- data.frame(
  k = k_values,
  withinness = withinness
)

# Plot the elbow curve --> elbow @ around 4-5 clusters. When plotting a silhouette plot, average width tanks when k=5, so we choose k=4
ggplot(elbow_df, aes(x = k, y = withinness)) +
  geom_point() +
  geom_line() +
  labs(
    x = "Number of Clusters (k)",
    y = "Total Within-Cluster Sum of Squares",
    title = "Elbow Plot for K-Means Clustering"
  ) +
  scale_x_continuous(breaks = k_values) +
  theme_minimal()

## silhouette plot
library(cluster)

sil_width <- sapply(2:15, function(k) {
  km <- kmeans(toppcs, centers = k, nstart = 10)
  ss <- silhouette(km$cluster, dist(toppcs))
  mean(ss[, 3])
})

plot(2:15, sil_width, type = "b",
     xlab = "Number of Clusters (k)",
     ylab = "Average Silhouette Width")

# clustering
set.seed(123)
clusters <- as.factor(kmeans(toppcs, center=4)$cluster)
df <- data.frame(embedding, pos, clusters)

# Define colors for clusters
cluster_colors <- c("1" = "#DC0000", "2" = "#00A087", "3" = "#3C5488", "4" = "#F39B7F")

## Row 1: tSNE and Spatial plots
p1 <- ggplot(df, aes(x = tSNE1, y = tSNE2, col = clusters)) +
  geom_point(size = 1, alpha = 0.3) +
  scale_color_manual(values = cluster_colors) +
  labs(title = "tSNE Space", col = "Cluster") +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
    axis.text = element_text(color = "black"),
    legend.position = "none"
  )

p2 <- ggplot(df, aes(x = x, y = y, col = clusters)) +
  geom_point(size = 1, alpha = 0.3) +
  scale_color_manual(values = cluster_colors) +
  labs(title = "Spatial", col = "Cluster") +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
    axis.text = element_text(color = "black"),
    legend.position = "none"
  )

## differential protein expression analysis - asked Claude to loop through all clusters
results <- lapply(1:4, function(i) {
  clusterofinterest <- names(clusters)[clusters == i]
  othercells <- names(clusters)[clusters != i]
  
  out <- sapply(colnames(mat), function(protein) {
    x1 <- mat[clusterofinterest, protein]
    x2 <- mat[othercells, protein]
    wilcox.test(x1, x2, alternative = 'greater')$p.value
  })
  
  sort(out)
})

names(results) <- paste0("Cluster_", 1:4)

# View top proteins for each cluster
lapply(results, head)

# Colors for density plots
density_cluster_colors <- c("#DC0000", "#00A087", "#3C5488", "#F39B7F")

## asked Claude to help make density plots for top proteins of each cluster
## asked Claude to add p-values to the density plots (upper left, except Ki67 upper right)
## asked Claude to add CPM units to the density plots
# Function to create density plot with p-value and CPM units
create_density_plot <- function(clust, protein_rank) {
  protein <- names(results[[clust]])[protein_rank]
  pval <- results[[clust]][protein_rank]
  
  # Format p-value for display
  if (pval < 2.2e-16) {
    pval_text <- "p < 2.2e-16"
  } else {
    pval_text <- paste0("p = ", formatC(pval, format = "e", digits = 2))
  }
  
  df_hist <- data.frame(
    expression = mat[, protein],
    group = ifelse(clusters == clust, paste0("Cluster ", clust), "Other Clusters")
  )
  
  # Set p-value position based on protein name (Ki67 upper right, others upper left)
  if (protein == "Ki67") {
    x_pos <- Inf
    hjust_val <- 1.1
  } else {
    x_pos <- -Inf
    hjust_val <- -0.1
  }
  
  ggplot(df_hist, aes(x = expression, fill = group)) +
    geom_density(alpha = 0.5) +
    scale_fill_manual(values = setNames(
      c(density_cluster_colors[clust], "#4DBBD5"),
      c(paste0("Cluster ", clust), "Other Clusters")
    )) +
    labs(title = paste0("Cluster ", clust, ": ", protein),
         x = expression(log[10](CPM + 1)),
         y = "Density",
         fill = "") +
    annotate("text", x = x_pos, y = Inf, label = pval_text,
             hjust = hjust_val, vjust = 1.5, size = 3, fontface = "italic") +
    theme_classic() +
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold", size = 10),
      axis.text = element_text(color = "black", size = 8),
      axis.title = element_text(size = 9),
      legend.position = "none"
    )
}

## asked claude ============================================================
## 4x4 Plot: Top 4 Differentially Expressed Proteins per Cluster
## ============================================================

# Create a list to store ALL individual plots for 4x4
all_plots_4x4 <- list()
plot_index <- 1

# Loop through all 4 clusters
for (clust in 1:4) {
  
  # Get top 4 proteins for this cluster
  top4_proteins <- names(results[[clust]])[1:4]
  
  # Create density plots for each protein
  for (i in 1:4) {
    all_plots_4x4[[plot_index]] <- create_density_plot(clust, i)
    plot_index <- plot_index + 1
  }
}

# Combine all 16 plots into one big figure (4 rows x 4 columns)
# Each row = one cluster, each column = one of top 4 proteins
combined_plot_4x4 <- wrap_plots(all_plots_4x4, nrow = 4, ncol = 4, guides = "collect") +
  plot_annotation(
    title = "Top 4 Differentially Expressed Proteins per Cluster",
    theme = theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 16))
  ) &
  theme(legend.position = "bottom")

# Print the 4x4 combined plot
print(combined_plot_4x4)

## asked claude ============================================================
## Final Plot: tSNE/Spatial + Top 3 Proteins for Clusters 2 & 3
## ============================================================

## asked Claude to make the plot only have 2 rows under the first two graphs,
## those rows being the top 3 expressed proteins of cluster 2 and cluster 3

## Row 2: Top 3 proteins for Cluster 2
c2_p1 <- create_density_plot(2, 1)
c2_p2 <- create_density_plot(2, 2)
c2_p3 <- create_density_plot(2, 3)

## Row 3: Top 3 proteins for Cluster 3
c3_p1 <- create_density_plot(3, 1)
c3_p2 <- create_density_plot(3, 2)
c3_p3 <- create_density_plot(3, 3)

## Create a dummy plot to extract the CLUSTER legend (for spatial plot)
cluster_legend_plot <- ggplot(df, aes(x = tSNE1, y = tSNE2, col = clusters)) +
  geom_point(size = 2) +
  scale_color_manual(values = cluster_colors, name = "Cluster") +
  theme_classic() +
  theme(
    legend.position = "right",
    legend.title = element_text(face = "bold"),
    legend.text = element_text(size = 10)
  ) +
  guides(color = guide_legend(override.aes = list(size = 4, alpha = 1), ncol = 1))

# Extract cluster legend
cluster_legend <- get_legend(cluster_legend_plot)

## Create a dummy plot to extract the DENSITY legend (for density plots)
## Updated to show actual colors for Cluster 2 and Cluster 3
density_legend_df <- data.frame(
  expression = rep(1:4, 3),
  group = factor(
    c(rep("Cluster 2", 4), rep("Cluster 3", 4), rep("Other Clusters", 4)),
    levels = c("Cluster 2", "Cluster 3", "Other Clusters")
  )
)

density_legend_plot <- ggplot(density_legend_df, aes(x = expression, fill = group)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(
    values = c(
      "Cluster 2" = "#00A087",
      "Cluster 3" = "#3C5488", 
      "Other Clusters" = "#4DBBD5"
    ),
    name = "Density Groups"
  ) +
  theme_classic() +
  theme(
    legend.position = "bottom",
    legend.title = element_text(face = "bold"),
    legend.text = element_text(size = 10)
  ) +
  guides(fill = guide_legend(override.aes = list(alpha = 0.5), nrow = 1))

# Extract density legend
density_legend <- get_legend(density_legend_plot)

## Build the layout using plot_grid for precise control

# Row 1: tSNE, Spatial, and Cluster Legend (legend as column to the right)
row1 <- plot_grid(p1, p2, cluster_legend, ncol = 3, rel_widths = c(1, 1, 0.3))

# Row 2: Cluster 2 (3 plots) = 3 columns
row2 <- plot_grid(c2_p1, c2_p2, c2_p3, ncol = 3)

# Row 3: Cluster 3 (3 plots) = 3 columns
row3 <- plot_grid(c3_p1, c3_p2, c3_p3, ncol = 3)

# Combine all rows
all_rows <- plot_grid(
  row1, row2, row3,
  ncol = 1,
  rel_heights = c(1.2, 1, 1)
)

# Add title (two lines)
title <- ggdraw() +
  draw_label(
    "Characterizing Tissue Structure Identity of CODEX Dataset\nusing K-means Clustering and Differential Expression Analysis",
    fontface = 'bold',
    size = 14,
    x = 0.5,
    hjust = 0.5
  )

# Combine title, plots, and density legend at bottom
final_plot <- plot_grid(
  title,
  all_rows,
  density_legend,
  ncol = 1,
  rel_heights = c(0.08, 1, 0.05)
)

# Print the final plot
print(final_plot)

# from the graphs given, CD8 in cluster 2 and CD20 in cluster 3 are good proteins to focus on

20 Feb 2026

« HW 5 HW5 »