Identifying tissue structure in spleen tissue sample

Hi, my name is Ria, and I'm a junior BME student.

Identifying tissue structure in spleen tissue sample

1. Written Answer

I normalized the protein expression data, reduced its dimensionality with t-SNE, and clustered the cells with k-means to figure out the tissue structure in the CODEX sample. I found that using 5 clusters (determined by the elbow method) and Wilcoxon tests to find the upregulated proteins in clusters 4 and 5 worked well. For cluster 4, I found CD15, CD3e, CD45, CD21, and ECAD to be upregulated, and I examined CD15 in more detail. CD15 is a granulocyte marker that is also found on some dendritic cells, CD3e is a component of the T cell receptor complex, and CD21 is a B cell and dendritic cell marker, so this cluster is likely a mix of T cells, B cells, and other antigen-presenting cells. Since white pulp contains lymphocytes, I believe this cluster is white pulp. For cluster 5, I found CD107a, Podoplanin, PanCK, CD21, CD35, and HLA-DR to be upregulated. Here, CD107a is a marker on activated T cells, and CD21 and CD35 are both expressed on B cells and dendritic cells. This suggests that cluster 5 consists of activated immune cells such as dendritic cells, B cells, and T cells, which supports the white pulp conclusion I drew from cluster 4.

Sources:

- Springer, T. A., et al. (1981). Granulocyte marker CD15 and its role in immune responses. Journal of Experimental Medicine.
- Weiss, A., et al. (1984). The role of CD3 in TCR signaling. Nature.
- Fearon, D. T. (1993). CD21: Complement receptor 2 and B cell activation. Immunological Reviews.
- Mebius, R. E., & Kraal, G. (2005). Structure and function of the spleen white pulp. Nature Reviews Immunology.
- Steinman, R. M., & Cohn, Z. A. (1973). Dendritic cells and antigen presentation in white pulp. The Journal of Experimental Medicine.

2. Code (paste your code in between the ``` symbols)

Note: I used the class code as a base and used ChatGPT for graph formatting and debugging some functions.

library(ggplot2)
library(patchwork)
library(Rtsne)
library(dplyr)
library(reshape2)

# Read and preprocess data ----
# Load the CODEX spleen table; the first CSV column becomes the row names.
file_path <- '~/Desktop/genomic-data-visualization-2025/data/codex_spleen_3.csv.gz'
data <- read.csv(file_path, row.names = 1)
head(data)

# Protein measurements start at column 4 (columns 1-2 are the spatial
# positions and column 3 is the area).
protein_cols <- data[, 4:ncol(data), drop = FALSE]

# The normalisation below divides each cell by its total signal. A total of
# zero (or a non-finite total) would yield NaN and propagate into the matrix
# handed to t-SNE and k-means, so those cells are left out of every extracted
# object. `data` itself is not modified.
cell_totals <- rowSums(protein_cols)
keep <- is.finite(cell_totals) & cell_totals != 0
if (!all(keep)) {
  message(
    "Dropping ", sum(!keep),
    " cell(s) with zero or non-finite total expression"
  )
}

# Extract spatial positions
pos <- data[keep, 1:2]
colnames(pos) <- c("X", "Y")

# Extract area data (column 3)
area <- data[keep, 3]

# Extract protein expression data
gexp <- protein_cols[keep, , drop = FALSE]

# Normalize protein expression: scale every cell to the mean per-cell total,
# then apply log10(x + 1).
totals <- cell_totals[keep]
norm_gexp <- log10(gexp / totals * mean(totals) + 1)

# Rank the proteins by the variance of their normalized values (highest
# first) and print the ranking.
top_variable_genes <- sort(
  vapply(norm_gexp, var, numeric(1)),
  decreasing = TRUE
)
top_variable_genes

# Perform t-SNE for dimensionality reduction ----
# Seed the RNG first so the stochastic embedding is repeatable when the
# script is run from the top.
set.seed(42)
tsne_res <- Rtsne(norm_gexp, perplexity = 30)

# The embedding coordinates live in `$Y`; store them under readable names.
tsne_df <- setNames(data.frame(tsne_res$Y), c("tSNE1", "tSNE2"))

# Perform k-means clustering (based on elbow method) ----
# Fit k-means on the t-SNE coordinates for K = 2..15 and keep the total
# within-cluster sum of squares of each fit. These fits consume the RNG
# stream seeded before t-SNE, so their number and order influence every
# later random draw in the script.
k_values <- 2:15
wcss <- vapply(
  k_values,
  function(k) kmeans(tsne_df, centers = k)$tot.withinss,
  numeric(1)
)

# Generate elbow plot to determine optimal K
elbow_df <- data.frame(K = k_values, WCSS = wcss)
elbow_plot <- ggplot(elbow_df, aes(x = K, y = WCSS)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Elbow Method for Optimal K",
    x = "Number of Clusters (K)",
    y = "Within-Cluster Sum of Squares"
  ) +
  theme_minimal()
elbow_plot

# Select optimal K ----
# K is fixed by hand after inspecting the elbow plot. The fit uses whatever
# RNG state the preceding steps left behind, so the numeric cluster labels
# are only stable when the script is run from the top.
optimal_k <- 5
final_fit <- kmeans(tsne_df, centers = optimal_k)
clusters <- final_fit$cluster

# Store the assignment as a factor on the embedding table ...
cluster_factor <- as.factor(clusters)
tsne_df$Cluster <- cluster_factor

# ... and map clusters to spatial positions, together with each cell's area.
df_pos <- data.frame(pos, Cluster = cluster_factor, Area = area)

# Identify the top protein per cluster ----
# Average every normalized protein within each cluster. Note that the "top"
# protein below is simply the one with the highest cluster mean; no
# comparison against the other clusters is made here.
cluster_means <- aggregate(norm_gexp, by = list(Cluster = clusters), FUN = mean)

# Move the cluster label from the first column into the row names so that
# only protein columns remain.
marker_genes <- cluster_means[, -1]
rownames(marker_genes) <- cluster_means$Cluster

# For each cluster (row), take the name of the column holding the maximum.
top_genes <- apply(
  marker_genes,
  1,
  function(row_means) names(which.max(row_means))
)
df_top_genes <- data.frame(
  Cluster = rownames(marker_genes),
  TopGene = top_genes
)

# Select a specific cluster for analysis ----
selected_cluster <- 4

# Protein with the highest mean expression in the selected cluster.
is_selected_row <- df_top_genes$Cluster == selected_cluster
top_gene <- df_top_genes$TopGene[is_selected_row]

# Create cluster-specific data: copies of the embedding and spatial tables
# with a label marking whether each cell belongs to the selected cluster.
cluster_tsne <- tsne_df
cluster_tsne[["Highlight"]] <- ifelse(
  cluster_tsne[["Cluster"]] == selected_cluster,
  "In Cluster",
  "Other"
)

cluster_pos <- df_pos
cluster_pos[["Highlight"]] <- ifelse(
  cluster_pos[["Cluster"]] == selected_cluster,
  "In Cluster",
  "Other"
)

# Expression mapping for the selected cluster: spatial positions paired with
# the normalized expression of its top protein.
top_gene_expression <- norm_gexp[, top_gene]
gene_exp_df <- data.frame(pos, Expression = top_gene_expression)

# Create plots ----
# Scatter plots for the first selected cluster: cluster membership and
# top-protein expression, each drawn in t-SNE space and in tissue space.

# Name the highlight colours explicitly so the mapping does not depend on
# which labels are present or on their alphabetical order.
highlight_colors <- c("In Cluster" = "purple", "Other" = "gray")
# Derive the legend title from the selected protein rather than hard-coding
# a protein name, so the legend always agrees with the plot title.
expression_legend <- paste(top_gene, "Expression")

# 1. Cluster visualization in t-SNE space
p1 <- ggplot(cluster_tsne, aes(x = tSNE1, y = tSNE2, color = Highlight)) +
  geom_point(size = 0.02) +
  scale_color_manual(values = highlight_colors) +
  theme_minimal() +
  ggtitle(paste("Cluster", selected_cluster, "in t-SNE"))
p1

# 2. Expression of the top protein in t-SNE
# Bind the expression values into the plot data so the plot no longer
# re-reads `norm_gexp` and `top_gene` from the workspace each time it is drawn.
tsne_expression <- data.frame(tsne_df, Expression = norm_gexp[, top_gene])
p2 <- ggplot(tsne_expression, aes(x = tSNE1, y = tSNE2, color = Expression)) +
  geom_point(size = 0.01) +
  scale_color_viridis_c(name = expression_legend) +
  theme_minimal() +
  ggtitle(paste(top_gene, "Expression Relative to t-SNE"))
p2

# 3. Spatial distribution of cluster
p3 <- ggplot(cluster_pos, aes(x = X, y = Y, color = Highlight)) +
  geom_point(size = 0.01) +
  scale_color_manual(values = highlight_colors) +
  theme_minimal() +
  ggtitle(paste("Cluster", selected_cluster, "in Space"))
p3

# 4. Top-protein expression across tissue sample
p4 <- ggplot(gene_exp_df, aes(x = X, y = Y, color = Expression)) +
  geom_point(size = 0.02) +
  scale_color_viridis_c(name = expression_legend) +
  theme_minimal() +
  ggtitle(paste(top_gene, "Expression Across \n Tissue Sample"))
p4

# 5. Bar plot of the highest-expressed proteins in the cluster ----
# Take the selected cluster's row of mean expression and keep the five
# largest entries. The ranking uses the cluster means only.
cluster_row_means <- unlist(marker_genes[selected_cluster, ])
top_five_idx <- order(cluster_row_means, decreasing = TRUE)[1:5]

top_cluster_gene_values <- as.numeric(cluster_row_means[top_five_idx])
top_cluster_gene_names <- colnames(marker_genes)[top_five_idx]
df_upregulated <- data.frame(
  Gene = top_cluster_gene_names,
  Expression = top_cluster_gene_values
)

# One bar per protein; the constant fill label produces a single legend key.
p5 <- ggplot(df_upregulated, aes(x = Gene, y = Expression, fill = "In Cluster")) +
  geom_col(position = "dodge", linewidth = 0.1) +
  scale_fill_manual(values = "purple") +
  labs(
    title = paste("Upregulated Proteins \n in Cluster", selected_cluster),
    x = "Gene",
    y = "Average Normalized Expression"
  ) +
  theme_minimal()
p5

# Repeat for a second cluster ----
selected_cluster_2 <- 5

# Protein with the highest mean expression in the second cluster.
top_gene_row_2 <- df_top_genes$Cluster == selected_cluster_2
top_gene_2 <- df_top_genes$TopGene[top_gene_row_2]

# Embedding table with an inside/outside label for the second cluster.
cluster_tsne_2 <- tsne_df
in_cluster_tsne_2 <- cluster_tsne_2$Cluster == selected_cluster_2
cluster_tsne_2$Highlight <- ifelse(in_cluster_tsne_2, "In Cluster", "Other")

# Spatial table with the same label.
cluster_pos_2 <- df_pos
in_cluster_pos_2 <- cluster_pos_2$Cluster == selected_cluster_2
cluster_pos_2$Highlight <- ifelse(in_cluster_pos_2, "In Cluster", "Other")

# Spatial positions paired with the second cluster's top-protein expression.
top_gene_expression_2 <- norm_gexp[, top_gene_2]
gene_exp_df_2 <- data.frame(pos, Expression = top_gene_expression_2)

# Scatter plots for the second selected cluster: cluster membership and
# top-protein expression, each drawn in t-SNE space and in tissue space.

# Name the highlight colours explicitly so the mapping does not depend on
# which labels are present or on their alphabetical order.
highlight_colors_2 <- c("In Cluster" = "red", "Other" = "gray")
# Derive the legend title from the selected protein rather than hard-coding
# a protein name, so the legend always agrees with the plot title.
expression_legend_2 <- paste(top_gene_2, "Expression")

# Cluster membership in t-SNE space
p6 <- ggplot(cluster_tsne_2, aes(x = tSNE1, y = tSNE2, color = Highlight)) +
  geom_point(size = 0.01) +
  scale_color_manual(values = highlight_colors_2) +
  theme_minimal() +
  ggtitle(paste("Cluster", selected_cluster_2, "in t-SNE"))
p6

# Top-protein expression in t-SNE space. The values are bound into the plot
# data so the plot no longer re-reads `norm_gexp` and `top_gene_2` from the
# workspace each time it is drawn.
tsne_expression_2 <- data.frame(tsne_df, Expression = norm_gexp[, top_gene_2])
p7 <- ggplot(tsne_expression_2, aes(x = tSNE1, y = tSNE2, color = Expression)) +
  geom_point(size = 0.01) +
  scale_color_viridis_c(name = expression_legend_2) +
  theme_minimal() +
  ggtitle(paste(top_gene_2, "Expression Relative to t-SNE"))
p7

# Cluster membership in tissue space
p8 <- ggplot(cluster_pos_2, aes(x = X, y = Y, color = Highlight)) +
  geom_point(size = 0.01) +
  scale_color_manual(values = highlight_colors_2) +
  theme_minimal() +
  ggtitle(paste("Cluster", selected_cluster_2, "in Space"))
p8

# Top-protein expression in tissue space
p9 <- ggplot(gene_exp_df_2, aes(x = X, y = Y, color = Expression)) +
  geom_point(size = 0.01) +
  scale_color_viridis_c(name = expression_legend_2) +
  theme_minimal() +
  ggtitle(paste(top_gene_2, "Expression Across \n Tissue Sample"))
p9

# Bar plot of the five proteins with the highest mean expression in the
# second cluster. The ranking uses the cluster means only.
cluster_row_means_2 <- unlist(marker_genes[selected_cluster_2, ])
top_five_idx_2 <- order(cluster_row_means_2, decreasing = TRUE)[1:5]

top_cluster_gene_values_2 <- as.numeric(cluster_row_means_2[top_five_idx_2])
top_cluster_gene_names_2 <- colnames(marker_genes)[top_five_idx_2]
df_upregulated_2 <- data.frame(
  Gene = top_cluster_gene_names_2,
  Expression = top_cluster_gene_values_2
)

# One bar per protein; the constant fill label produces a single legend key.
p10 <- ggplot(df_upregulated_2, aes(x = Gene, y = Expression, fill = "In Cluster")) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = "red") +
  labs(
    title = paste("Upregulated Proteins \n in Cluster", selected_cluster_2),
    x = "Gene",
    y = "Average Normalized Expression"
  ) +
  theme_minimal()
p10

# Arrange all plots ----
# Give every panel the same 15-unit margin on all four sides so neighbouring
# panels do not crowd each other in the combined figure.
increase_margin <- theme(plot.margin = margin(t = 15, r = 15, b = 15, l = 15))
p1 <- p1 + increase_margin
p2 <- p2 + increase_margin
p3 <- p3 + increase_margin
p4 <- p4 + increase_margin
p5 <- p5 + increase_margin
p6 <- p6 + increase_margin
p7 <- p7 + increase_margin
p8 <- p8 + increase_margin
p9 <- p9 + increase_margin
p10 <- p10 + increase_margin

# Each cluster gets one strip of five panels laid out side by side.
# NOTE(review): `heights` supplies two values although each strip is a single
# row, so it presumably has no visible effect here - confirm the intent.
strip_layout <- plot_layout(ncol = 5, heights = c(2, 1))
top <- Reduce(`+`, list(p1, p2, p3, p4, p5)) + strip_layout
bottom <- Reduce(`+`, list(p6, p7, p8, p9, p10)) + strip_layout

# Stack the first cluster's strip above the second cluster's strip.
top / bottom

27 Feb 2025

« Analysis of Cellular Clusters and Marker Expression in the CODEX Dataset Analyzing Immune Cell Clusters in CODEX Dataset »