Animating the tSNE embedding of CODEX spleen dataset

I'm a current senior in BME, focusing on Biomedical Data Science. In my free time, I enjoy exploring coffee shops, reading, and going to group workout classes!

Animating the tSNE embedding of CODEX spleen dataset

I thought it would be fun to animate the transition from normal space (our x/y positional coordinates) to the tSNE embedded space. After normalizing my data, I first performed kmeans clustering on the protein levels and visualized the clusters in space. Then, I plotted my points in the tSNE space and colored them by the same cluster labels. Finally, I used gganimate to animate the transition between the two plots. Doing so helped me better appreciate a few things. First, I was able to better visualize how tSNE preserves local clusterings by seeing how the points close to one another ended up together in tSNE space. Second, I saw how some initial separation of two clusters in tSNE space could further drive those clusters far apart, even if they were close together in physical space. This further affirms that separations between clusters in tSNE embeddings do not necessarily represent how similar or dissimilar those clusters are. In addition, I animated the transition from normal space to PC space (code is included below), which clearly showed how PCA can be viewed as a projection of the data onto another set of coordinates. This is in stark contrast to tSNE, where there is more of a “smooth” transition from the normal to the tSNE space—consistent with the method of finding a lower-dimensional representation by “placing” points based on a probability distribution.

References:

https://gganimate.com/reference/transition_states.html
https://stackoverflow.com/questions/51440496/using-gganimate-to-export-gif

Code (paste your code in between the ``` symbols)

library(ggplot2)
library(patchwork)
library(Rtsne)
library(gganimate)
library(gifski)
library(dplyr)

#### read in data ####
# Column 1 is used as the row names, columns 2-3 become the positions,
# column 4 the areas, and every remaining column the protein measurements.
data <- read.csv(
  '~/Desktop/GDV/genomic-data-visualization-2025/data/codex_spleen_3.csv.gz'
)
pos <- data[, c(2, 3)]
areas <- data[[4]]
prots <- data[, seq(5, ncol(data))]
# label both tables with the identifiers from the first column
row.names(pos) <- data[[1]]
row.names(prots) <- data[[1]]

#### set ggplot theme/formatting details
# Classic theme with every title element (plot, both axes, legend) at size 8.
ggtheme <- local({
  small_text <- element_text(size = 8)
  theme_classic() +
    theme(
      plot.title = small_text,
      axis.title.x = small_text,
      axis.title.y = small_text,
      legend.title = small_text
    )
})

# using library size normalization #
# every row of prots is divided by that row's total
norm_prots <- prots / rowSums(prots)

#### look for overall highest vals, formulate hypotheses ####
# colSums() already names its result after the columns of prots, so the
# per-protein totals can be sorted (ascending) directly
tot_prots <- sort(colSums(prots))

#### visualize spots and kmeans clusters ####

# kmeans
# Fit k-means on the normalized proteins for each candidate k and keep the
# total within-cluster sum of squares (tot.withinss) for the elbow plot.
# The seed is set once, before the sweep, so the whole sequence of fits
# (and any later random draws) is reproducible.
set.seed(10)
ks <- 2:12
# vapply() guarantees one numeric value per k instead of relying on
# sapply()'s result simplification
totws <- vapply(ks, function(k) {
  # progress note; message() keeps it out of standard output
  message('kmeans with k = ', k)
  kmeans(norm_prots, centers = k)$tot.withinss
}, numeric(1))

# one row per candidate k, ready for plotting
totws_df <- data.frame(k = ks, totw = totws)

# elbow plot
# Total within-cluster sum of squares against k; the bend in this curve is
# what guides the choice of the number of clusters.
elbow_plt <- ggplot(totws_df, aes(x = k, y = totw)) + 
  geom_point() + ggtheme +
  labs(x = 'k', y = 'Total within-cluster sum of squares', 
       title = 'Elbow plot for determining # of clusters')

# display it, like the other static plots in this script
print(elbow_plt)

# using labels w/ 6 clusters 
# cluster assignments from a 6-center k-means fit, stored as a factor so
# they map onto the discrete Dark2 palette
clus_labs <- as.factor(kmeans(norm_prots, centers = 6)$cluster)

# plotting spots in physical space, colored by cluster
clus_in_space <- ggplot(pos, aes(x = x, y = y, color = clus_labs)) +
  geom_point(size = 1) +
  labs(title = 'Spots colored by assigned cluster',
       x = 'x position',
       y = 'y position',
       color = 'Cluster label') +
  scale_color_brewer(palette="Dark2") +
  ggtheme

print(clus_in_space)

#### PCA ####

# principal components of the normalized proteins; pcs$x holds the rotated
# data (one row per row of norm_prots, one column per PC)
pcs <- prcomp(norm_prots)
pcs_df <- as.data.frame(pcs$x)

# first two PCs, colored by the k-means cluster labels
pcs_plt <- ggplot(pcs_df, aes(x = PC1, y = PC2, color = clus_labs)) +
  geom_point() +
  labs(color = 'Cluster label',
       title = 'Top Two PCs, colored by cluster labels') +
  scale_color_brewer(palette="Dark2") +
  ggtheme

print(pcs_plt)

#### tSNE ####

# t-SNE on the normalized proteins; emb$Y is the embedding matrix
emb <- Rtsne(norm_prots)

# keep the two embedding columns under descriptive names
tsne_df <- setNames(as.data.frame(emb$Y[, 1:2]), c('tSNE1', 'tSNE2'))

# embedding colored by the k-means cluster labels
tsne_plt <- ggplot(tsne_df, aes(x = tSNE1, y = tSNE2, color = clus_labs)) +
  geom_point() +
  labs(color = 'Cluster label',
       title = 't-SNE plot, colored by cluster labels') +
  scale_color_brewer(palette="Dark2") +
  ggtheme

print(tsne_plt)

#### gganimate ####

# Bring one coordinate table into the shared layout used for animation:
# the two columns named in `axes` are renamed to comp1/comp2 (in that order),
# and a numeric `state` id plus the cluster labels are appended as columns.
to_anim_state <- function(coords, axes, state_id) {
  names(coords)[match(axes, names(coords))] <- c('comp1', 'comp2')
  coords$state <- state_id
  coords$clus_labs <- clus_labs
  coords
}

# state 0: physical x/y positions
pos <- to_anim_state(pos, c('x', 'y'), 0)

# state 1: first two principal components
pcs_df_abr <- to_anim_state(pcs_df[, 1:2], c('PC1', 'PC2'), 1)

# state 2: t-SNE embedding
tsne_df <- to_anim_state(tsne_df, c('tSNE1', 'tSNE2'), 2)

# Scatter of comp1 vs comp2 colored by cluster label; the three animations
# below all start from this same static plot and differ only in which two
# states are stacked into `states_df`.
plot_states <- function(states_df) {
  ggplot(states_df, aes(x = comp1, y = comp2, color = clus_labs)) +
    geom_point() +
    ggtheme +
    scale_color_brewer(palette="Dark2")
}

# plt1: PC space -> tSNE space
trans_df <- rbind(pcs_df_abr, tsne_df)
trans_plt <- plot_states(trans_df)
move_plt <- trans_plt +
  transition_states(state) +
  view_follow()

# plt2: physical space -> PC space, with a longer transition than pause
trans_df2 <- rbind(pos, pcs_df_abr)
trans_plt2 <- plot_states(trans_df2)
move_plt2 <- trans_plt2 +
  transition_states(state, transition_length = 15, state_length = 5) +
  view_follow()

# plt3: physical space -> tSNE space
trans_df3 <- rbind(pos, tsne_df)
trans_plt3 <- plot_states(trans_df3)
move_plt3 <- trans_plt3 +
  transition_states(state) +
  view_follow()

# only the physical -> tSNE animation is written to disk
anim_save('~/Desktop/GDV/sabahatrahman.gif', move_plt3)

07 Mar 2025

HW EC1

« Eevee Dataset Deconvolution and Cell-type Analysis