Comparing the effect of tSNE on varying number of PCs:KRT7 expression (using gganimate)


Archana Balan
I ama a first year Biomedical Engineering PhD Student at Johns Hopkins University. My research interests include developing computational methods for cancer immunogenomics

Comparing the effect of tSNE on varying number of PCs:KRT7 expression (using gganimate)

If I perform non-linear dimensionality reduction on PCs, what happens when I vary how many PCs should I use?

I am visualizing the effect of performing tSNE on varying number of PCs. Similar to my homework 3 visualization I have visulaized the KRT7 gene expression in the reduced dimensionality space. The gene expression was normalized (by total gene expression for each cell) prior to PCA. t-SNE was then performed by using all PCs, first 30 PCs only, first 10 PCs only, first 4 PCs only and first 2PCs only. Using gganimate I have shown the transition of performing tSNE starting with all PCs and then reduced number of PCs. The animation helps emphasize the difference in the embeddings as we reduce the number of PCs.

library(dplyr)
library(tidyverse)
library(ggplot2)
library(Rtsne)
library(patchwork)
library(viridis)
library(gganimate)


## Data Description
# Each row is a cell ~17000
# Columns are mainly genes ~300

# Set working directory path and read in data
setwd("/Users/archanabalan/Karchin Lab Dropbox/Archana Balan/Coursework/Spring2024/GDV/homework/")
data = read.csv("./data/pikachu.csv.gz", row.names =1)

# Drop first 5 columns for gene expression (cell_id, cell_area etc)
gexp = data[, -c(1:5)]
rownames(gexp) <- data$cell_id

# normalize data by total gene expression for each cell
gexpnorm = gexp/rowSums(gexp)
# check if normalization is correct (value would be 1 for all cells)
unique(rowSums(gexpnorm) )


# PCA using prcomp function
# Center = TRUE and scale = FALSE params
pcs = prcomp(gexpnorm, center = TRUE, scale. = FALSE)

# tSNE on first 2 PCs
tsne_2 = Rtsne(pcs$x[, 1:2], dims = 2, pca = FALSE)

# tSNE on first 4 PCs
tsne_4 = Rtsne(pcs$x[, 1:4], dims = 2, pca = FALSE)

# tSNE on first 10 PCs
tsne_10 = Rtsne(pcs$x[, 1:10], dims = 2, pca = FALSE)

# tSNE on first 30 PCs
tsne_30 = Rtsne(pcs$x[, 1:30], dims = 2, pca = FALSE)

# tSNE on all PCs
tsne_all = Rtsne(pcs$x, dims = 2, pca = FALSE)


plot.df <- rbind(data.frame(tsne_x = tsne_2$Y[,1],
                 tsne_y = tsne_2$Y[,2], 
                 KRT7 = gexpnorm$KRT7, 
                 order = "TSNE on first 2 PCS"),
            data.frame(tsne_x = tsne_4$Y[,1],
                       tsne_y = tsne_4$Y[,2], 
                       KRT7 = gexpnorm$KRT7, 
                       order = "TSNE on first 4 PCS"), 
            data.frame(tsne_x = tsne_10$Y[,1],
                       tsne_y = tsne_10$Y[,2], 
                       KRT7 = gexpnorm$KRT7, 
                       order = "TSNE on first 10 PCS"),
            data.frame(tsne_x = tsne_30$Y[,1],
                       tsne_y = tsne_30$Y[,2], 
                       KRT7 = gexpnorm$KRT7, 
                       order = "TSNE on first 30 PCS"),
            data.frame(tsne_x = tsne_all$Y[,1],
                       tsne_y = tsne_all$Y[,2], 
                       KRT7 = gexpnorm$KRT7, 
                       order = "TSNE on all PCS")) %>% 
            mutate(order = factor(order, levels = rev(unique(order))))


# Ref: https://stackoverflow.com/questions/37397303/change-label-of-gganimate-frame-title

p1 <- ggplot(data = plot.df, aes (x=tsne_x, y = tsne_y, col = KRT7) ) +
  geom_point(size =0.9) +
  theme_classic() + 
  scale_color_viridis(option="plasma") +
  labs(x = "X", y = "Y") 

anim <- p1 + 
  transition_states(order) +
  view_follow() +
  labs(title = '{closest_state}') 

# Ref: https://stackoverflow.com/questions/52244347/control-speed-of-a-gganimation

animate(anim, height = 400, width = 400, fps = 5)

anim_save("hwEC1_abalan2.gif", animation = last_animation(), path = "./")