Hw4: Finding the same cell cluster in the other dataset

Hi, my name is Anishka and I am a BME junior. I am very excited for this class!

Hw4: Finding the same cell cluster in the other dataset

This panel shows that the cell cluster I found in the EEVEE dataset is the same one I found in the Pikachu dataset for the previous homework. The first graph in the panel shows all the spots in the EEVEE dataset in PCA space. I chose to look more closely at cluster 2 because its spatial distribution in the EEVEE dataset was very similar to the spatial distribution of cluster 3 in the Pikachu dataset. This similarity can be seen in graphs 2 and 5 of the panel. Since the EEVEE dataset comes from sequencing data, it’s not at single-molecule resolution, so its spots are less finely resolved than the cells in the Pikachu data. Next, I compared gene expression in the two datasets for their respective clusters using volcano plots. The plots came out very similar, with the same three genes being significantly differentially expressed between the cells of interest and all other cells. Based on these observations, I concluded that the cells in cluster 2 of the EEVEE dataset are of the same type as the cells in cluster 3 of the Pikachu dataset.

5. Code (paste your code in between the ``` symbols)

## EEVEE
# Read the EEVEE table and preview its top-left corner.
file <- '/Users/anishkabhartiya/Desktop/genomic-data-visualization-2025/data/eevee.csv.gz'
data <- read.csv(file)
data[1:5, 1:10]

# Columns 3-4 are used as the spot positions; rows are keyed by barcode.
pos <- data[, c(3, 4)]
rownames(pos) <- data$barcode
head(pos)

# Every column from the 5th onwards is used as the expression matrix.
gexp <- data[, seq.int(5, ncol(data))]
rownames(gexp) <- data$barcode
gexp[1:5, 1:5]
dim(gexp)

# Keep the 2000 genes with the largest total counts across all spots.
gene_totals <- colSums(gexp)
ranked_totals <- sort(gene_totals, decreasing = TRUE)
topgenes <- names(ranked_totals[1:2000])
gexpsub <- gexp[, topgenes]
gexpsub[1:5, 1:5]
dim(gexpsub)

# Scale each spot to 10,000 total counts (over the kept genes), preview,
# then apply a log10(x + 1) transform.
spot_totals <- rowSums(gexpsub)
norm <- gexpsub / spot_totals * 10000
norm[1:5, 1:5]
norm <- log10(norm + 1)

# PCA on the normalized EEVEE expression; the seed is fixed before k-means
# so the cluster numbering is reproducible.
pcs <- prcomp(norm)
set.seed(1)

# One row per spot: its first two PC scores plus the position columns
# (columns 3 and 4 of the raw table).
pca_df <- data.frame(
  PC1 = pcs$x[, "PC1"],
  PC2 = pcs$x[, "PC2"],
  X = data[[3]],
  Y = data[[4]]
)

# k-means (k = 7) on the PC1/PC2 coordinates only.
km_fit <- kmeans(pca_df[, c("PC1", "PC2")], centers = 7)
clusters <- km_fit$cluster
pca_df$Cluster <- as.factor(clusters)

# Cluster 2 is the cluster of interest; everything else is the background.
in_cluster <- pca_df$Cluster == "2"
pca_df_filtered <- pca_df[in_cluster, ]
cellsOfInterest <- rownames(pca_df)[in_cluster]
otherCells <- rownames(pca_df)[!in_cluster]

# Differential expression: cluster of interest vs. all other spots (EEVEE).
# Convert once to a matrix so per-gene columns keep the barcode names used
# for indexing below.
norm_mat <- as.matrix(norm)

# Welch two-sample t-test per gene (column).
p_values <- apply(norm_mat, 2, function(gene) {
  t.test(gene[cellsOfInterest], gene[otherCells])$p.value
})

# norm is log10-transformed, so a difference of means is a log10 fold change.
# drop = FALSE keeps a matrix even if a group contains a single spot.
logFC <- colMeans(norm_mat[cellsOfInterest, , drop = FALSE]) -
  colMeans(norm_mat[otherCells, , drop = FALSE])

# Benjamini-Hochberg correction: one test is run per gene.
adjusted_p_values <- p.adjust(p_values, method = "fdr")

volcano_df <- data.frame(
  Gene = colnames(norm),
  LogFC = logFC,
  p_value = p_values,
  adj_p_value = adjusted_p_values,
  neg_log10_p = -log10(adjusted_p_values)
)

# Call significance on the FDR-adjusted p-value (the quantity shown on the
# volcano plot's y-axis), not the raw p-value, together with an effect-size
# cut-off of |log10 fold change| > 1.
volcano_df$Significant <- ifelse(
  volcano_df$adj_p_value < 0.05 & abs(volcano_df$LogFC) > 1,
  "Significant",
  "Not Significant"
)


library(ggplot2)
library(patchwork)

# g1: every EEVEE spot in PC1/PC2 space, coloured by k-means cluster.
g1 <- ggplot(pca_df, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(alpha = 0.6, size = 0.01) +
  labs(title = "All Cells PCA (EEVEE)",
       x = "PC1",
       y = "PC2") +
  theme_minimal()

# g2: only the cluster-2 spots in PC1/PC2 space (fixed colour, no legend).
g2 <- ggplot(pca_df_filtered, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(alpha = 0.6, size = 0.1, color = 'gold3') +
  labs(title = "Cluster 2 PCA (EEVEE)",
       x = "PC1",
       y = "PC2") +
  theme_minimal() +
  theme(legend.position = "none")

# g3: only the cluster-2 spots at their X/Y positions.
g3 <- ggplot(pca_df_filtered, aes(x = X, y = Y, color = Cluster)) +
  geom_point(alpha = 0.6, size = 0.1, color = 'gold3') +
  labs(title = "Cluster 2 Spatial (EEVEE)",
       x = "X Position",
       y = "Y Position") +
  theme_minimal() +
  theme(legend.position = "none")

# g4: volcano plot for cluster 2 vs. all other spots. Genes flagged
# "Significant" with -log10(adjusted p) > 2 are labelled; the label size
# matches the Pikachu volcano plot (size = 3) so the names are legible.
g4 <- ggplot(volcano_df, aes(x = LogFC, y = neg_log10_p, color = Significant)) +
  geom_point(alpha = 0.6) +
  scale_color_manual(values = c("Significant" = "red", "Not Significant" = "gray")) +
  geom_text(data = subset(volcano_df, Significant == "Significant" & neg_log10_p > 2),
            aes(label = Gene), vjust = 1, hjust = 1, size = 3) +
  theme_minimal() +
  labs(title = "Gexp Cluster 2 (EEVEE)",
       x = "Log Fold Change (Cluster 2 vs Others)",
       y = "-log10 Adjusted P-Value") +
  theme(plot.title = element_text(hjust = 0.5))

# List up to 20 significant genes, ordered by increasing raw p-value.
# Genes and p-values are taken from the same row subset so they stay aligned.
significant_rows <- volcano_df[volcano_df$Significant == "Significant", ]
significant_genes <- significant_rows$Gene

top_significant_genes <- head(significant_genes[order(significant_rows$p_value)], 20)
print(top_significant_genes)

##PIKACHU

# Read the Pikachu table and preview its top-left corner.
file <- '/Users/anishkabhartiya/Desktop/genomic-data-visualization-2025/data/pikachu.csv.gz'
data <- read.csv(file)
data[1:5, 1:10]

# Columns 3-4 are stored as `area` (not used again in the visible script);
# columns 5-6 are used as the cell positions, keyed by barcode.
area <- data[, c(3, 4)]
pos <- data[, c(5, 6)]
rownames(pos) <- data$barcode
head(pos)

# Every column from the 7th onwards is used as the expression matrix.
gexp <- data[, seq.int(7, ncol(data))]
rownames(gexp) <- data$barcode
gexp[1:5, 1:5]
dim(gexp)

# Normalize each cell's counts by log10(cell_area + 1), scaled by 3.
area_scale <- log10(data$cell_area + 1)
norm <- gexp / area_scale * 3
norm[1:5, 1:5]

# PCA on the normalized Pikachu expression; the seed is fixed before k-means
# so the cluster numbering is reproducible.
pcs <- prcomp(norm)
set.seed(1)

# One row per cell: its first two PC scores plus the position columns
# (columns 5 and 6 of the raw table).
pca_df <- data.frame(
  PC1 = pcs$x[, "PC1"],
  PC2 = pcs$x[, "PC2"],
  X = data[[5]],
  Y = data[[6]]
)

# k-means (k = 7) on the PC1/PC2 coordinates only.
km_fit <- kmeans(pca_df[, c("PC1", "PC2")], centers = 7)
clusters <- km_fit$cluster
pca_df$Cluster <- as.factor(clusters)

# Cluster 3 is the cluster of interest; everything else is the background.
in_cluster <- pca_df$Cluster == "3"
pca_df_filtered <- pca_df[in_cluster, ]
cellsOfInterest <- rownames(pca_df)[in_cluster]
otherCells <- rownames(pca_df)[!in_cluster]

library(ggplot2)

# Differential expression: cluster of interest vs. all other cells (Pikachu).
# Convert once to a matrix so per-gene columns keep the barcode names used
# for indexing below.
norm_mat <- as.matrix(norm)

# Welch two-sample t-test per gene (column).
p_values <- apply(norm_mat, 2, function(gene) {
  t.test(gene[cellsOfInterest], gene[otherCells])$p.value
})

# Difference in mean normalized expression between the two groups.
# NOTE(review): the Pikachu `norm` is area-normalized but not log-transformed
# in the visible script, so this is a difference of means rather than a true
# log fold change - confirm the intended scale.
# drop = FALSE keeps a matrix even if a group contains a single cell.
logFC <- colMeans(norm_mat[cellsOfInterest, , drop = FALSE]) -
  colMeans(norm_mat[otherCells, , drop = FALSE])

# Benjamini-Hochberg correction: one test is run per gene.
adjusted_p_values <- p.adjust(p_values, method = "fdr")

volcano_df <- data.frame(
  Gene = colnames(norm),
  LogFC = logFC,
  p_value = p_values,
  adj_p_value = adjusted_p_values,
  neg_log10_p = -log10(adjusted_p_values)
)

# Call significance on the FDR-adjusted p-value (the quantity shown on the
# volcano plot's y-axis), not the raw p-value, together with an effect-size
# cut-off of |LogFC| > 1.
volcano_df$Significant <- ifelse(
  volcano_df$adj_p_value < 0.05 & abs(volcano_df$LogFC) > 1,
  "Significant",
  "Not Significant"
)

# Genes to label on the volcano plot: flagged "Significant" and with
# -log10(adjusted p) above 2.
labelled_genes <- subset(
  volcano_df,
  Significant == "Significant" & neg_log10_p > 2
)

# g5: volcano plot for cluster 3 vs. all other cells.
g5 <- ggplot(
  volcano_df,
  aes(x = LogFC, y = neg_log10_p, color = Significant)
) +
  geom_point(alpha = 0.6) +
  scale_color_manual(
    values = c("Significant" = "red", "Not Significant" = "gray")
  ) +
  geom_text(
    data = labelled_genes,
    aes(label = Gene),
    vjust = 1, hjust = 1, size = 3
  ) +
  theme_minimal() +
  labs(
    title = "Gexp Cluster 3 (Pikachu)",
    x = "Log Fold Change (Cluster 3 vs Others)",
    y = "-log10 Adjusted P-Value"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# g6: only the cluster-3 cells at their X/Y positions (fixed colour, no legend).
g6 <- ggplot(pca_df_filtered, aes(x = X, y = Y, color = Cluster)) +
  geom_point(alpha = 0.6, size = 0.01, color = '#7CAE00') +
  labs(
    title = "Cluster 3 Spatial (Pikachu)",
    x = "X Position",
    y = "Y Position"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Combine the six plots into one patchwork panel, in this order:
# EEVEE PCA, EEVEE spatial, EEVEE volcano, EEVEE cluster PCA,
# Pikachu spatial, Pikachu volcano.
panel <- g1 + g3 + g4 + g2 + g6 + g5
print(panel)

19 Feb 2025

« Identifying the same cluster of cells within the Eevee dataset Identifying the Same Cluster of Breast Granular Cells in the Pikachu Dataset »