Identifying the same cluster of cells within the Eevee dataset

Hi this is Ataes.

Identifying the same cluster of cells within the Eevee dataset

1. Write a description explaining why you believe your data visualization is effective using vocabulary terms from Lesson 1.

Applying the same analytical approach from HW3, I identified a corresponding cell type in the HW4 dataset by leveraging dimensionality reduction, clustering, and spatial gene expression analysis. In HW3, I characterized a distinct olive-colored cluster, which exhibited a unique transcriptional profile and spatial organization, strongly suggesting that it represented luminal epithelial cells engaged in growth factor signaling. This conclusion was based on the co-expression of ERBB2 and EGFR, two hallmark epithelial markers, along with CDH1 and FOXA1, further supporting its identity as an epithelial population involved in tissue maintenance or regeneration.

To confirm the presence of this same cell type in HW4, I re-applied k-means clustering, PCA, and t-SNE to the new dataset. The PCA (b) and t-SNE (c) plots in HW4 revealed a cluster with a similar transcriptional identity and structure to the olive cluster from HW3, suggesting a preserved biological population across datasets. Additionally, the physical space visualization (d) in HW4 indicates that this cluster maintains an organized spatial distribution, reinforcing the idea that it is a biologically meaningful group rather than a computational artifact.

Gene expression analysis further validates this finding. The PCA and t-SNE expression plots (e, f) in HW4 show that ERBB2 and EGFR are highly expressed in the same distinct cluster, mirroring their localization in HW3. Moreover, spatial expression plots (g, h) confirm that this population is non-randomly distributed in tissue space, further supporting the hypothesis that it represents an epithelial cell type engaged in active signaling.

Finally, the differential gene expression analysis in HW4 (g) provides strong statistical support for this cluster’s unique transcriptional identity. The volcano plot highlights significantly upregulated genes in the cluster, which may align with the transcriptional signature observed in HW3.

2. Code (paste your code in between the ``` symbols)

# Load Required Libraries
library(Rtsne)
library(ggplot2)
library(patchwork)
library(cluster)

# Load Data
eevee_file <- "~/code/genomic-data-visualization-2025/data/eevee.csv.gz"
data <- read.csv(eevee_file)

# Extract Position & Gene Expression Data
pos <- data[, 3:4]
colnames(pos) <- c("X1", "X2")  # Ensure correct column names
rownames(pos) <- data$X

gexp <- data[, 5:ncol(data)]  # Extract gene expression
rownames(gexp) <- data$X  # Align gene expression with sample names

# Normalize Gene Expression
norm <- gexp / rowSums(gexp) * 10000
topgenes <- names(sort(colSums(norm), decreasing = TRUE)[1:1000])
normsub <- norm[, topgenes]

# Log Transformation
gexp_log <- log10(gexp + 1)

# K-Means Clustering
set.seed(123)
clusters <- kmeans(gexp_log, centers = 5)$cluster
clusters <- as.factor(clusters)
names(clusters) <- rownames(gexp)

# PCA
pcs <- prcomp(gexp_log)
df_pca <- data.frame(pcs$x, clusters)

# t-SNE using top 10 PCs
set.seed(123)
tsne_emb <- Rtsne(pcs$x[, 1:10], dims = 2, perplexity = 50)$Y
df_tsne <- data.frame(tsne_emb, clusters)
colnames(df_tsne) <- c("X1", "X2", "clusters")

# Scree Plot (Variance Explained)
variance_explained <- pcs$sdev^2
total_variance <- sum(variance_explained)
proportion_variance_explained <- variance_explained / total_variance
pc_data <- data.frame(PC_Number = 1:length(proportion_variance_explained),
                      ProportionVariance = proportion_variance_explained)

p_scree <- ggplot(pc_data, aes(x = PC_Number, y = ProportionVariance)) +
  geom_line() +
  geom_point() +
  labs(title = "Scree Plot", x = "Principal Component", y = "Proportion of Variance Explained") +
  theme_classic()

# PCA - Cluster Visualization
p_pca_clusters <- ggplot(df_pca, aes(x = PC1, y = PC2, col = clusters)) +
  geom_point(alpha = 0.7) +
  labs(title = 'PCA: Clusters in Reduced Dimensional Space', x = "PC1", y = "PC2") +
  theme_classic()

# t-SNE - Cluster Visualization
p_tsne_clusters <- ggplot(df_tsne, aes(x = X1, y = X2, col = clusters)) +
  geom_point(alpha = 0.7) +
  labs(title = 't-SNE: Clusters in 2D Space', x = "t-SNE 1", y = "t-SNE 2") +
  theme_classic()

# Physical Space - Cluster Visualization
p_physical_space <- ggplot(data.frame(pos, clusters)) +
  geom_point(aes(x = X1, y = X2, col = clusters)) +
  labs(title = 'Clusters in Physical Space', x = "X", y = "Y") +
  theme_classic()

# Differential Expression for Specific Genes (Ensure Gene Exists)
gene_interest1 <- 'ERBB2'
if (gene_interest1 %in% colnames(gexp)) {
  df_pca$gene1 <- gexp[, gene_interest1]
  
  p_gene_pca <- ggplot(df_pca, aes(x = PC1, y = PC2, col = gene1)) +
    geom_point(alpha = 0.7) +
    labs(title = paste('PCA: Expression of', gene_interest1), x = "PC1", y = "PC2") +
    scale_color_gradient(low = "cyan", high = "purple") +
    theme_classic()
} else {
  print(paste("Warning: Gene", gene_interest1, "not found in dataset."))
  p_gene_pca <- NULL
}

# t-SNE Expression Visualization for EGFR (Ensure Gene Exists)
gene_interest2 <- 'EGFR'
if (gene_interest2 %in% colnames(gexp_log)) {
  df_tsne$gene2 <- gexp_log[, gene_interest2]
  
  p_gene_tsne1 <- ggplot(df_tsne, aes(x = X1, y = X2, col = gene2)) +
    geom_point(alpha = 0.7) +
    labs(title = 't-SNE: Expression of EGFR', x = "t-SNE 1", y = "t-SNE 2") +
    scale_color_gradient(low = "cyan", high = "purple") +
    theme_classic()
} else {
  print(paste("Warning: Gene", gene_interest2, "not found in dataset."))
  p_gene_tsne1 <- NULL
}

# Create Volcano Plot Data
df_volc <- data.frame(pvalues = -log10(filtered_pvalues), log_fc = filtered_logfc)
df_volc$genes <- rownames(df_volc)

# Define thresholds
upper_logfc <- 2
lower_logfc <- -3

df_volc$color <- ifelse(df_volc$log_fc > upper_logfc, "Upregulated",
                        ifelse(df_volc$log_fc < lower_logfc, "Downregulated", "Other"))

# Volcano Plot
p_volcano <- ggplot(df_volc, aes(x = log_fc, y = pvalues, color = color)) + 
  geom_point(size = 0.75) + 
  ggtitle("Differentially Expressed Genes in Cluster 5") + 
  scale_color_manual(values = c("Upregulated" = "cyan", 
                                "Downregulated" = "purple", 
                                "Other" = "lightgray")) +
  labs(color = "DE Gene") +
  xlab("log2(FC)") +
  ylab("-log10(p-value)")


# Combine Plots
combined_plot <- p_scree + p_pca_clusters + p_tsne_clusters + p_physical_space +
  p_gene_pca + p_gene_tsne1 + p_volcano + plot_annotation(tag_levels = 'a') + 
  plot_layout(ncol = 3)

print(combined_plot)

# Identify Top Genes
olive_cluster_cells <- rownames(df_pca[df_pca$clusters == "1", ])
olive_cluster_expression <- colMeans(gexp_log[olive_cluster_cells, ])
other_clusters_expression <- colMeans(gexp_log[-which(rownames(gexp_log) %in% olive_cluster_cells), ])
log2fc <- log2(olive_cluster_expression + 1) - log2(other_clusters_expression + 1)
top_olive_genes <- sort(log2fc, decreasing = TRUE)[1:20]  # Top 20 most overexpressed genes
print("Top Overexpressed Genes in Olive Cluster:")
print(top_olive_genes)

# Identify Most Variable Genes 
var_olive <- apply(gexp_log[olive_cluster_cells, ], 2, var)
other_cells <- setdiff(rownames(gexp_log), olive_cluster_cells)
var_other <- apply(gexp_log[other_cells, ], 2, var)
var_ratio <- var_olive / (var_other + 1e-6)  # Avoid division by zero
top_variable_genes <- names(sort(var_ratio, decreasing = TRUE))[1:20]  # Top 20 most variable genes
print("Top Most Variable Genes in Olive Cluster:")
print(top_variable_genes)

19 Feb 2025

« Identifying Epithelial cells in both datasets Hw4: Finding the same cell cluster in the other dataset »