Analyzing MMP11 Gene Expression

I'm a junior at Johns Hopkins.

Visualization Summary: In this visualization, I analyze the Eevee sequencing spatial transcriptomics dataset. The 1000 most highly expressed genes were normalized, log-transformed, and clustered with k-means (K = 10). To understand gene expression similarity profiles and reduce the dimensionality of the dataset, principal component analysis was performed (A). Cluster 4 was chosen and its spatial distribution was plotted (B). Next, differential gene expression analysis was performed to assess the statistical significance of changes in gene expression between cluster 4 and all other clusters, which highlights genes with large expression differences (C). The top 20 differentially expressed genes for cluster 4 were then displayed, and MMP11 was chosen as the gene of interest due to its expression level (D). t-distributed Stochastic Neighbor Embedding (t-SNE) was then performed on the normalized expression data, with each cell colored by its MMP11 expression to better visualize that expression across cells (E), and the spatial distribution of MMP11 expression was also plotted (F).

Cluster 4 appears biologically meaningful because many of its differentially expressed genes are related to cell growth, cell migration, and cancer progression. Specifically, MMP11 has previously been associated with breast cancer cell migration [1]. IGFBP7 has also been shown to have both pro- and anti-angiogenic properties in different tissue-remodeling states, which can influence tumor growth [2]. As such, I interpret this cluster as cancer-promoting or fully cancerous cells. However, I would need to compare it against the content of the other clusters to confirm this interpretation.

References: [1] Zhuang Y, Li X, Zhan P, Pi G, Wen G. MMP11 promotes the proliferation and progression of breast cancer through stabilizing Smad2 protein. Oncol Rep. 2021 Apr;45(4):16. doi: 10.3892/or.2021.7967. Epub 2021 Mar 2. PMID: 33649832; PMCID: PMC7876999. [2] Lit KK, Zhirenova Z, Blocki A. Insulin-like growth factor-binding protein 7 (IGFBP7): A microenvironment-dependent regulator of angiogenesis and vascular remodeling. Front Cell Dev Biol. 2024 Jul 9;12:1421438. doi: 10.3389/fcell.2024.1421438. PMID: 39045455; PMCID: PMC11263173.

Code (paste your code in between the ``` symbols)

library(ggplot2)
library(dplyr)
library(tidyr)
library(RColorBrewer)
library(patchwork)
library(Rtsne)

# Read the Eevee table from disk and show its first 5 rows and 10 columns as a
# quick sanity check of the layout.
file <- "eevee.csv.gz"
data <- read.csv(file = file)
data[1:5, 1:10]

## K Means Clustering + Principal Component Analysis
# Split the table into columns 5-6 (`pos`; presumably the spatial coordinates,
# TODO confirm against the file header) and columns 7 onward (`gexp`; treated
# as per-gene expression counts).
# NOTE(review): `pos` is labelled with `data$cell_id` while `gexp` is labelled
# with `data$barcode`. Confirm which identifier column this dataset actually
# has -- a column that does not exist evaluates to NULL, which silently resets
# the row names to 1..n instead of raising an error.
pos <- data[, 5:6]
rownames(pos) <- data$cell_id
gexp <- data[, 7:ncol(data)]
rownames(gexp) <- data$barcode

# Keep the (up to) 1000 genes with the largest column totals. head() caps the
# selection at the number of genes available; indexing with [1:1000] would
# instead produce NA names and an "undefined columns" error whenever fewer
# than 1000 gene columns are present.
topgenes <- names(head(sort(colSums(gexp), decreasing = TRUE), 1000))
gene_data <- gexp[, topgenes]

# Rescale every row to a total of 10,000 across the selected genes, then
# log10-transform with a pseudocount of 1.
gene_norm <- log10(gene_data * 10000 / rowSums(gene_data) + 1)

# K Means Clustering
# kmeans() chooses its starting centers at random, so both the partition and
# the numeric cluster labels can change from one run to the next. Every later
# step refers to cluster 4 by number, so the seed is fixed here to keep that
# label stable across runs.
# NOTE(review): after seeding, re-check that cluster 4 is still the cluster
# described in the write-up; the label assigned to a given group of cells
# depends on the seed.
set.seed(42)
com <- kmeans(gene_norm, centers = 10)

# Store the assignments as a factor named by the rows of `gene_norm`.
clusters <- com$cluster
clusters <- as.factor(clusters)
names(clusters) <- rownames(gene_norm)
head(clusters)

# Principal component analysis of the normalized expression table, with every
# gene rescaled to unit variance before decomposition.
pca_result <- prcomp(gene_norm, scale. = TRUE)
summary(pca_result)

# Panel A: PC1 vs PC2 scores, coloured by k-means cluster.
df <- data.frame(pca_result$x, clusters = clusters)
p1 <- ggplot(
  data = df,
  mapping = aes(x = PC1, y = PC2, colour = clusters)
) +
  geom_point() +
  labs(title = "PCA of Eevee Cell Clusters")

## Spatial Analysis of Cluster 4

# One row per entry of `gene_norm`: its identifier, its k-means label, and the
# aligned x/y coordinates taken from the raw table.
spatial_df <- data.frame(
  cell_id = rownames(gene_norm),
  cluster = clusters,
  aligned_x = data$aligned_x,
  aligned_y = data$aligned_y
)

# Rows that belong to cluster 4 only.
cluster4_spatial <- spatial_df[spatial_df[["cluster"]] == 4, ]

# Panel B: every position in grey, with cluster 4 highlighted in red. The
# colour aesthetic is the logical `cluster == 4`, so the two manual values and
# labels map to FALSE and TRUE in that order.
p2 <- ggplot(
  data = spatial_df,
  mapping = aes(x = aligned_x, y = aligned_y, colour = cluster == 4)
) +
  geom_point(alpha = 0.6) +
  scale_colour_manual(
    values = c("gray", "red"),
    labels = c("Other Clusters", "Cluster 4")
  ) +
  labs(
    title = "Spatial Distribution of Cells",
    x = "Aligned X",
    y = "Aligned Y"
  ) +
  theme_minimal() +
  theme(legend.title = element_blank())

## Differential Gene Expression for Cluster 4

# Numeric indicator aligned with `clusters`: 1 where the label is 4, 0 for
# every other label.
cluster4_vs_others <- as.numeric(clusters == 4)

# Two-sample t-test for a single gene: cluster 4 versus all other rows.
#
# Reads the globals `gene_norm` (expression table, one column per gene) and
# `cluster4_vs_others` (1/0 indicator per row).
#
# gene: name of a column of `gene_norm`.
# Returns a named numeric vector of length 2 holding the p-value and the test
# statistic; the element names are "p_value" and "t_statistic.t" (the ".t"
# suffix comes from the name t.test() gives its statistic).
perform_t_test <- function(gene) {
  in_cluster <- gene_norm[cluster4_vs_others == 1, gene]
  out_cluster <- gene_norm[cluster4_vs_others == 0, gene]
  fit <- t.test(in_cluster, out_cluster)
  c(p_value = fit$p.value, t_statistic = fit$statistic)
}

# Perform t-test for all genes.
# vapply() pins every per-gene result to exactly two numbers, so the output is
# always a 2 x n_genes matrix; sapply() would silently change its return type
# for unusual input. Transposing gives one row per gene with the columns
# "p_value" and "t_statistic.t".
t_test_results <- t(vapply(colnames(gene_norm), perform_t_test, numeric(2)))

# Calculate log fold change: mean normalized (log10-scale) expression in
# cluster 4 minus the mean over all other rows. The two row subsets are taken
# once up front rather than being re-extracted for every gene.
expr_cluster4 <- gene_norm[cluster4_vs_others == 1, , drop = FALSE]
expr_others <- gene_norm[cluster4_vs_others == 0, , drop = FALSE]
log_fc <- vapply(
  colnames(gene_norm),
  function(gene) mean(expr_cluster4[[gene]]) - mean(expr_others[[gene]]),
  numeric(1)
)

# Combine results into one row per gene.
results <- data.frame(
  gene = colnames(gene_norm),
  log_fc = log_fc,
  p_value = t_test_results[, "p_value"],
  t_statistic = t_test_results[, "t_statistic.t"]
)

# Adjust p-values for multiple testing (Benjamini-Hochberg).
results$adj_p_value <- p.adjust(results$p_value, method = "BH")

# Sort by adjusted p-value, most significant first.
results <- results[order(results$adj_p_value), ]

# Flag genes below the 0.05 adjusted p-value threshold and keep the 20 most
# significant rows for plotting.
results$significant <- ifelse(results$adj_p_value < 0.05, "Yes", "No")
top_genes <- head(results, 20)

# Panel D: horizontal bars of log fold change for the rows of `top_genes`,
# ordered by log fold change and filled by significance flag.
p4 <- ggplot(
  data = top_genes,
  mapping = aes(x = reorder(gene, log_fc), y = log_fc, fill = significant)
) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(values = c("No" = "grey", "Yes" = "red")) +
  labs(
    title = "Cluster 4: Top 20 Differentially Expressed Genes",
    x = "Gene",
    y = "Log Fold Change"
  ) +
  theme_minimal()

# Panel C: volcano plot of every tested gene -- log fold change against
# -log10 of the adjusted p-value, coloured by significance flag.
p3 <- ggplot(
  data = results,
  mapping = aes(x = log_fc, y = -log10(adj_p_value), colour = significant)
) +
  geom_point(alpha = 0.6) +
  scale_colour_manual(values = c("No" = "grey", "Yes" = "red")) +
  labs(
    title = "Cluster 4: Volcano Plot of Differential Expression",
    x = "Log Fold Change",
    y = "-log10(Adjusted P-value)"
  ) +
  theme_minimal()

# Report the first ten rows of `results` in each direction: positive log fold
# change (higher in cluster 4) and negative log fold change (lower in
# cluster 4). Rows keep whatever order `results` currently has.
cat("Top 10 upregulated genes in cluster 4:\n")
print(head(results[results[["log_fc"]] > 0, ], n = 10))

cat("\nTop 10 downregulated genes in cluster 4:\n")
print(head(results[results[["log_fc"]] < 0, ], n = 10))

## Visualize MMP11 using tSNE

# Normalized MMP11 expression, one value per row of `gene_norm`.
mmp11_expression <- gene_norm[, "MMP11"]

# Two-dimensional t-SNE embedding of the full normalized expression table. The
# seed is set immediately before the call so the layout is repeatable.
set.seed(42)
tsne_result <- Rtsne(
  gene_norm,
  dims = 2,
  perplexity = 30,
  verbose = TRUE,
  max_iter = 1000
)

# Pair the two embedding coordinates with the MMP11 values.
tsne_df <- data.frame(
  tSNE1 = tsne_result$Y[, 1],
  tSNE2 = tsne_result$Y[, 2],
  MMP11 = mmp11_expression
)

# Panel E: embedding coloured on a grey-to-red gradient by MMP11 expression.
p5 <- ggplot(
  data = tsne_df,
  mapping = aes(x = tSNE1, y = tSNE2, colour = MMP11)
) +
  geom_point(size = 2) +
  scale_colour_gradient(low = "grey", high = "red") +
  labs(
    title = "t-SNE Visualization of MMP11 Expression",
    x = "t-SNE1",
    y = "t-SNE2",
    color = "MMP11 Expression"
  ) +
  theme_minimal()


## MMP11
# Aligned x/y coordinates from the raw table alongside normalized MMP11
# expression for the same rows.
spatial_plot_df <- data.frame(
  aligned_x = data$aligned_x,
  aligned_y = data$aligned_y,
  MMP11 = gene_norm[, "MMP11"]
)

# Panel F: MMP11 expression in physical space, on the same grey-to-red
# gradient as the t-SNE panel.
p6 <- ggplot(
  data = spatial_plot_df,
  mapping = aes(x = aligned_x, y = aligned_y, colour = MMP11)
) +
  geom_point(size = 2) +
  scale_colour_gradient(low = "grey", high = "red") +
  labs(
    title = "Spatial Distribution of MMP11 Expression",
    x = "Aligned X",
    y = "Aligned Y",
    color = "MMP11 Expression"
  ) +
  theme_minimal()

## Make combined plot
# Compose the six panels into a grid three columns wide and tag them
# A, B, C, ... in the order they are added.
combined_plot <- p1 + p2 + p3 + p4 + p5 + p6 +
  plot_layout(ncol = 3) +
  plot_annotation(tag_levels = "A")

print(combined_plot)

# Write the composed figure to disk as an 18 x 6 inch PNG at 300 dpi.
ggsave(
  filename = "hw3_sraghav9.png",
  plot = combined_plot,
  width = 18,
  height = 6,
  units = "in",
  dpi = 300
)

12 Feb 2025

« Interrogating Spatial Spot Cluster Differential Gene Expression with 10x Visium Visualization of potential B cell populations in the Eevee sequencing data »