HW 3

Hi! My name is Aarna and I am a junior studying BME. Really excited for the class!

Figure Description and Interpretation

This figure integrates dimensionality reduction, spatial mapping, and differential expression analysis to characterize an Aqp2-positive cell population. In PCA space, cells form distinct clusters, with Cluster 3 separating from other groups and overlapping strongly with Aqp2-positive cells. Highlighting Aqp2-expressing cells shows that they occupy a specific region of PCA space. This indicates a transcriptionally distinct population. In physical tissue space, Aqp2 expression is spatially forming a pattern that closely matches the spatial distribution of Cluster 3. This suggests that this cluster represents a coherent anatomical and functional unit. Differential expression analysis further supports this interpretation. The top marker genes for Cluster 3 include Aqp2, Aqp6, Slc4a1, Slc26a4, Atp6v1c2, and Foxi1, which are well-established markers of renal collecting duct intercalated cells (https://esbl.nhlbi.nih.gov/Databases/MouseWK/WKMarkers.html ). In particular, Aqp2 is a canonical marker of principal cells in the collecting duct, while Slc4a1, Atp6v1c2, and Foxi1 are associated with acid-base regulation in intercalated cells. The co-expression of these transporters and proton pump components indicates that Cluster 3 corresponds to a specialized epithelial population involved in water and ion homeostasis. Previous studies have shown that Aqp2 is specifically expressed in collecting duct principal cells (https://pubmed.ncbi.nlm.nih.gov/7523157/) and that Foxi1 and Atp6v1 subunits define intercalated cells in the kidney (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3012266/). The presence of these markers, combined with the cluster’s distinct position in PCA space and its spatial localization, supports the conclusion that Cluster 3 represents a collecting duct cell population, likely enriched for Aqp2-positive principal cells with associated intercalated cell features. Together, the clustering, spatial distribution, and marker gene expression provide consistent evidence that Cluster 3 corresponds to an Aqp2-positive renal epithelial cell type rather than merely a technical artifact.

Code (paste your code in between the ``` symbols)

data <- read.csv("~/Desktop/Xenium-IRI-ShamR_matrix.csv.gz")
library(ggplot2)
library(dplyr)
library(stats)
library(gridExtra)

pos <- data[, c("x", "y")]
rownames(pos) <- data$X
gene_start_col <- which(colnames(data) == "y") + 1
gexp <- data[, gene_start_col:ncol(data)]
rownames(gexp) <- data$X
loggexp <- log10(gexp + 1)

# Scale 
scaled_gexp <- scale(loggexp)

# PCA
pca <- prcomp(scaled_gexp)
df_pca <- data.frame(pca$x)
df_pca$cell_id <- rownames(gexp)

# K-means clustering
set.seed(42)
kmeans_res <- kmeans(scaled_gexp, centers = 10)
clusters <- as.factor(kmeans_res$cluster)
names(clusters) <- rownames(gexp)
df_pca$cluster <- clusters

#help from AI for this part: from the given dataset and code, get the aqp2 expression and find which cluster expresses most aqp2
# ============================================================================
# FOCUS ON AQP2 (Collecting Duct Principal Cells)
# ============================================================================

# Get Aqp2 expression
aqp2_exp <- gexp[, "Aqp2"]
aqp2_log_exp <- loggexp[, "Aqp2"]

# Find which cluster(s) express Aqp2 most
cluster_aqp2 <- data.frame(
  cluster = 1:10,
  mean_exp = tapply(aqp2_exp, clusters, mean),
  pct_positive = tapply(aqp2_exp > 0, clusters, mean) * 100,
  n_cells = table(clusters))


# Identify the cluster with highest Aqp2 expression
aqp2_cluster <- which.max(cluster_aqp2$mean_exp)
cat(sprintf("\nCluster %d has highest Aqp2 expression\n", aqp2_cluster))

# VISUALIZATION

df_pca$aqp2_exp <- aqp2_log_exp
pos$cluster <- clusters
pos$aqp2_exp <- aqp2_log_exp
aqp2_threshold <- 0.5
df_pca$aqp2_positive <- aqp2_log_exp > aqp2_threshold
pos$aqp2_positive <- aqp2_log_exp > aqp2_threshold

# Plot 1: PCA with clusters
p1 <- ggplot(df_pca, aes(x = PC1, y = PC2, col = cluster)) + 
  geom_point(size = 0.5, alpha = 0.6) + ggtitle("PCA Clustering of Cells") + theme_minimal() + theme(legend.position = "right")


# Plot 2: Aqp2-positive cells in PCA space
p2 <- ggplot(df_pca, aes(x = PC1, y = PC2, col = aqp2_positive)) + geom_point(size = 0.5, alpha = 0.6) + scale_color_manual(values = c("gray80", "red"), labels = c("Aqp2-negative", "Aqp2-positive"),name = "Cell Type") + ggtitle("Aqp2+ Cells in PCA Space") +  theme_minimal()

# Plot 3: aqp2 expression in physical space (continuous)
p3 <- ggplot(pos, aes(x = x, y = y, col = aqp2_exp)) + 
  geom_point(size = 0.3, alpha = 0.3) + 
  scale_color_gradient2(low = "navy", mid = "lightblue", high = "red", midpoint = quantile(aqp2_log_exp, 0.75), name = "log10(Aqp2+1)") +
  ggtitle("Aqp2 Expression in Physical Space") + theme_minimal() + coord_fixed()

plot_data <- data.frame(x = pos$x,y = pos$y, cluster = clusters, is_cluster3 = clusters == 3)

p5 <- ggplot(plot_data, aes(x = x, y = y, col = is_cluster3)) + 
  geom_point(size = 0.3, alpha = 0.3) + 
  scale_color_manual(values = c("gray80", "red"),labels = c("Other clusters", "Cluster 3"), name = "") +
  ggtitle("Cluster 3 in Physical Space") + theme_minimal() + coord_fixed() + theme(legend.position = "right")

#last plot with top 10 DE genes in cluster 3

#help from AI in this part: 
#write code that identifies the cluster of cells with the highest aqp2 expression group. then compare gene expression in this cluster to all other cells using a wilcoxon statistical test to find genes that are significantly more highly expressed in Aqp2+ cells. for each gene, calculate average expression, the percentage of cells expressing it, and fold change between groups. then ranks the top 10 genes
# Find Aqp2+ cluster
aqp2_cluster <- which.max(cluster_aqp2$mean_exp)
cellsOfInterest <- names(clusters)[clusters == aqp2_cluster]
otherCells <- names(clusters)[clusters != aqp2_cluster]

# Run DE tests
p_values <- sapply(colnames(gexp), function(g)
  tryCatch(wilcox.test(gexp[cellsOfInterest, g],
                       gexp[otherCells, g],
                       alternative = "greater")$p.value,
           error = function(e) 1)
)

# Build results table
p_values_df <- data.frame(
  gene = names(p_values),
  p_value = p_values,
  mean_in = colMeans(gexp[cellsOfInterest, ]),
  mean_out = colMeans(gexp[otherCells, ]),
  pct_in = colMeans(gexp[cellsOfInterest, ] > 0) * 100,
  pct_out = colMeans(gexp[otherCells, ] > 0) * 100
) %>%
  mutate(adj_p = p.adjust(p_value, "fdr"),
         fc = (mean_in + 0.01)/(mean_out + 0.01),
         log2_fc = log2(fc)) %>%
  arrange(desc(log2_fc))

# Top 10
plot_data <- head(p_values_df, 10)

#Plot
p4 <- ggplot(plot_data,
             aes(log2_fc, reorder(gene, log2_fc))) +
  geom_col(fill = "steelblue") +
  labs(x = "log2 Fold Change",
       y = "Gene",
       title = paste("Top 10 DE Genes in Cluster", aqp2_cluster)) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 10, face = "bold"),
        plot.title = element_text(size = 13, face = "bold", hjust = 0.5))

# Arrange all plots
grid.arrange(p1, p2, p3, p5, p4, ncol = 2)

10 Feb 2026

« HW2 HW3 »