This file aims to replicate the cell clustering analysis in the Figure 1 of Fuess and Bolnick (2021) using the single-cell RNA sequencing data in Google Drive. Note that the original code and the specific clustering results in Fuess and Bolnick (2021) are not available. Following procedures lead to a different figure than the main Figure 1 in paper, though they may share similar biological interpretations.

The analysis pipeline follows the tutorial for Seurat package.

0. Load dependencies
# to check the installation errors, use ```{r}
library(Seurat)
library(dplyr)
library(patchwork)
library(ggplot2)
library(viridis)

# if you do not install the python package "umap-learn", run
# reticulate::py_install(packages ='umap-learn')

library(openxlsx)
library(plot.matrix)
library(RColorBrewer)

1. Read data

stk.data <- Read10X(data.dir = "./filtered_feature_bc_matrix/")
stk <- CreateSeuratObject(counts = stk.data, project = "stk", min.cells = 3, min.features = 200)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
stk
## An object of class Seurat 
## 16082 features across 123830 samples within 1 assay 
## Active assay: RNA (16082 features, 0 variable features)

2. Pre-processing

Quality control (QC) to select high-quality cells

Following the paper, we discard the cells that with (1) more than 35000 Unique Molecular Identifiers (UMIs, nCount_RNA), (2) fewer than 400 genes (nFeature_RNA), (3) more than 30% mtDNA content (percentage of genes start with mt), and (4) more than 1000 hemoglobin transcripts (percentage of genes start with hb) from the downstream analysis.

Note that I am not sure whether the selections for mtDNA and hemoglobin transcripts are valid for teleosts. I find current selection criteria in the human genetics research.

# mtDNA
stk[["percent.mt"]] <- PercentageFeatureSet(stk, pattern = "^mt-")

# hemoglobin transcript
stk[["percent.hb"]] <- PercentageFeatureSet(stk, pattern = "^hb[^(p)]")

# selection
stk <- subset(stk, subset = nCount_RNA <= 35000 & nFeature_RNA >= 400 & percent.mt <= 30 &  percent.hb <= 100000/dim(stk.data)[1])
stk
## An object of class Seurat 
## 16082 features across 53835 samples within 1 assay 
## Active assay: RNA (16082 features, 0 variable features)

Normalization by log transformation

### Normalization
stk <- NormalizeData(stk, normalization.method = "LogNormalize", scale.factor = 10000)

Selecting high variable genes by dispersion

### Selecting high variable genes by dispersion
stk <- FindVariableFeatures(stk, selection.method = "disp", nfeatures = 4000)
stk
## An object of class Seurat 
## 16082 features across 53835 samples within 1 assay 
## Active assay: RNA (16082 features, 4000 variable features)

Scaling to standardize the expression

### Scaling
all.genes <- rownames(stk)
stk <- ScaleData(stk, features = all.genes)
## Centering and scaling data matrix

3. Linear dimensional reduction: PCA

Run PCA

# run PCA
stk <- RunPCA(stk, features = VariableFeatures(object = stk))
## PC_ 1 
## Positive:  npsn.1, tktb, pgd, fbp2, g6pd, actb1, gpia, lta4h, ENSGACG00000012927, ENSGACG00000000326 
##     taldo1, ENSGACG00000016070, ENSGACG00000019142, ENSGACG00000004882, ENSGACG00000005566, ENSGACG00000014576, pygl, alox5ap, cpa1, ENSGACG00000007836 
##     scinlb, ENSGACG00000001123, ENSGACG00000012951, ncf2, cybb, alox5a, ENSGACG00000000546, mpx, ENSGACG00000017305, rhogb 
## Negative:  rps27a, rps14, rpl23, cd74a, rpl7, ENSGACG00000010801, RPL7A, rpl35a, rpl36a, rpl3 
##     rpl17, rps15a, rps18, rpl13a, si:dkey-159f12.2, rpl31, rplp1, rpl8, rps24, rpl27 
##     rps3a, rpl12, ENSGACG00000000344, rps10, rps12, rps7, rps8a, rps9, ENSGACG00000009280, rpl9 
## PC_ 2 
## Positive:  swap70a, ENSGACG00000004949, ENSGACG00000012781, cd79a, ENSGACG00000015628, ENSGACG00000009519, ncf1, ENSGACG00000000082, thrap3a, dap1b 
##     nrip2, ENSGACG00000000546, ENSGACG00000003531, si:ch1073-406l10.2, id3, ENSGACG00000005291, stmn1b, gadd45ga, ENSGACG00000012578, ENSGACG00000012927 
##     rplp2, stx11a, h2afx, ENSGACG00000012783, fbp2, rpl36a, ENSGACG00000009293, npsn.1, gfi1ab, ENSGACG00000010720 
## Negative:  ENSGACG00000003555, ENSGACG00000018430, ENSGACG00000018429, ENSGACG00000012665, ENSGACG00000003551, tmem176, krt18b, ENSGACG00000010912, ENSGACG00000016267, krt8 
##     ENSGACG00000003403, ENSGACG00000008514, ctsl.1, tcn2, ENSGACG00000017988, ENSGACG00000008819, cxcl19, ctsk, sprb, ENSGACG00000018050 
##     hmox1a, ENSGACG00000020913, cpn1, ENSGACG00000015164, ENSGACG00000003825, ENSGACG00000008828, ENSGACG00000017697, ENSGACG00000018852, cpvl, glud1b 
## PC_ 3 
## Positive:  cdk1, kpna2, hmgb2a, pbk, ube2c.1, ENSGACG00000005590, dut, ENSGACG00000012635, ENSGACG00000008983, smc2 
##     dtymk, birc5a, top2a, smc4, ENSGACG00000004053, h2afva, stmn1b, nusap1, ahcy, PLK1 
##     kif11, ENSGACG00000008411, PCLAF, tubb2b, ENSGACG00000001108, cenpf, ENSGACG00000000860, SNRPD1, kif22, aurka 
## Negative:  mt-co2, zgc:64051, btg1, hbae5, HBE1.1, mt-co1, mt-co3, mt-cyb, ENSGACG00000019282, sftpbb 
##     swap70a, ENSGACG00000010801, tspan36, si:ch211-79k12.1, ENSGACG00000000344, bpifcl, psap, ENSGACG00000013902, ENSGACG00000020076, ENSGACG00000017163 
##     ENSGACG00000015628, ENSGACG00000009519, ENSGACG00000000330, cd74a, irf8, ENSGACG00000018138, gpr137ba, ENSGACG00000019403, ENSGACG00000012578, ENSGACG00000012781 
## PC_ 4 
## Positive:  rps2, anxa5b, rpia, rpl30, ENSGACG00000002182, rpl14, eef1b2, rpsa, rpl34, rplp0 
##     rpl13a, rpl26.1, rpl23, ENSGACG00000009280, rps13, cpa1, rps7, rpl17, rps26, rps25 
##     uba52, capgb, cybb, rpl21, rps3, rps6, rps5, gtpbp4, rpl35, ENSGACG00000002699 
## Negative:  cdk1, ube2c.1, ENSGACG00000005590, PLK1, nusap1, top2a, dut, kpna2, kif11, cenpf 
##     pbk, birc5a, PCLAF, ENSGACG00000012635, smc4, smc2, stmn1b, aurka, ccnb1, tacc3 
##     kif22, hmgb2a, kif20a, ENSGACG00000004053, ENSGACG00000008411, h2afva, tubb2b, ENSGACG00000007861, si:ch1073-406l10.2, ENSGACG00000007536 
## PC_ 5 
## Positive:  swap70a, ENSGACG00000004949, irf8, ncf1, ENSGACG00000009519, zgc:64051, ENSGACG00000011212, bpifcl, psap, ENSGACG00000012781 
##     cd79a, cyba, gpr137ba, ENSGACG00000010643, ENSGACG00000017163, ENSGACG00000012578, cxcr4a, cyb561a3a, sftpbb, grap2a 
##     st3gal4, ENSGACG00000009293, ENSGACG00000001123, ENSGACG00000020076, cfl1l, ENSGACG00000003531, cd40, ncf2, tcirg1b, ENSGACG00000012783 
## Negative:  ENSGACG00000006316, proza, si:dkey-90l8.3, gata1a, ENSGACG00000002393, inka1b, ENSGACG00000016692, ENSGACG00000007536, HEY1, ENSGACG00000005841 
##     rbpms, mt-cyb, mt-nd4l, TPM4, mpl, znf703, ENSGACG00000019128, rhag, ENSGACG00000000724, mt-co3 
##     ENSGACG00000020940, si:ch211-210c8.7, ENSGACG00000019059, egfl7, cd81b, ENSGACG00000006204, dusp2, tmem176l.1, fgl2a, plvapb

Determine the dimensionality

# determine dimension
ElbowPlot(stk)

Checking the variance explained by PCs, we choose the first 15 PCs for following analyses.

d = 15

4. Clustering

Original clustering We find the clustering with 15 PCs to generate a neighborhood graph with using 25-nearest-neighbors. Then, we apply Louvain algorithm (Leiden algorithm leads to many installation issues here) to the neighborhood graph to find the clustering.

stk <- FindNeighbors(stk, dims = 1:d, k.param = 25)
## Computing nearest neighbor graph
## Computing SNN
stk <- FindClusters(stk, resolution = 0.4) # Louvain algorithm
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 53835
## Number of edges: 1935084
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9498
## Number of communities: 23
## Elapsed time: 13 seconds
cluster_stk = Idents(stk)

The algorithm results in 23 cluster groups. Following is the UMAP for the cells labeled by these 23 group assignment.

stk <- RunUMAP(stk, dims = 1:d, umap.method = "umap-learn")
DimPlot(stk, reduction = "umap", label = TRUE)

Condensed clustering

We then condense the 23 clusters to 8 clusters as Fuess and Bolnick (2021). We compare the top marker genes in our 23 clusters versus the top marker genes in Lauren’s 8 clusters (Supplement Table 1).

# find distinguishing genes for 23 clusters
stk.markers <- FindAllMarkers(stk, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
## Calculating cluster 16
## Calculating cluster 17
## Calculating cluster 18
## Calculating cluster 19
## Calculating cluster 20
## Calculating cluster 21
## Calculating cluster 22
top_genes = stk.markers %>%
    group_by(cluster) %>%
    slice_max(n = 10, order_by = avg_log2FC)

# compared with Lauren's top genes
top_genes_L = read.xlsx("top_gene_L.xlsx")

top_genes$gene = gsub("(.)\\..", replacement = "\\1" ,top_genes$gene)
top_genes$gene = gsub("(ENSGACG\\d{10})\\d", replacement = "\\1" ,top_genes$gene)

cluster_name = top_genes_L$cluster
gene_count = c()
for (i in 1:length(cluster_name)) {

  reference = unique(strsplit(top_genes_L[i,2], "; ")[[1]])
  top_count = c()
  for (j in 1:length(levels(top_genes$cluster))) {
    top_count = c(top_count, sum(top_genes[top_genes$cluster == levels(top_genes$cluster)[j],]$gene %in% reference))
  }
  
  gene_count = rbind(gene_count, top_count)
}
rownames(gene_count) = cluster_name
colnames(gene_count) = 0:22

gene_count[which(gene_count == 0, arr.ind = T)] = NA

plot(gene_count, col = brewer.pal(9, "Blues"),axis.row = list(side=2, las=1, cex.axis=0.7),
     xlab="clusters", ylab=" ", main = "Top genes counts")

We assign the 23 clusters to 8 clusters according to the top gene counts.

neu = c(0,1,7, 14, 3, 6, 8)
apc = c(4, 15, 16)
bcell = c(2, 5,13)
pl = c(21)
fi = c(22)
nkc = c(20)
ery = c(18,19)
hc = c(9, 10, 11,12, 17)

id = as.character(0:22)
new.cluster.ids <- rep(NA, length(id))
new.cluster.ids[id %in% neu] = "Neutrophils"
new.cluster.ids[id %in% apc] = "APC"
new.cluster.ids[id %in% bcell] = "B_cells"
new.cluster.ids[id %in% pl] = "Platelets"
new.cluster.ids[id %in% fi] = "Fibroblasts"
new.cluster.ids[id %in% nkc] = "NKC"
new.cluster.ids[id %in% ery] = "Erythrocytes"
new.cluster.ids[id %in% hc] = "HC"
names(new.cluster.ids) = levels(stk)

stk <- RenameIdents(stk, new.cluster.ids)
DimPlot(stk, reduction = "umap", label = TRUE)

Last, we plot the gene-cell expression heatmap with top 5 genes in each cluster and 50 randomly chosen cells in each cell type based on the condensed clustering result.

# find distinguishing genes
stk.markers <- FindAllMarkers(stk, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster Neutrophils
## Calculating cluster B_cells
## Calculating cluster APC
## Calculating cluster HC
## Calculating cluster Erythrocytes
## Calculating cluster NKC
## Calculating cluster Platelets
## Calculating cluster Fibroblasts
# top genes
stk.markers %>%
  group_by(cluster) %>%
  top_n(n = 5, wt = avg_log2FC) -> top5
# random select 50 cells for each type
cluster_stk = data.frame(cell =  colnames(stk[["RNA"]]), cluster = Idents(stk))
cluster_stk %>%
  group_by(cluster) %>%
  slice_sample(n = 50) -> random_cell

# heatmap
DoHeatmap(stk, features = top5$gene, cells = random_cell$cell, raster = FALSE) + scale_fill_viridis()
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

top5[,c(6,7)]
## # A tibble: 40 × 2
## # Groups:   cluster [8]
##    cluster     gene              
##    <fct>       <chr>             
##  1 Neutrophils npsn.1            
##  2 Neutrophils ENSGACG00000012927
##  3 Neutrophils ENSGACG00000005566
##  4 Neutrophils fbp2              
##  5 Neutrophils ENSGACG00000004882
##  6 B_cells     ENSGACG00000012769
##  7 B_cells     swap70a           
##  8 B_cells     ENSGACG00000010720
##  9 B_cells     ENSGACG00000009519
## 10 B_cells     ENSGACG00000012781
## # … with 30 more rows