This file aims to replicate the cell clustering analysis in the Figure 1 of Fuess and Bolnick (2021) using the single-cell RNA sequencing data in Google Drive. Note that the original code and the specific clustering results in Fuess and Bolnick (2021) are not available. Following procedures lead to a different figure than the main Figure 1 in paper, though they may share similar biological interpretations.

The analysis pipeline follows the tutorial for Seurat package.

0. Load dependencies
# to check the installation errors, use ```{r}
library(Seurat)
library(dplyr)
library(patchwork)
library(ggplot2)
library(viridis)

# if you do not install the python package "umap-learn", run
# reticulate::py_install(packages ='umap-learn')

library(openxlsx)
library(plot.matrix)
library(RColorBrewer)

1. Read data

stk.data <- Read10X(data.dir = "./filtered_feature_bc_matrix/")
stk <- CreateSeuratObject(counts = stk.data, project = "stk", min.cells = 3, min.features = 200)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
stk
## An object of class Seurat 
## 16082 features across 123830 samples within 1 assay 
## Active assay: RNA (16082 features, 0 variable features)

2. Pre-processing

Quality control (QC) to select high-quality cells

Following the paper, we discard the cells that with (1) more than 35000 Unique Molecular Identifiers (UMIs, nCount_RNA), (2) fewer than 400 genes (nFeature_RNA), (3) more than 30% mtDNA content (percentage of genes start with mt), and (4) more than 1000 hemoglobin transcripts (percentage of genes start with hb) from the downstream analysis.

Note that I am not sure whether the selections for mtDNA and hemoglobin transcripts are valid for teleosts. I find current selection criteria in the human genetics research.

# mtDNA
stk[["percent.mt"]] <- PercentageFeatureSet(stk, pattern = "^mt-")

# hemoglobin transcript
stk[["percent.hb"]] <- PercentageFeatureSet(stk, pattern = "^hb[^(p)]")

# selection
stk <- subset(stk, subset = nCount_RNA <= 35000 & nFeature_RNA >= 400 & percent.mt <= 30 &  percent.hb <= 100000/dim(stk.data)[1])
stk
## An object of class Seurat 
## 16082 features across 53835 samples within 1 assay 
## Active assay: RNA (16082 features, 0 variable features)

Normalization by log transformation

### Normalization
stk <- NormalizeData(stk, normalization.method = "LogNormalize", scale.factor = 10000)

Selecting high variable genes by dispersion

### Selecting high variable genes by dispersion
stk <- FindVariableFeatures(stk, selection.method = "disp", nfeatures = 4000)
stk
## An object of class Seurat 
## 16082 features across 53835 samples within 1 assay 
## Active assay: RNA (16082 features, 4000 variable features)

Scaling to standardize the expression

### Scaling
all.genes <- rownames(stk)
stk <- ScaleData(stk, features = all.genes)
## Centering and scaling data matrix

3. Linear dimensional reduction: PCA

Run PCA

# run PCA
stk <- RunPCA(stk, features = VariableFeatures(object = stk))
## PC_ 1 
## Positive:  npsn.1, tktb, pgd, fbp2, g6pd, actb1, gpia, lta4h, ENSGACG00000012927, ENSGACG00000000326 
##     taldo1, ENSGACG00000016070, ENSGACG00000019142, ENSGACG00000004882, ENSGACG00000005566, ENSGACG00000014576, pygl, alox5ap, cpa1, ENSGACG00000007836 
##     scinlb, ENSGACG00000001123, ENSGACG00000012951, ncf2, cybb, alox5a, ENSGACG00000000546, mpx, ENSGACG00000017305, rhogb 
## Negative:  rps27a, rps14, rpl23, cd74a, rpl7, ENSGACG00000010801, RPL7A, rpl35a, rpl36a, rpl3 
##     rpl17, rps15a, rps18, rpl13a, si:dkey-159f12.2, rpl31, rplp1, rpl8, rps24, rpl27 
##     rps3a, rpl12, ENSGACG00000000344, rps10, rps12, rps7, rps8a, rps9, ENSGACG00000009280, rpl9 
## PC_ 2 
## Positive:  swap70a, ENSGACG00000004949, ENSGACG00000012781, cd79a, ENSGACG00000015628, ENSGACG00000009519, ncf1, ENSGACG00000000082, thrap3a, dap1b 
##     nrip2, ENSGACG00000000546, ENSGACG00000003531, si:ch1073-406l10.2, id3, ENSGACG00000005291, stmn1b, gadd45ga, ENSGACG00000012578, ENSGACG00000012927 
##     rplp2, stx11a, h2afx, ENSGACG00000012783, fbp2, rpl36a, ENSGACG00000009293, npsn.1, gfi1ab, ENSGACG00000010720 
## Negative:  ENSGACG00000003555, ENSGACG00000018430, ENSGACG00000018429, ENSGACG00000012665, ENSGACG00000003551, tmem176, krt18b, ENSGACG00000010912, ENSGACG00000016267, krt8 
##     ENSGACG00000003403, ENSGACG00000008514, ctsl.1, tcn2, ENSGACG00000017988, ENSGACG00000008819, cxcl19, ctsk, sprb, ENSGACG00000018050 
##     hmox1a, ENSGACG00000020913, cpn1, ENSGACG00000015164, ENSGACG00000003825, ENSGACG00000008828, ENSGACG00000017697, ENSGACG00000018852, cpvl, glud1b 
## PC_ 3 
## Positive:  cdk1, kpna2, hmgb2a, pbk, ube2c.1, ENSGACG00000005590, dut, ENSGACG00000012635, ENSGACG00000008983, smc2 
##     dtymk, birc5a, top2a, smc4, ENSGACG00000004053, h2afva, stmn1b, nusap1, ahcy, PLK1 
##     kif11, ENSGACG00000008411, PCLAF, tubb2b, ENSGACG00000001108, cenpf, ENSGACG00000000860, SNRPD1, kif22, aurka 
## Negative:  mt-co2, zgc:64051, btg1, hbae5, HBE1.1, mt-co1, mt-co3, mt-cyb, ENSGACG00000019282, sftpbb 
##     swap70a, ENSGACG00000010801, tspan36, si:ch211-79k12.1, ENSGACG00000000344, bpifcl, psap, ENSGACG00000013902, ENSGACG00000020076, ENSGACG00000017163 
##     ENSGACG00000015628, ENSGACG00000009519, ENSGACG00000000330, cd74a, irf8, ENSGACG00000018138, gpr137ba, ENSGACG00000019403, ENSGACG00000012578, ENSGACG00000012781 
## PC_ 4 
## Positive:  rps2, anxa5b, rpia, rpl30, ENSGACG00000002182, rpl14, eef1b2, rpsa, rpl34, rplp0 
##     rpl13a, rpl26.1, rpl23, ENSGACG00000009280, rps13, cpa1, rps7, rpl17, rps26, rps25 
##     uba52, capgb, cybb, rpl21, rps3, rps6, rps5, gtpbp4, rpl35, ENSGACG00000002699 
## Negative:  cdk1, ube2c.1, ENSGACG00000005590, PLK1, nusap1, top2a, dut, kpna2, kif11, cenpf 
##     pbk, birc5a, PCLAF, ENSGACG00000012635, smc4, smc2, stmn1b, aurka, ccnb1, tacc3 
##     kif22, hmgb2a, kif20a, ENSGACG00000004053, ENSGACG00000008411, h2afva, tubb2b, ENSGACG00000007861, si:ch1073-406l10.2, ENSGACG00000007536 
## PC_ 5 
## Positive:  swap70a, ENSGACG00000004949, irf8, ncf1, ENSGACG00000009519, zgc:64051, ENSGACG00000011212, bpifcl, psap, ENSGACG00000012781 
##     cd79a, cyba, gpr137ba, ENSGACG00000010643, ENSGACG00000017163, ENSGACG00000012578, cxcr4a, cyb561a3a, sftpbb, grap2a 
##     st3gal4, ENSGACG00000009293, ENSGACG00000001123, ENSGACG00000020076, cfl1l, ENSGACG00000003531, cd40, ncf2, tcirg1b, ENSGACG00000012783 
## Negative:  ENSGACG00000006316, proza, si:dkey-90l8.3, gata1a, ENSGACG00000002393, inka1b, ENSGACG00000016692, ENSGACG00000007536, HEY1, ENSGACG00000005841 
##     rbpms, mt-cyb, mt-nd4l, TPM4, mpl, znf703, ENSGACG00000019128, rhag, ENSGACG00000000724, mt-co3 
##     ENSGACG00000020940, si:ch211-210c8.7, ENSGACG00000019059, egfl7, cd81b, ENSGACG00000006204, dusp2, tmem176l.1, fgl2a, plvapb

Determine the dimensionality

# determine dimension
ElbowPlot(stk)

Checking the variance explained by PCs, we choose the first 15 PCs for following analyses.

d = 15

4. Clustering

Original clustering We find the clustering with 15 PCs to generate a neighborhood graph with using 25-nearest-neighbors. Then, we apply Louvain algorithm (Leiden algorithm leads to many installation issues here) to the neighborhood graph to find the clustering.

stk <- FindNeighbors(stk, dims = 1:d, k.param = 25)
## Computing nearest neighbor graph
## Computing SNN
stk <- FindClusters(stk, resolution = 0.4) # Louvain algorithm
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 53835
## Number of edges: 1935084
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9498
## Number of communities: 23
## Elapsed time: 13 seconds
cluster_stk = Idents(stk)

The algorithm results in 23 cluster groups. Following is the UMAP for the cells labeled by these 23 group assignment.

stk <- RunUMAP(stk, dims = 1:d, umap.method = "umap-learn")
DimPlot(stk, reduction = "umap", label = TRUE)