Cellranger使用教程

建库,人和小鼠的数据库可以直接下载,对于无法直接下载的需要自行下载全基因组序列和gtf文件,根据 cellranger mkref构建参考数据库

wget ftp://ftp.ensembl.org/pub/release-97/fasta/danio_rerio/dna/Danio_rerio.GRCz11.dna.primary_assembly.fa.gz
gunzip Danio_rerio.GRCz11.dna.primary_assembly.fa.gz
wget ftp://ftp.ensembl.org/pub/release-97/gtf/danio_rerio/Danio_rerio.GRCz11.97.gtf.gz
gunzip Danio_rerio.GRCz11.97.gtf.gz
cellranger mkgtf Danio_rerio.GRCz11.97.gtf Danio_rerio.GRCz11.97.filtered.gtf                  
                 --attribute=gene_biotype:protein_coding \
                 --attribute=gene_biotype:lincRNA \
                 --attribute=gene_biotype:antisense \
                 --attribute=gene_biotype:IG_LV_gene \
                 --attribute=gene_biotype:IG_V_gene \
                 --attribute=gene_biotype:IG_V_pseudogene \
                 --attribute=gene_biotype:IG_D_gene \
                 --attribute=gene_biotype:IG_J_gene \
                 --attribute=gene_biotype:IG_J_pseudogene \
                 --attribute=gene_biotype:IG_C_gene \
                 --attribute=gene_biotype:IG_C_pseudogene \
                 --attribute=gene_biotype:TR_V_gene \
                 --attribute=gene_biotype:TR_V_pseudogene \
                 --attribute=gene_biotype:TR_D_gene \
                 --attribute=gene_biotype:TR_J_gene \
                 --attribute=gene_biotype:TR_J_pseudogene \
                 --attribute=gene_biotype:TR_C_gene
cellranger mkref --nthreads=80 --genome=ref_zebr_GRCz11 --fasta=Danio_rerio.GRCz11.dna.primary_assembly.fa --genes=Danio_rerio.GRCz11.97.filtered.gtf --ref-version=3.1.0    #注意gene_id和transcript_id的顺序

计算表达量,下机的原始数据可以通过bcl2fastq拆分,也可通过cellranger自带的mkfastq命令拆分。bcl2fastq拆分后得到两个fastq文件,index信息包含在第一个fastq文件第一行尾部;mkfastq拆分后得到三个fastq文件,index信息包含在一个单独的fastq文件里。两种拆分结果都可作为count命令的输入,但文件的命名一定要严格安装软件的说明,否则会出错。

cellranger count --id=PBLzebr1 --transcriptome=/home/wuchangsong/sc_cell/ref_zebr_GRCz11 --fastqs=/home/wuchangsong/sc_cell/fastq/ --sample=PBLzebr1

得到的表达矩阵在filtered_feature_bc_matrix文件夹中,在analysis文件夹中有PCA和tSNE聚类结果文本形式。cellranger count 计算的结果只能作为错略观测的结果,如果需要进一步分析聚类细胞,还需要进行下游分析,这里使用官方推荐 R 包(Seurat),后边的分析参考Seurat的使用。

参考:https://support.10xgenomics.com/single-cell-gene-expression/software/release-notes/build#GRCh38mm10_3.1.0

Seurat使用流程

seurat软件安装

Depends R (>= 3.4.0), methods

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("Seurat")

CentOS系统安装时要注意gcc的版本

setwd("D:/Experiment_data/zxj/outs")
library(Seurat)
pbl.data <- Read10X(data.dir = "D:/Experiment_data/zxj/outs/filtered_feature_bc_matrix")
dim(pbl.data) #查看行和列
#创建 Seurat 对象与数据过滤。保留在>=3 个细胞中表达的基因;保留能检测到>=200 个基因的细胞。
pbl <- CreateSeuratObject(counts = pbl.data, project = "pbl1907", min.cells = 3, min.features = 200) 
#mt-开头的为线粒体基因,这里将其进行标记并统计其分布频率
pbl[["percent.mt"]] <- PercentageFeatureSet(pbl, pattern = "^mt-")
# 对 pbmc 对象做小提琴图,分别为基因数,细胞数和线粒体占比
VlnPlot(object = pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
#根据图片中基因数和线粒体数,分别设置过滤参数,这里基因数 200-2500,线粒体百分比为小于 5%
pbl <- subset(pbl, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")+ NoLegend()
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")+ NoLegend()
CombinePlots(plots = list(plot1, plot2))
#标准化
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
#鉴定高变基因
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
plot3 <- VariableFeaturePlot(pbmc)+ NoLegend()
plot4 <- LabelPoints(plot = plot3, points = top10, repel = TRUE,xnudge=0,ynudge=0)
CombinePlots(plots = list(plot1, plot2))
#数据归一化,对所有基因进行标准化,默认只是标准化高变基因( 2000 个),速度更快,不影响 PCA 和分群,但影响热图的绘制。
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc,features = all.genes,vars.to.regress = "percent.mt")
#线性降维(PCA),默认用高变基因集,但也可通过 features 参数自己指定
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
#定义可视化细胞和功能的几种有用的方式PCA,包括VizDimReduction,DimPlot,和DimHeatmap
VizDimLoadings(object = pbmc, dims = 1:2, reduction = "pca")
DimPlot(pbmc, reduction = "pca")+ NoLegend()
DimHeatmap(pbmc, dims = 1:2, cells = 500, balanced = TRUE)
#鉴定数据集的可用维度,主成分分析结束后需要确定哪些主成分所代表的基因可以进入下游分析,这里可以使用JackStraw做重抽样分析。可以用JackStrawPlot可视化看看哪些主成分可以进行下游分析。
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
JackStrawPlot(pbmc, dims = 1:15)#虚线以上的为可用维度,你也可以调整 dims 参数,画出所有 pca 查看
#肘部图(碎石图),基于每个主成分对方差解释率的排名。建议尝试选择多个主成分个数做下游分析,对整体影响不大;在选择此参数时,建议选择偏高的数字( “宁滥勿缺”,为了获取更多的稀有分群);有些亚群很罕见,如果没有先验知识,很难将这种大小的数据集与背景噪声区分开来。
ElbowPlot(object = pbmc)
#非线性降维( UMAP/tSNE)
#基于 PCA 空间中的欧氏距离计算 nearest neighbor graph,优化任意两个细胞间的距离权重(输入上一步得到的 PC 维数)
pbmc <- FindNeighbors(pbmc, dims = 1:10)
#resolution 参数决定下游聚类分析得到的分群数,对于 3K 左右的细胞,设为 0.4-1.2 能得到较好的结果(官方说明);如果数据量增大,该参数也应该适当增大;增加的值会导致更多的群集。
pbmc <- FindClusters(pbmc, resolution = 0.5)
#使用 Idents()函数可查看不同细胞的分群;
head(Idents(pbmc), 8)
table(pbmc@active.ident) # 查看每一类有多少个细胞
#Seurat 提供了几种非线性降维的方法进行数据可视化(在低维空间把相似的细胞聚在一起),比如 UMAP 和 t-SNE,运行 UMAP 需要先安装'umap-learn'包,两种方法都可以使用,但不要混用,这样,后面的结算结果会将先前的聚类覆盖掉,只能保留一个
# If you haven't installed UMAP, you can do so via reticulate::py_install(packages = 'umap-learn')
pbmc <- RunTSNE(pbmc, dims = 1:10)
#pbmc <- RunUMAP(object = pbmc, dims = 1:10)
#DimPlot(object = pbmc, reduction = "umap")
#用 DimPlot()函数绘制散点图, reduction = "tsne",指定绘制类型;如果不指定,默认先从搜索 umap, 然后 tsne, 再然后 pca;也可以直接使用这 3 个函数 PCAPlot()、 TSNEPlot()、UMAPPlot(); cols, pt.size 分别调整分组颜色和点的大小;
tsneplot<-TSNEPlot(pbmc,label = TRUE, pt.size = 1.5)+ NoLegend()
#保存数据
saveRDS(pbmc, file = "pbmc_tutorial.rds")
save(pbmc,file="pbmc.RData")
load(file = "pbmc.RData")
#寻找差异表达的特征(聚类生物标志物)
#寻找某个聚类和其他所有聚类显著表达的基因
cluster1.markers <- FindMarkers(object = pbmc, ident.1 = 1, min.pct = 0.25)
head(x = cluster1.markers, n = 5)
#find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(object = pbmc, ident.1 = 5, ident.2 = c(0,3), min.pct = 0.25)
# find markers for every cluster compared to all remaining cells, report only the positive ones
#pbmc.markers <- FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
pbmc.markers <- FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25, return.thresh = 0.01)
library(dplyr)
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
DoHeatmap(object = pbmc, features = top10$gene) + NoLegend()
#画图
VlnPlot(object = pbmc, features = c("MS4A1", "CD79A"))
FeaturePlot(object = pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"))
# you can plot raw UMI counts as well
VlnPlot(object = pbmc, features.plot = c("NKG7", "PF4"), use.raw = TRUE, y.log = TRUE)
FeaturePlot(object = pbmc, features = c("hbaa1","hbaa2","ighv1-4","cd79a","cd79b","cd4-1","cd8a","cd8b","itga2b"), cols = c("grey", "blue"), reduction = "tsne")
#将单元类型标识分配给集群
new.cluster.ids <- c("1", "3", "4", "2", "5", "6", "7", "8")
names(x = new.cluster.ids) <- levels(x = pbmc)
pbmc <- RenameIdents(object = pbmc, new.cluster.ids)
DimPlot(object = pbmc, label = TRUE, pt.size = 1.5) + NoLegend()




未完待续。。。

CentOS 6.9 安装R-3.6.1

根据configure报错下载bzip2、curl、PCRE、xz-lzma、zlib对应的版本

如果是64位的系统,安装bzip2时修改Makefile文件,如下:

CC=gcc -fPIC 
AR=ar
RANLIB=ranlib
LDFLAGS=
BIGFILES=-D_FILE_OFFSET_BITS=64
CFLAGS=-fPIC -Wall -Winline -O2 -g  $(BIGFILES)   

安装好上边的模块后设置环境变量

export PATH=/home/wuchangsong/packages/bin:$PATH
export LD_LIBRARY_PATH=/home/wuchangsong/packages/lib:$LD_LIBRARY_PATH
export CFLAGS="-I/home/wuchangsong/packages/include"
export LDFLAGS="-L/home/wuchangsong/packages/lib"

根据报错做如下操作

sudo yum install texinfo
sudo yum install texlive
unzip inconsolata.zip
cp -Rfp inconsolata/* /usr/share/texmf/
sudo mktexlsr
./configure --prefix=/opt/sysoft/R-3.6.1 --enable-R-shlib  --with-readline=yes --with-libpng=yes --with-x=no
make -j 80
make install