ATAC-seq分析流程(三)

前边介绍了使用DiffBind进行ATAC-seq数据的差异分析,DiffBind进行差异分析时调用了edgeR或者DESeq2包;相对于DiffBind流程我更习惯普通转录组分析流程,因为两者本质上是一样的,区别在于ATAC-seq要根据结果的peak文件自己制备gtf文件,peak对应的gtf文件和gene是不一样的,不同样本间也可能存在差异。

cat *.narrowPeak > merged.peaks
sort -k1,1 -k2,2n merged.peaks > merged.peaks.sorted
bedtools merge -i merged.peaks.sorted > bedtools.merged.sorted.bed
awk -v var=ATAC_seq '{print $1"\t"var"\texon\t"$2"\t"$3"\t.\t+\t.\tgene_id \"peak_"NR"\"; transcript_id \"peak_"NR"\";"}' bedtools.merged.sorted.bed > bedtools.merged.atac.gtf
featureCounts -T 18 -p -t exon -g gene_id -a bedtools.merged.atac.gtf -o ATAC_counts.txt *last.bam
sed 1d ATAC_counts.txt | cut -f 1,7-15 > rawATAC_counts.matrix

一个peak相当于一个gene,得到的原始矩阵可以使用DESeq2包分析,筛选出共有特有或者差异的peaks等进行peaks的注释。

ATAC-seq分析流程(二)

六、Deeptools可视化

#deeptools安装conda install deeptools
gtf2bed < ~/gc_genome/17.Geta/1.geta/out.gtf > genome.bed
for i in `cat ../samples.txt`; do echo "bamCoverage -p 5 --normalizeUsing RPKM -b $i.last.bam -o $i.last.bw"; done > bam2bw.list
ParaFly -c bam2bw.list -CPU 9
computeMatrix reference-point  --referencePoint TSS  -p 15 -b 10000 -a 10000 -R ../genome.bed -S ../*bw --skipZeros  -o matrix1_test_TSS.gz --outFileSortedRegions regions1_test_genes.bed
plotHeatmap -m matrix1_test_TSS.gz  -out test_Heatmap.pdf --plotFileFormat pdf  --dpi 720
plotProfile -m matrix1_test_TSS.gz  -out test_Profile.pdf --plotFileFormat pdf --perGroup --dpi 720
computeMatrix scale-regions -p 150 -b 10000 -a 10000 -R ../genome.bed -S ../*bw --skipZeros  -o matrix1_test_body.gz --outFileSortedRegions regions1_test_body.bed
plotHeatmap -m matrix1_test_TSS.gz  -out test_Heatmap.pdf --plotFileFormat pdf  --dpi 720
plotProfile -m matrix1_test_TSS.gz  -out test_Profile.pdf --plotFileFormat pdf --perGroup --dpi 720

七、重复样品处理(至少两个生物学重复)
IDR是通过比较一对经过排序的regions/peaks 的列表,然后计算反映其重复性的值,同时得到了merge后的一致性的peaks。

for i in `cat ../../samples.txt`; do echo "sort -k8,8nr ../$i.peaks_peaks.narrowPeak > $i.peaks.narrowPeak"; done > peaks_sort.list
ParaFly -c peaks_sort.list -CPU 9

八、peaks注释(非模式物种)
使用GenomicFeatures包的函数制作TxDb对象:

#makeTxDbFromUCSC: 通过UCSC在线制作TxDb
#makeTxDbFromBiomart: 通过ensembl在线制作TxDb
#makeTxDbFromGRanges:通过GRanges对象制作TxDb
#makeTxDbFromGFF:通过解析GFF文件制作TxDb
gc_TxDB <- makeTxDbFromGFF("genome.gff3")
library(clusterProfiler)
library("ChIPseeker")
library(ChIPpeakAnno)
library(GenomicFeatures)
library("ggupset")
library("ggimage")
gc_TxDB <- makeTxDbFromGFF("out.gff3")
promoter <- getPromoters(TxDb=gc_TxDB, upstream=3000, downstream=3000)
files = list(A_summits = "A.peaks_summits.bed", B_summits = "B.peaks_summits.bed", C_summits = "C.peaks_summits.bed", D_summits = "D.peaks_summits.bed")
peakAnno <- annotatePeak(files[[1]], tssRegion=c(-3000, 3000), TxDb=gc_TxDB)
plotAnnoPie(peakAnno)
#tiff("peakAnno.tiff")
#plotAnnoPie(peakAnno)
#dev.off() 此方法保存的图片标注有颜色模块
plotAnnoBar(peakAnno)
vennpie(peakAnno)
peakAnnoList <- lapply(files, annotatePeak, TxDb=gc_TxDB,tssRegion=c(-3000, 3000))
plotAnnoBar(peakAnnoList)
#tagHeatmap来画热图
peak <- readPeakFile(files[[3]])
tagMatrix <- getTagMatrix(peak, windows=promoter)
tagHeatmap(tagMatrix, xlim=c(-3000, 3000), color="red")
peakHeatmap(files, TxDb=gc_TxDB, 
    upstream=3000, downstream=3000, 
    color=rainbow(length(files)))
#以谱图的形式来展示结合的强度
plotAvgProf2(files[[3]], TxDb=gc_TxDB, 
			 upstream=3000, downstream=3000,
             xlab="Genomic Region (5'->3')", 
             ylab = "Read Count Frequency")
tagMatrixList <- lapply(files, getTagMatrix, 
                        windows=promoter)
plotAvgProf(tagMatrixList, xlim=c(-3000, 3000))
#注释基因
peakAnnoList <- lapply(files, annotatePeak, TxDb=gc_TxDB,tssRegion=c(-3000, 3000),addFlankGeneInfo=TRUE, flankDistance=5000)
genes = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
vennplot(genes)
save(peakAnnoList,file="peakAnnolist.rda")
write.table(as.data.frame(peakAnnoList$mulvscon),file="Nanog.PeakAnno",sep='\t',quote = F)

九、差异分析
DiffBind安装

conda install libv8
conda install -c conda-forge librsvg
##R version 4.0.3 (2020-10-10)
BiocManager::install("systemPipeR")
BiocManager::install("DiffBind")
library(DiffBind)
setwd("D:/Experiment_data/wcs/ATAC/")
tamoxifen <- dba(sampleSheet="sheet_ATAC.CSV")
tamoxifen <- dba.count(tamoxifen, bUseSummarizeOverlaps=TRUE)
plot(tamoxifen)
tamoxifen <- dba.contrast(tamoxifen, categories = DBA_FACTOR, minMembers = 3)
tamoxifen <- dba.analyze(tamoxifen, method = DBA_DESEQ2)
#heatmap
hmap <- colorRampPalette(c("green", "black", "red"))(n = 13)
dba.plotHeatmap(tamoxifen, correlations=FALSE, scale="row", colScheme = hmap)
#MA plots
dba.plotMA(tamoxifen, contrast = 2)
#火山图
dba.plotVolcano(tamoxifen)
tamoxifen.DB <- dba.report(tamoxifen,  method=DBA_DESEQ2, contrast = 1, th=1)
#write.csv(tamoxifen.DB,"diffbind.csv")
out <- as.data.frame(tamoxifen.DB)
write.table(out, file="tamoxifen_DB.csv", sep=",", quote=F, col.names = NA)
#保存不同样本的表达矩阵
out <- as.data.frame(tamoxifen[["binding"]])
write.table(out, file="deseq2_count.xls", sep="\t", quote=F, row.names=F, col.names=F)
#以bed格式保存显著性的差异结果
out <- as.data.frame(tamoxifen.DB)
deseq.bed <- out[ which(out$FDR < 0.05), 
                 c("seqnames", "start", "end", "strand", "Fold")]
write.table(deseq.bed, file="results/contrast_1_deseq2_sig.bed", sep="\t", quote=F, row.names=F, col.names=F)

参考链接:https://github.com/ENCODE-DCC/atac-seq-pipeline