MUMmer使用及后续绘图

MUMmer被广泛用于大片段序列的比对,如染色体共线性分析。

# MUMmer whole-genome alignment and downstream filtering for synteny plotting.
nucmer [options]  
delta-filter -i 80 -l 1000 -r -q out.delta > out.rq.delta
# Keep alignments with identity > 80% and alignment length > 1000 bp.
# -r: keep only each reference position's best hit on the query; multiple reference sequences may still overlap on the query
# -q: keep only each query position's best hit on the reference; multiple query sequences may still overlap on the reference
show-coords out.rq.delta > out.coords
grep -P  "zfCh04\s+" out.coords|awk '{print $12,$13}' |sort |uniq -c # count matched regions between reference chromosome zfCh04 and each query chromosome
grep -P  "zfCh04\s+gcCh04" out.coords|awk '{print $12,$1,$2,$13,$4,$5}' > zfCh04_gcCh04.txt
# Convert space-separated columns to tabs for downstream plotting.
sed -i 's/ /\t/g' zfCh04_gcCh04.txt

#R作图
使用RIdeogram包,可参考RIdeogram:染色体数据可视化的R包

# Synteny visualization with RIdeogram.
install.packages('RIdeogram')
# library() errors immediately on a missing package, unlike require().
library(RIdeogram)
# cc: synteny table; dd: karyotype table. Both tab-separated with a header row.
cc <- read.table("111.xls", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
dd <- read.table("222.xls", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
ideogram(karyotype = dd, synteny = cc)
# Built-in example data; uncomment these before running the example call below:
#data(karyotype_dual_comparison, package="RIdeogram")
#data(synteny_dual_comparison, package="RIdeogram")
ideogram(karyotype = karyotype_dual_comparison, synteny = synteny_dual_comparison)
# ideogram() writes chromosome.svg; convert it to PNG.
convertSVG("chromosome.svg", device = "png")

karyotype_dual_comparison文件格式
Chr: 染色体号
Start: 起始
End: 终止
fill: 染色体填充色
species:物种名
size: 物种名字体大小
color: 物种名字体颜色

synteny_dual_comparison文件格式
Species_1:物种1染色体号
Start_1,End_1:物种1染色体区域位置
Species_2:物种2染色体号
Start_2,End_2:物种2染色体区域位置

此外还支持三个基因组的共线性

ATAC-seq分析流程(三)

前边介绍了使用DiffBind进行ATAC-seq数据的差异分析,DiffBind进行差异分析时调用了edgeR或者DESeq2包;相对于DiffBind流程我更习惯普通转录组分析流程,因为两者本质上是一样的,区别在于ATAC-seq要根据结果的peak文件自己制备gtf文件,peak对应的gtf文件和gene是不一样的,不同样本间也可能存在差异。

# Build a merged-peak "gene model" GTF so standard RNA-seq counting tools can be reused for ATAC-seq.
cat *.narrowPeak > merged.peaks
sort -k1,1 -k2,2n merged.peaks > merged.peaks.sorted
bedtools merge -i merged.peaks.sorted > bedtools.merged.sorted.bed
# One exon/gene entry per merged peak, named peak_<row number>.
awk -v var=ATAC_seq '{print $1"\t"var"\texon\t"$2"\t"$3"\t.\t+\t.\tgene_id \"peak_"NR"\"; transcript_id \"peak_"NR"\";"}' bedtools.merged.sorted.bed > bedtools.merged.atac.gtf
featureCounts -T 18 -p -t exon -g gene_id -a bedtools.merged.atac.gtf -o ATAC_counts.txt *last.bam
# Drop the featureCounts header line and keep the peak id plus the per-sample count columns.
sed 1d ATAC_counts.txt | cut -f 1,7-15 > rawATAC_counts.matrix

一个peak相当于一个gene,得到的原始矩阵可以使用DESeq2包分析,筛选出共有特有或者差异的peaks等进行peaks的注释。

ATAC-seq分析流程(二)

六、Deeptools可视化

# deeptools visualization; install with: conda install deeptools
gtf2bed < ~/gc_genome/17.Geta/1.geta/out.gtf > genome.bed
# Generate one RPKM-normalized bigWig per sample.
for i in `cat ../samples.txt`; do echo "bamCoverage -p 5 --normalizeUsing RPKM -b $i.last.bam -o $i.last.bw"; done > bam2bw.list
ParaFly -c bam2bw.list -CPU 9
# Signal around the TSS (+/- 10 kb).
computeMatrix reference-point  --referencePoint TSS  -p 15 -b 10000 -a 10000 -R ../genome.bed -S ../*bw --skipZeros  -o matrix1_test_TSS.gz --outFileSortedRegions regions1_test_genes.bed
plotHeatmap -m matrix1_test_TSS.gz  -out test_Heatmap.pdf --plotFileFormat pdf  --dpi 720
plotProfile -m matrix1_test_TSS.gz  -out test_Profile.pdf --plotFileFormat pdf --perGroup --dpi 720
# Signal over scaled gene bodies (+/- 10 kb flanks).
# NOTE(review): "-p 150" looks like a typo for "-p 15" — confirm available cores.
computeMatrix scale-regions -p 150 -b 10000 -a 10000 -R ../genome.bed -S ../*bw --skipZeros  -o matrix1_test_body.gz --outFileSortedRegions regions1_test_body.bed
# Fixed: the original re-plotted matrix1_test_TSS.gz here and overwrote the TSS
# figures; plot the gene-body matrix to its own output files instead.
plotHeatmap -m matrix1_test_body.gz  -out test_body_Heatmap.pdf --plotFileFormat pdf  --dpi 720
plotProfile -m matrix1_test_body.gz  -out test_body_Profile.pdf --plotFileFormat pdf --perGroup --dpi 720

七、重复样品处理(至少两个生物学重复)
IDR是通过比较一对经过排序的regions/peaks 的列表,然后计算反映其重复性的值,同时得到了merge后的一致性的peaks。

# Sort each replicate's narrowPeak by column 8 (-log10 p-value), highest first, as IDR expects ranked peak lists.
for i in `cat ../../samples.txt`; do echo "sort -k8,8nr ../$i.peaks_peaks.narrowPeak > $i.peaks.narrowPeak"; done > peaks_sort.list
ParaFly -c peaks_sort.list -CPU 9

八、peaks注释(非模式物种)
使用GenomicFeatures包的函数制作TxDb对象:

#makeTxDbFromUCSC: build a TxDb online from UCSC
#makeTxDbFromBiomart: build a TxDb online from Ensembl
#makeTxDbFromGRanges: build a TxDb from a GRanges object
#makeTxDbFromGFF: build a TxDb by parsing a GFF file
gc_TxDB <- makeTxDbFromGFF("genome.gff3")
library(clusterProfiler)
library("ChIPseeker")
library(ChIPpeakAnno)
library(GenomicFeatures)
library("ggupset")
library("ggimage")
# NOTE(review): gc_TxDB is rebuilt here from out.gff3, silently replacing the
# genome.gff3 TxDb created above — confirm which GFF file is intended.
gc_TxDB <- makeTxDbFromGFF("out.gff3")
# Promoter windows: 3 kb up- and downstream of every TSS in the TxDb.
promoter <- getPromoters(TxDb=gc_TxDB, upstream=3000, downstream=3000)
files = list(A_summits = "A.peaks_summits.bed", B_summits = "B.peaks_summits.bed", C_summits = "C.peaks_summits.bed", D_summits = "D.peaks_summits.bed")
# Annotate one peak set; tssRegion defines the promoter window for classification.
peakAnno <- annotatePeak(files[[1]], tssRegion=c(-3000, 3000), TxDb=gc_TxDB)
plotAnnoPie(peakAnno)
#tiff("peakAnno.tiff")
#plotAnnoPie(peakAnno)
#dev.off() # figures saved this way keep the colored legend blocks
plotAnnoBar(peakAnno)
vennpie(peakAnno)
# Annotate all four peak sets at once for a side-by-side bar plot.
peakAnnoList <- lapply(files, annotatePeak, TxDb=gc_TxDB,tssRegion=c(-3000, 3000))
plotAnnoBar(peakAnnoList)
# Heatmap of read density around promoters via tagHeatmap.
peak <- readPeakFile(files[[3]])
tagMatrix <- getTagMatrix(peak, windows=promoter)
tagHeatmap(tagMatrix, xlim=c(-3000, 3000), color="red")
peakHeatmap(files, TxDb=gc_TxDB, 
    upstream=3000, downstream=3000, 
    color=rainbow(length(files)))
# Average-profile view of binding strength around the TSS.
plotAvgProf2(files[[3]], TxDb=gc_TxDB, 
			 upstream=3000, downstream=3000,
             xlab="Genomic Region (5'->3')", 
             ylab = "Read Count Frequency")
tagMatrixList <- lapply(files, getTagMatrix, 
                        windows=promoter)
plotAvgProf(tagMatrixList, xlim=c(-3000, 3000))
# Annotate target genes (with flanking gene info within 5 kb) and compare gene sets.
peakAnnoList <- lapply(files, annotatePeak, TxDb=gc_TxDB,tssRegion=c(-3000, 3000),addFlankGeneInfo=TRUE, flankDistance=5000)
genes = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
vennplot(genes)
save(peakAnnoList,file="peakAnnolist.rda")
write.table(as.data.frame(peakAnnoList$mulvscon),file="Nanog.PeakAnno",sep='\t',quote = F)

九、差异分析
DiffBind安装

# System libraries needed by DiffBind's SVG/plotting dependencies.
conda install libv8
conda install -c conda-forge librsvg
##R version 4.0.3 (2020-10-10)
BiocManager::install("systemPipeR")
BiocManager::install("DiffBind")
library(DiffBind)
setwd("D:/Experiment_data/wcs/ATAC/")
# Load the sample sheet, then count reads in the consensus peak set.
tamoxifen <- dba(sampleSheet="sheet_ATAC.CSV")
tamoxifen <- dba.count(tamoxifen, bUseSummarizeOverlaps=TRUE)
plot(tamoxifen)
# Contrasts grouped by factor (>= 3 members per group), analyzed with DESeq2.
tamoxifen <- dba.contrast(tamoxifen, categories = DBA_FACTOR, minMembers = 3)
tamoxifen <- dba.analyze(tamoxifen, method = DBA_DESEQ2)
#heatmap
hmap <- colorRampPalette(c("green", "black", "red"))(n = 13)
dba.plotHeatmap(tamoxifen, correlations=FALSE, scale="row", colScheme = hmap)
#MA plots
dba.plotMA(tamoxifen, contrast = 2)
# Volcano plot
dba.plotVolcano(tamoxifen)
# th=1 keeps every site regardless of FDR so the full table can be exported.
tamoxifen.DB <- dba.report(tamoxifen,  method=DBA_DESEQ2, contrast = 1, th=1)
#write.csv(tamoxifen.DB,"diffbind.csv")
out <- as.data.frame(tamoxifen.DB)
write.table(out, file="tamoxifen_DB.csv", sep=",", quote=F, col.names = NA)
# Save the per-sample binding (count) matrix.
out <- as.data.frame(tamoxifen[["binding"]])
write.table(out, file="deseq2_count.xls", sep="\t", quote=F, row.names=F, col.names=F)
# Save significant differential sites (FDR < 0.05) in BED-like format.
out <- as.data.frame(tamoxifen.DB)
deseq.bed <- out[ which(out$FDR < 0.05), 
                 c("seqnames", "start", "end", "strand", "Fold")]
write.table(deseq.bed, file="results/contrast_1_deseq2_sig.bed", sep="\t", quote=F, row.names=F, col.names=F)

参考链接:https://github.com/ENCODE-DCC/atac-seq-pipeline

BS-seq分析流程(一)

短序列的质控都可以使用trimmomatic,这里不多做介绍,得到的clean data可做下面分析
一、比对和甲基化位点提取(Bismark)
Bismark安装及使用

# Bismark installation and bisulfite alignment pipeline.
git clone https://github.com/FelixKrueger/Bismark.git
#conda install -c bioconda bismark
#Genome Preparation
/opt/biosoft/Bismark-0.23.0/bismark_genome_preparation --parallel 40 --verbose ./
#bismark alignment
for i in `cat ../samples.txt`
do
    echo "/opt/biosoft/Bismark-0.23.0/bismark --parallel 40 --genome ./ --phred33-quals -1 ../$i.1.fastq -2 ../$i.2.fastq"
done > bismark.list
ParaFly -c bismark.list -CPU 4
#deduplicate 
for i in `cat ../samples.txt`
do
    echo "/opt/biosoft/Bismark-0.23.0/deduplicate_bismark --bam $i.1_bismark_bt2_pe.bam"
done > deduplicate_bismark.list
ParaFly -c deduplicate_bismark.list -CPU 9
# Extract methylation calls with bismark_methylation_extractor
# (--CX reports every cytosine context, not just CpG).
for i in `ls *deduplicated.bam`
do
    echo "/opt/biosoft/Bismark-0.23.0/bismark_methylation_extractor $i -p --genome ./ --gzip --bedGraph --cytosine_report --CX --buffer_size 20G --output ./"
done > methylation_extractor.list
ParaFly -c methylation_extractor.list -CPU 9
/opt/biosoft/Bismark-0.23.0/bismark2report
/opt/biosoft/Bismark-0.23.0/bismark2summary
# Per-cytosine CpG reports; --merge_CpG combines the two strands of each CpG.
for i in `cat ../samples.txt`
do
    echo "/opt/biosoft/Bismark-0.23.0/coverage2cytosine $i.1_bismark_bt2_pe.deduplicated.bismark.cov.gz --merge_CpG --genome ./ -o $i.CpG.output"
done > coverage2cytosine.list
ParaFly -c coverage2cytosine.list -CPU 9
# Genomic vs observed nucleotide composition (bam2nuc quality check).
/opt/biosoft/Bismark-0.23.0/bam2nuc --genome_folder ./ --genomic_composition_only
for i in `cat ../samples.txt`
do
    echo "/opt/biosoft/Bismark-0.23.0/bam2nuc --genome_folder ./ $i.1_bismark_bt2_pe.deduplicated.bam"
done > bam2nuc.list
ParaFly -c bam2nuc.list -CPU 9



# Merge per-read CpG calls into a single bedGraph.
/opt/biosoft/Bismark-0.23.0/bismark2bedGraph -o buffer_CpG.bedGraph --buffer 5G CpG_*

ATAC-seq分析流程(一)

首先根据今年发表在Genome Biology上一篇综述总览分析流程:

作者建议的预处理分析流程为:FastQC-trimmomatic-BWA-MEM-ATACseqQC。
一、使用Fastqc对fastq 文件的测序质量统计

# FastQC on every fastq file; commands are run in parallel via ParaFly.
for i in `ls *fastq`; do echo "fastqc -t 4 -o ./ $i"; done > fastqc.sh
nohup ParaFly -c fastqc.sh -CPU 18 &

二、Trimmomatic质控(可使用 cutadapt,AdapterRemoval v2,Skewer 和 trimmomatic 等软件)
The forward and reverse adapters are slightly different. We will also trim low quality bases at the ends of the reads (quality less than 20). We will only keep reads that are at least 20 bases long. We remove short reads (< 20bp) as they are not useful, they will either be thrown out by the mapping or may interfere with our results at the end.

mkdir 1.trimmomatic
cd 1.trimmomatic/
# Paired-end adapter clipping and quality trimming; MINLEN:20 drops reads shorter than 20 bp.
for i in `cat ../samples.txt`
do
    echo "java -jar /opt/biosoft/Trimmomatic-0.38/trimmomatic-0.38.jar PE -threads 20 ../$i.1.fastq ../$i.2.fastq $i.1.fastq $i.1.unpaired.fastq $i.2.fastq $i.2.unpaired.fastq ILLUMINACLIP:/opt/biosoft/Trimmomatic-0.38/adapters/TruSeq3-PE-2.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:20 TOPHRED33 2> $i.trimmomatic.log"
done > command.trimmomatic.list
ParaFly -c command.trimmomatic.list -CPU 9
cd ..

三、Mapping Reads to Reference Genome

mkdir 2.bowtie2 && cd 2.bowtie2
bowtie2-build --threads 160 genome.fasta genome
# -I 0 -X 1000 bounds the allowed fragment length; --very-sensitive for accuracy.
for i in `cat ../samples.txt`
do
    echo "bowtie2 -x genome -1 ../1.trimmomatic/$i.1.fastq -2 ../1.trimmomatic/$i.2.fastq -p 160 -I 0 -X 1000 --very-sensitive -S $i.bowtie2.sam 2> $i.bowtie2.log"
done > bowtie2.list
ParaFly -c bowtie2.list -CPU 9
# Coordinate-sort and convert SAM to BAM.
for i in `cat ../samples.txt`
do
    echo "samtools sort -@ 40 -m 8G -o $i.bowtie2.bam -O BAM $i.bowtie2.sam"
done > sam2bam.list
ParaFly -c sam2bam.list -CPU 9
cd ..

四、Filtering Mapped Reads
1、Filter Duplicate Reads
去除重复可选择samtools、picard和sambamba,其中sambamba的操作速度最快,推荐使用,安装推荐使用conda

sambamba markdup -r --overflow-list-size 600000  --tmpdir='./' ../2.bowtie2/$i.bowtie2.bam $i.sambamba.rmdup.bam

–overflow-list-size=OVERFLOWLISTSIZE
size of the overflow list where reads, thrown away from the hash table, get a second chance to meet their pairs (default is 200000 reads); increasing the size reduces the number of temporary files created
-r, –remove-duplicates
remove duplicates instead of just marking them
-t, –nthreads=NTHREADS
number of threads to use
BUGS
External sort is not implemented. Thus, memory consumption grows by 2Gb per each 100M reads. Check that you have enough RAM before running the tool.

2、Filter Uninformative Reads

# Keep properly-paired (-f 2), MAPQ>=30 reads; drop reads on contig pilon373
# (presumably an organelle/contaminant scaffold — confirm) and re-sort.
samtools view -h -f 2 -q 30 $i.sambamba.rmdup.bam |grep -v pilon373| samtools sort -O bam -@ 40 -o - > $i.last.bam
samtools index $i.last.bam
samtools index $i.sambamba.rmdup.bam
samtools flagstat $i.sambamba.rmdup.bam > $i.sambamba.rmdup.stat
samtools flagstat $i.last.bam > $i.last.stat
# BED version of the final alignments, used later for peak calling.
bedtools bamtobed -i $i.last.bam > $i.bed

3、Check Insert Sizes

# Extract insert sizes (TLEN, SAM column 9) from each filtered BAM for plotting in R.
for i in `cat ../samples.txt`
do
    # Fixed: $9 must be escaped; inside the double-quoted echo the shell expanded
    # it to an empty string, so the generated command was awk '{print }' (whole line).
    echo "samtools view $i.last.bam |awk '{print \$9}'  > $i.last_length.txt"
done > last_bam_length.list
ParaFly -c last_bam_length.list -CPU 9

数据可导入R绘图
五、Peak calling(详细参数参考https://www.jianshu.com/p/21e8c51fca23)

macs2 callpeak -t ../3.filtering/$i.bed -g 8.9e8 --nomodel --shift -100 --extsize 200 -n $i.peaks --outdir ./

Upset plot:进阶版Venn plot实现多元素可视化

# Upset plot: multi-set intersection visualization with UpSetR.
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("UpSetR")
library(UpSetR)
pathway <- read.table("carp_combined_id1.xls", header = TRUE, sep = "\t")
# Highlight (in red) the intersection shared by all nine species.
upset(pathway, nsets = 9, nintersects = 50, mb.ratio = c(0.5, 0.5),
      order.by = "freq", decreasing = c(TRUE, FALSE),
      queries = list(list(query = intersects, params = list("datra", "caaur", "meamb", "onmac", "ctide", "cycar", "sigra", "sians", "sirhi"), color = "red", active = TRUE)))
# See the UpSetR documentation for the full argument list.
# Row/column transpose in shell (the original command was truncated):
# awk '{for(i=1;i<=NF;i++)a[i]=a[i]?a[i] FS $i:$i}END{for(i=1;i<=NF;i++)print a[i]}' input.txt > carp_combined_absentid_convert.txt

R画折线图

# Line chart of CDS counts per species with ggplot2.
library(ggplot2)
library(gcookbook)
data <- read.table("test.xls", header = TRUE, sep = "\t")
# group = 1 is required so geom_line() connects points across a discrete x axis;
# without it each species is its own group and no line is drawn.
ggplot(data, aes(x = Species, y = CDS, group = 1)) + geom_line(color = "red") + geom_point(size = 2, shape = 16, color = "red") + ylim(0, 1750) + theme_classic() + scale_x_discrete(limits = c("darer","datra","ctide","meamb","onmac","cycar","caaur","sirhi","sians","sigra"))
# scale_x_discrete reorders the x-axis categories without changing the plotted values
# theme_classic shows the X and Y axis lines

多物种全基因组比对得到保守的DNA序列

查阅不同文献和教程发现获得多物种保守的DNA序列(编码区和非编码区)主要过程有:
1.Repeat mask:通过RepeatMasker和RepeatModeler获得
2.Pairwise alignment: 用到的软件主要有last、lastz、blastz
3.Chaining: axtChain
4.Netting: chainNet
5.Mafing:
6.Combine multiple pairwise results:
7.PhastCons: PHAST

详细步骤如下

第一步:从数据库下载重复序列屏蔽后的基因组fasta文件,对自己组装的序列可以通过Geta获得
第二步:前边介绍过last的使用,但看文献发现使用lastz的比较多,有关last和lastz的比较(last aligner is considered faster and memory efficient. It creates maf file, which can converted to psl files. Then the same following processes can be used on psl files. Different from lastz, last aligner starts with fasta files. The target genome sequence has to build the index file first, and then align with the query genome sequence.),操作上last使用起来更加简单,参数选择较少,目前还不知道两者结果的异同(服务器正在运行,结果出来更新)。lastz和blastz的不同
lastz数据预处理:

# lastz preprocessing: convert each genome FASTA to 2bit.
for i in `cat 00.initial_data/reflect.txt | cut -f 1`
do
    echo "faToTwoBit 00.initial_data/$i.genome.fasta 00.initial_data/$i.genome.2bit"
done > fa2bit.list
ParaFly -c fa2bit.list -CPU 19

# Chromosome sizes per species, sorted largest first.
for i in `cat 00.initial_data/reflect.txt | cut -f 1`
do
    echo "twoBitInfo 00.initial_data/$i.genome.2bit stdout | sort -k2rn > $i.chrom.sizes"
done > chrom.sizes.list
ParaFly -c chrom.sizes.list -CPU 19
# One partition-list directory per species.
for i in `cat 00.initial_data/reflect.txt | cut -f 1`
do
    mkdir ${i}PartList
done
# Split every query genome (all except the darer reference) into 10 Mb chunks, no overlap.
for i in `cat 00.initial_data/reflect_no_darer.txt | cut -f 1`
do
    echo "/opt/biosoft/userApps/kent/src/hg/utils/automation/partitionSequence.pl 10000000 0 00.initial_data/$i.genome.2bit $i.chrom.sizes 1 -lstDir ${i}PartList > $i.part.list"
done > query_partitionSequence.list
ParaFly -c query_partitionSequence.list -CPU 18
# Partition the target (darer) genome into 20 Mb chunks with 10 kb overlap.
/opt/biosoft/userApps/kent/src/hg/utils/automation/partitionSequence.pl 20000000 10000 00.initial_data/darer.genome.2bit darer.chrom.sizes 1 -lstDir darerPartList > darer.part.list
grep -v PartList darer.part.list > darer.list
# Strip the PartList placeholder lines from each query's partition list.
for i in `cat 00.initial_data/reflect_no_darer.txt | cut -f 1`
do
    echo "grep -v PartList $i.part.list > $i.list"
done > grep_PartList.list   # fixed: list file was misspelled "1111.lsit"
ParaFly -c grep_PartList.list -CPU 18
# Append the per-chunk .lst entries to each query list.
for i in `cat 00.initial_data/reflect_no_darer.txt | cut -f 1`
do
    echo "cat ${i}PartList/*.lst >> $i.list"
done > cat_Part.list
ParaFly -c cat_Part.list -CPU 18
# Build lift files mapping chunk coordinates back to chromosome coordinates.
/opt/biosoft/userApps/kent/src/hg/utils/automation/constructLiftFile.pl darer.chrom.sizes darer.list > darer.lift
for i in `cat 00.initial_data/reflect_no_darer.txt | cut -f 1`
do
    echo "/opt/biosoft/userApps/kent/src/hg/utils/automation/constructLiftFile.pl $i.chrom.sizes $i.list > $i.lift"
done > constructLiftFile.list
ParaFly -c constructLiftFile.list -CPU 18
# Extract each 2bit chunk to its own FASTA file.
for i in `cat 00.initial_data/reflect.txt | cut -f 1`
do
    mkdir $i
    for x in `cat $i.list`
    do
        y=${x/*2bit:/}
        echo "twoBitToFa $x $i/$y.fa"
    done >> twoBitToFa.list
done
# Fixed: run the extraction BEFORE the length filter below; in the original the
# removal loop ran first, when no .fa files existed yet, so nothing was filtered.
ParaFly -c twoBitToFa.list -CPU 80
# Remove sequences shorter than 1000 bp. The end coordinate embedded in the file
# name (e.g. "...-950.fa") is used as the length proxy — assumes small scaffolds
# start at 0; confirm for your partition naming.
for i in `cat 00.initial_data/reflect_no_darer.txt | cut -f 1`
do
    for x in $i/*fa
    do
        y=${x/*-/}
        k=${y/.fa/}
        if [ $k -le 1000 ]
        then
            rm $x
        fi

    done
done

# All-vs-all lastz: every darer chunk against every query chunk.
# NOTE(review): the ${x}_axt output directories must exist before running
# lastz_all.list, and the axt path embeds "$i"/"$y" which contain slashes —
# verify the output paths resolve as intended.
for i in darer/*fa
do
    for x in `cat 00.initial_data/reflect_no_darer.txt | cut -f 1`
    do
        for y in $x/*fa
        do
            echo "lastz $i $y --strand=both --seed=12of19 --notransition --chain --gapped --gap=400,30 --hspthresh=3000 --gappedthresh=3000 --inner=2000 --masking=50 --ydrop=9400 --scores=/opt/biosoft/GenomeAlignmentTools/HoxD55.q --format=axt > ${x}_axt/$i.$y.axt"
        done >> lastz_all.list
    done
done

# lastz is slow; with many species and no compute cluster this approach is not recommended.

第三步:Chaining,将相邻的block连接起来,打分矩阵和blastz相同,gap打分改变

# Chaining: join neighbouring alignment blocks into chains with axtChain (loose gap scoring).
for i in `cat 00.initial_data/reflect_no_darer.txt | cut -f 1`
do
    echo "axtChain -linearGap=loose -psl $i.psl darer.genome.2bit $i.genome.2bit $i.Todarer.chain"
done > chain_axtChain.list
ParaFly -c chain_axtChain.list -CPU 18

第四步:Netting:chainNet,对target序列确定最优比对序列。
1.首先将所有的染色体或scaffold的碱基标记未用的。
2.按打分由高到低排列,形成列表。
3.迭代:每次从列表中取出一个chain,扔掉与已经存在的chain有overlap的区域,余下的部分添加上去,如果和之前的chain有gap,标记成子集,通过这种方式形成的层级称为net。记录overlap的区域,用于下一步识别重复。

# Netting: merge and sort all chains, then build the best-covering net hierarchy per genome.
chainMergeSort $output_dir/3.chain/*.chain > $output_dir/4.prenet/all.chain
chainPreNet $output_dir/4.prenet/all.chain $output_dir/$tn.sizes $output_dir/$qn.sizes $output_dir/4.prenet/all_sort.chain
chainNet $output_dir/4.prenet/all_sort.chain $output_dir/$tn.sizes $output_dir/$qn.sizes $output_dir/5.net/temp.tn $output_dir/5.net/temp.qn
# netSyntenic annotates each net with synteny statistics.
netSyntenic $output_dir/5.net/temp.tn $output_dir/5.net/$tn.net
netSyntenic $output_dir/5.net/temp.qn $output_dir/5.net/$qn.net

第五步:Mafing

# Mafing: convert nets back to axt, sort, and emit MAF with species-prefixed sequence names.
netToAxt $output_dir/5.net/$tn.net $output_dir/4.prenet/all_sort.chain $output_dir/$tn.2bit $output_dir/$qn.2bit $output_dir/6.net_to_axt/all.axt
axtSort $output_dir/6.net_to_axt/all.axt $output_dir/6.net_to_axt/all_sort.axt
axtToMaf -tPrefix=$tn -qPrefix=$qn $output_dir/6.net_to_axt/all_sort.axt $output_dir/$tn.sizes $output_dir/$qn.sizes $output_dir/7.maf/all.maf

第六步:Combine multiple pairwise results:

roast + E=darer tree.txt ./*Final.maf all.roast.maf

PSMC分析流程

# PSMC demographic-history inference from a single diploid genome.
bowtie2-build ../genome.fasta genome
bowtie2 -x genome -p 80 -1 reads.1.fastq -2 reads.2.fastq -S bowtie2.sam
samtools sort -o bowtie2_sort.bam -O BAM -@ 40 -m 4G bowtie2.sam
# Call a diploid consensus with samtools 0.1.18 (the PSMC-compatible toolchain).
/opt/biosoft/samtools-0.1.18/samtools mpileup -C50 -uf ../genome.fasta bowtie2_sort.bam > gc_psmc.bcf
/opt/biosoft/samtools-0.1.18/bcftools/bcftools view -c gc_psmc.bcf > Pb_2G.vcf
# Filter by depth (10-100x) and convert to fastq-style consensus.
vcfutils.pl vcf2fq -d 10 -D 100 Pb_2G.vcf | gzip > diploid.fq.gz
/opt/biosoft/psmc-master/utils/fq2psmcfa -q20 diploid.fq.gz > diploid.psmcfa
# splitfa prepares the segmented input used for bootstrapping.
/opt/biosoft/psmc-master/utils/splitfa diploid.psmcfa > split.psmcfa
/opt/biosoft/psmc-master/psmc -N25 -t15 -r5 -p "4+25*2+4+6" -o diploid.psmc diploid.psmcfa
# 100 bootstrap rounds (-b) on the split input.
# Fixed: the bootstrap input was "split.fa", but splitfa wrote "split.psmcfa".
seq 100 | xargs -i echo /opt/biosoft/psmc-master/psmc -N25 -t15 -r5 -b -p "4+25*2+4+6" -o round-{}.psmc split.psmcfa | sh
cat diploid.psmc round-*.psmc > combined.psmc
/opt/biosoft/psmc-master/utils/psmc_plot.pl -g x -u y combined combined.psmc
# x = generation time in years for this species (e.g. -g 25 for humans).
# y = per-site substitution rate, e.g. tree branch length divided by the r8s divergence-time estimate.

使用AdmixTools做D-statistics

安装软件和缺少的库文件

git clone https://github.com/DReichLab/AdmixTools.git
cd AdmixTools/src
make clobber
make all
#如果报错/usr/bin/ld: cannot find -lopenblas说明缺少libopenblas库文件
git clone https://github.com/xianyi/OpenBLAS.git
cd OpenBLAS
make
make PREFIX=/path/to/your/installation install
cd /usr/lib/
ln -s /opt/biosoft/OpenBLAS/lib/libopenblas_nehalemp-r0.3.10.dev.a ./libopenblas.a
ln -s /opt/biosoft/OpenBLAS/lib/libopenblas_nehalemp-r0.3.10.dev.so ./libopenblas.so
cd /opt/biosoft/AdmixTools/src/
make clean
make all && make install