Anvi’o 安装

Dependencies

  • DIAMOND or NCBI’s blastp for search.
  • MCL for clustering.
  • muscle for alignment.

easy install through conda:

 

wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
chmod 777 Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh
#在询问是否将conda加入环境变量的时候选择no
cd miniconda3/bin/
chmod 777 activate
. ./activate
#添加频道
conda config --env --add channels conda-forge
conda config --env --add channels bioconda

conda create -n anvio-6 python=3.6
conda activate anvio-6
conda install -y anvio=6
conda install -y diamond=0.9.14

具体用法 anvi’o官网教程:http://merenlab.org/2016/11/08/pangenomics-v2/

提取单拷贝同源基因

anvi-get-sequences-for-gene-clusters -p AH_all_Pan-PAN.db -g ../PROCHLORO-GENOMES.db -o SCG.fasta --max-num-genes-from-each-genome 1 --min-num-genomes-gene-cluster-occurs 85
grep '>' SCG.fasta | wc -l
#结果除以85就是单拷贝同源基因的数量
anvi-get-sequences-for-gene-clusters -p AH_all_Pan-PAN.db -g ../PROCHLORO-GENOMES.db -o Singletons.fasta -C default -b Singletons
#提取感兴趣的bin

MLST细菌分型

多位点序列分型(multilocus sequence typing,MLST)是一种基于核酸序列测定的细菌分型方法。这种方法通过PCR扩增多个管家基因内部片段并测定其序列,分析菌株的变异。

依赖conda安装

conda install -c conda-forge -c bioconda -c defaults mlst
mlst contigs.fa
contigs.fa  neisseria  11149  abcZ(672) adk(3) aroE(4) fumC(3) gdh(8) pdhC(4) pgm(6)

mlst genomes/*
genomes/6008.fna        saureus         239  arcc(2)   aroe(3)   glpf(1)   gmk_(1)   pta_(4)   tpi_(4)   yqil(3)
genomes/strep.fasta.gz  ssuis             1  aroA(1)   cpn60(1)  dpr(1)    gki(1)    mutS(1)   recA(1)   thrA(1)
genomes/NC_002973.gbk   lmonocytogenes    1  abcZ(3)   bglA(1)   cat(1)    dapE(1)   dat(3)    ldh(1)    lhkA(3)
genomes/L550.gbk.bz2    leptospira      152  glmU(26)  pntA(30)  sucA(28)  tpiA(35)  pfkB(39)  mreA(29)  caiB(29)

mlst --scheme neisseria NM*
NM003.fa   neisseria  4821  abcZ(222)  adk(3)  aroE(58)  fumC(275)  gdh(30)  pdhC(5)  pgm(255)
NM005.gbk  neisseria  177   abcZ(7)    adk(8)  aroE(10)  fumC(38)   gdh(10)  pdhC(1)  pgm(20)
NM011.fa   neisseria  11    abcZ(2)    adk(3)  aroE(4)   fumC(3)    gdh(8)   pdhC(4)  pgm(6)
NMC.gbk.gz neisseria  8     abcZ(2)    adk(3)  aroE(7)   fumC(2)    gdh(8)   pdhC(5)  pgm(2)

mlst --legacy --scheme neisseria *.fa
FILE      SCHEME     ST    abcZ  adk  aroE  fumC  gdh  pdhC  pgm
NM003.fa  neisseria  11    2     3    4     3       8     4    6
NM009.fa  neisseria  11149 672   3    4     3       8     4    6
MN043.fa  neisseria  11    2     3    4     3       8     4    6
NM051.fa  neisseria  11    2     3    4     3       8     4    6
NM099.fa  neisseria  1287  2     3    4    17       8     4    6
NM110.fa  neisseria  11    2     3    4     3       8     4    6

链接:Mlst官网

ARDB和VFDB数据库比对后筛选

python .py -i db.fasta -I blastresult.tab -o selected.txt -O filtered.txt
blastresult.tab是根据P值sort后的文件

from __future__ import division
import re
import sys, getopt
import operator
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_nucleotide

opts, args = getopt.getopt(sys.argv[1:], "hI:i:o:O:")
input_info1= ""
input_info2= ""
out_file1 = ""
out_file2 = ""

for op, value in opts:
    if op =="-o": #ARG_pattern_MIN_7030.fasta or ARG_pattern_MAX_7030.fasta
        out_file1 = open(value, "w")
        filename_pro = str (value)
        name_item = filename_pro.strip().split(".")
        filename = name_item[0]
    elif op == "-i": # All_ARGcontig_remove_S_nr.fasta
        input_info1 = open(value, "r")
    elif op == "-I":
        input_info2 = open(value, "r")
    elif op == "-O":
        out_file2 = open(value, "w")
        filename_pro = str (value)
        name_item = filename_pro.strip().split(".")
        filename = name_item[0]      

dict1 = {} 
dict2 = {}
for record in SeqIO.parse(input_info1,"fasta"):
    h = len(record.seq)
   
    dict1[record.id]=h
    dict2[record.id]=record.description
    
        
for line1 in input_info2:
    info1 = line1.strip("\n").split("\t")
    a = float(info1[2])
    b = float(info1[3])
    c = b/float(dict1[info1[1]])
    if a >= 80.0 and c >= 0.8:
        out_file1.write(str(dict1[info1[1]])+'\t'+str(b)+'\t'+dict2[info1[1]]+'\t'+line1)
    else:
        out_file2.write(str(dict1[info1[1]])+'\t'+str(b)+'\t'+dict2[info1[1]]+'\t'+line1)
input_info1.close()
input_info2.close()
out_file1.close()
out_file2.close()

ARDB注释抗药基因

wget -c ftp://ftp.cbcb.umd.edu/pub/data/ARDB/ARDBflatFiles.tar.gz
tar -zxvf ARDBflatFiles.tar.gz
wget -c ftp://ftp.cbcb.umd.edu/pub/data/ARDB/ardbAnno1.0.tar.gz
tar -zxvf ardbAnno1.0.tar.gz
makeblastdb -in resisGenes.pfasta -dbtype prot -out ARDB
vim genomeList.tab #目的蛋白序列路径
perl ardbAnno.pl