ARDB和VFDB数据库比对后筛选

python .py -i db.fasta -I blastresult.tab -o selected.txt -O filtered.txt
blastresult.tab是根据P值sort后的文件

from __future__ import division
import re
import sys, getopt
import operator
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_nucleotide

opts, args = getopt.getopt(sys.argv[1:], "hI:i:o:O:")
input_info1= ""
input_info2= ""
out_file1 = ""
out_file2 = ""

for op, value in opts:
    if op =="-o": #ARG_pattern_MIN_7030.fasta or ARG_pattern_MAX_7030.fasta
        out_file1 = open(value, "w")
        filename_pro = str (value)
        name_item = filename_pro.strip().split(".")
        filename = name_item[0]
    elif op == "-i": # All_ARGcontig_remove_S_nr.fasta
        input_info1 = open(value, "r")
    elif op == "-I":
        input_info2 = open(value, "r")
    elif op == "-O":
        out_file2 = open(value, "w")
        filename_pro = str (value)
        name_item = filename_pro.strip().split(".")
        filename = name_item[0]      

dict1 = {} 
dict2 = {}
for record in SeqIO.parse(input_info1,"fasta"):
    h = len(record.seq)
   
    dict1[record.id]=h
    dict2[record.id]=record.description
    
        
for line1 in input_info2:
    info1 = line1.strip("\n").split("\t")
    a = float(info1[2])
    b = float(info1[3])
    c = b/float(dict1[info1[1]])
    if a >= 80.0 and c >= 0.8:
        out_file1.write(str(dict1[info1[1]])+'\t'+str(b)+'\t'+dict2[info1[1]]+'\t'+line1)
    else:
        out_file2.write(str(dict1[info1[1]])+'\t'+str(b)+'\t'+dict2[info1[1]]+'\t'+line1)
input_info1.close()
input_info2.close()
out_file1.close()
out_file2.close()

发表评论

电子邮件地址不会被公开。 必填项已用*标注