ゲノムアノテーション情報の記述によく使われるGFF3形式を、DDBJへのゲノミックデータの登録に要求されるアノテーション形式(MSS)に変換するためのスクリプトを作成しました。
GFF2MSS.py genome.gff
という形で使って下さい。
###########
2023/05/17追記
tDNA, rDNA, N-gap に対応したバージョンをgithubにあげていますので、こちらをご利用ください。
############
バグ取りが完全ではないので使用注意です。
タンパク質コード領域に対応しています。
登録者情報などを記述する部分(COMMON領域)は手打ちして下さい。
下記に想定しているGFF3ファイルの例を置きます
https://github.com/billzt/gff3sort
で処理したりすると良いです
GFF2MSS.py
#!/usr/bin/python
# coding: UTF-8
import sys
from Bio import SeqIO
from Bio import Seq
from BCBio import GFF
# GFF3 -> DDBJ MSS annotation converter (protein-coding features only).
# Usage: GFF2MSS.py genome.gff
# Written so that it runs unchanged on both Python 2 and Python 3
# (single-argument print calls, no f-strings).


def format_location(segments, strand):
    """Build the location string of one CDS feature.

    segments -- sequence of (start, end) pairs, 1-based and inclusive,
                already in the order they should be written.
    strand   -- -1 wraps the location in ``complement(...)``; any other
                value leaves it unwrapped.

    Two or more segments are wrapped in ``join(...)``.
    """
    location = ",".join("%s..%s" % (start, end) for start, end in segments)
    if len(segments) > 1:
        location = "join(" + location + ")"
    if strand == -1:
        location = "complement(" + location + ")"
    return location


def sequence_region_end(regions, contig_id, fallback_index):
    """Return the end coordinate recorded for *contig_id*, or None.

    regions        -- the record's ``sequence-region`` annotation list.
    contig_id      -- id of the record being written.
    fallback_index -- position used when no entry names *contig_id*.

    Entries are assumed to be (contig_id, start, end) tuples built from
    ``##sequence-region <id> <start> <end>`` directives -- confirm against
    the installed bcbio-gff version.  Matching by id keeps the length
    correct even when some directive has no corresponding record (for
    example a contig without any annotated feature), which a purely
    positional lookup gets wrong.
    """
    for entry in regions:
        if isinstance(entry, tuple) and len(entry) == 3 and entry[0] == contig_id:
            return entry[2]
    # No directive names this contig: keep the positional lookup as a
    # last resort so inputs that only worked positionally still convert.
    if fallback_index < len(regions):
        return regions[fallback_index][2]
    return None


def _print_cds(mrna_f):
    """Print the MSS lines of the CDS carried by one mRNA feature."""
    segments = []
    transl_table = ["1"]
    for sub_f in mrna_f.sub_features:
        # Only CDS children contribute to the location; exon/UTR children
        # would otherwise duplicate or corrupt the join().
        if sub_f.type != "CDS":
            continue
        transl_table = sub_f.qualifiers.get("transl_table", ["1"])
        # location.start is 0-based, MSS coordinates are 1-based.
        segments.append((int(sub_f.location.start) + 1, int(sub_f.location.end)))
    if not segments:
        # Nothing to report for an mRNA without CDS children.
        return

    mrna_id = mrna_f.qualifiers["ID"]
    locus_tag_id = mrna_f.qualifiers["Note"]
    product_name = mrna_f.qualifiers.get("product", ["Unknown_product"])

    print("\tCDS\t" + format_location(segments, mrna_f.strand) + "\tcodon_start\t1")
    print("\t\t\t" + "locus_tag\t" + locus_tag_id[0])
    print("\t\t\t" + "note\t" + mrna_id[0])
    print("\t\t\t" + "product\t" + product_name[0])
    print("\t\t\t" + "transl_table\t" + transl_table[0])


def main():
    """Convert the GFF3 file named on the command line to MSS on stdout."""
    if len(sys.argv) < 2:
        sys.stderr.write("usage: GFF2MSS.py genome.gff\n")
        sys.exit(1)
    in_file = sys.argv[1]

    previous_contig = ""
    contig_count = 0
    # The context manager closes the file even if parsing raises.
    with open(in_file) as in_handle:
        for rec in GFF.parse(in_handle):
            contig = rec.id
            regions = rec.annotations["sequence-region"]
            contig_end = sequence_region_end(regions, contig, contig_count)
            if contig_end is None:
                # More records than sequence-region directives: stop here.
                sys.stderr.write(
                    "warning: no sequence-region found for " + contig
                    + "; stopping\n")
                break

            if previous_contig != contig:
                # A new contig starts with its source feature.
                print(contig + "\t" + "source" + "\t" + str(1) + ".."
                      + str(contig_end) + "\t" + "ff_definition" + "\t"
                      + "@@[organism]@@ DNA, contig: " + contig)
                print("\t" + "\t" + "\t" + "note" + "\t" + "contig: " + contig)
                previous_contig = contig

            for gene_f in rec.features:
                for mrna_f in gene_f.sub_features:
                    _print_cds(mrna_f)

            contig_count += 1


if __name__ == "__main__":
    main()
example.gff
##sequence-region unitig_0 1 307079
unitig_0 . gene 1137 4305 . - . ID=LOCUS_TAG_0000100;Note=g35293
unitig_0 . mRNA 1137 4305 . - . ID=g35293.t1;Note=LOCUS_TAG_0000100;Parent=LOCUS_TAG_0000100;product=hypothetical protein
unitig_0 . CDS 1137 1462 . - 1 ID=g35293.t1_1;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . CDS 2201 2583 . - 1 ID=g35293.t1_2;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . CDS 2900 3031 . - 1 ID=g35293.t1_3;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . CDS 3381 3597 . - 1 ID=g35293.t1_4;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . CDS 3666 4073 . - 1 ID=g35293.t1_5;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . CDS 4278 4305 . - 1 ID=g35293.t1_6;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . gene 4844 5880 . - . ID=LOCUS_TAG_0000200;Note=g35294
unitig_0 . mRNA 4844 5880 . - . ID=g35294.t1;Note=LOCUS_TAG_0000200;Parent=LOCUS_TAG_0000200;product=hypothetical protein
unitig_0 . CDS 4844 5544 . - 1 ID=g35294.t1_1;Name=LOCUS_TAG_0000200;Note=g35294.t1;Parent=g35294.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.317
unitig_0 . CDS 5628 5880 . - 1 ID=g35294.t1_2;Name=LOCUS_TAG_0000200;Note=g35294.t1;Parent=g35294.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.317
unitig_0 . gene 6030 6836 . - . ID=LOCUS_TAG_0000300;Note=g35295
unitig_0 . mRNA 6030 6836 . - . ID=g35295.t1;Note=LOCUS_TAG_0000300;Parent=LOCUS_TAG_0000300;product=hypothetical protein
unitig_0 . CDS 6030 6335 . - 1 ID=g35295.t1_1;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0 . CDS 6427 6468 . - 1 ID=g35295.t1_2;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0 . CDS 6539 6739 . - 1 ID=g35295.t1_3;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0 . CDS 6819 6836 . - 1 ID=g35295.t1_4;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0 . gene 13014 14367 . + . ID=LOCUS_TAG_0000400;Note=g35296
unitig_0 . mRNA 13014 14367 . + . ID=g35296.t1;Note=LOCUS_TAG_0000400;Parent=LOCUS_TAG_0000400;product=hypothetical protein
unitig_0 . CDS 13014 13016 . + 1 ID=g35296.t1_1;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 13106 13201 . + 1 ID=g35296.t1_2;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 13271 13316 . + 1 ID=g35296.t1_3;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 13439 13638 . + 1 ID=g35296.t1_4;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 13716 13893 . + 1 ID=g35296.t1_5;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 13961 14119 . + 1 ID=g35296.t1_6;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 14186 14367 . + 1 ID=g35296.t1_7;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287