Python
bioinformatics
Biopython

Convert GFF3 to MSS for DDBJ submission

ゲノムアノテーション情報の記述によく使われるGFF3形式を、DDBJへのゲノミックデータの登録に要求されるアノテーション形式(MSS)に変換するためのスクリプトを作成しました。

GFF2MSS.py genome.gff

という形で使って下さい。
バグ取りが完全ではないので使用注意です。
タンパク質コード領域(CDS)のみに対応しています。
登録者情報などを記述する部分(COMMON領域)は手打ちして下さい。

下記に想定しているGFF3ファイルの例を置きます

GFF2MSS.py
#!/usr/bin/python
# coding: UTF-8



import sys
from Bio import SeqIO
from Bio import Seq
from BCBio import GFF

def region_end(regions, contig):
    """Return the end coordinate recorded for *contig*, or None.

    regions: the list stored under rec.annotations["sequence-region"].
        Each usable entry is expected to be a 3-item sequence
        (seqid, start, end) -- NOTE(review): confirm against the
        bcbio-gff version in use.
    contig: the record id to look up.

    The lookup is done by name instead of by list position so the result
    does not depend on records being yielded in the same order as the
    ``##sequence-region`` directives.
    """
    for region in regions:
        if len(region) == 3 and region[0] == contig:
            return region[2]
    return None


def format_location(spans, strand):
    """Build the location string for a CDS.

    spans: list of (start, end) pairs, already 1-based and inclusive, in
        the order they should appear in the output.
    strand: -1 wraps the location in ``complement(...)``; any other value
        leaves it as is.

    More than one span is wrapped in ``join(...)``.
    """
    location = ",".join(str(start) + ".." + str(end) for start, end in spans)
    if len(spans) > 1:
        location = "join(" + location + ")"
    if strand == -1:
        location = "complement(" + location + ")"
    return location


def mrna_to_lines(mrna):
    """Return the MSS lines describing the CDS of one mRNA-level feature.

    Only children whose ``type`` is "CDS" contribute to the location, so
    exon/UTR children do not end up inside the join. A feature without any
    CDS child yields an empty list instead of a CDS entry with an empty
    location.

    Raises KeyError when the feature has no "ID" or "Note" qualifier; the
    "Note" qualifier is what gets written as locus_tag.
    """
    cds_features = [sub for sub in mrna.sub_features if sub.type == "CDS"]
    if not cds_features:
        return []
    # location.start is used 0-based here, hence the +1 for the 1-based output.
    spans = [(cds.location.start + 1, cds.location.end) for cds in cds_features]
    # The value of the last CDS wins, as in the original loop.
    transl_table = cds_features[-1].qualifiers.get("transl_table", ["1"])
    mrna_id = mrna.qualifiers["ID"]
    locus_tag = mrna.qualifiers["Note"]
    product = mrna.qualifiers.get("product", ["Unknown_product"])
    return [
        "\tCDS\t" + format_location(spans, mrna.strand) + "\tcodon_start\t1",
        "\t\t\t" + "locus_tag\t" + locus_tag[0],
        "\t\t\t" + "note\t" + mrna_id[0],
        "\t\t\t" + "product\t" + product[0],
        "\t\t\t" + "transl_table\t" + transl_table[0],
    ]


def record_to_lines(rec, with_source=True):
    """Return all MSS lines for one parsed GFF record.

    rec: object with ``id``, ``annotations`` and ``features``; every
        feature exposes ``sub_features`` (gene -> mRNA -> CDS nesting).
    with_source: emit the two-line ``source`` header before the features.

    Returns None when the header is requested but no sequence-region entry
    names this contig. Feature lines have an empty entry column, so without
    a header they would be read as belonging to the previous entry; the
    caller is expected to skip such a record.
    """
    contig = rec.id
    lines = []
    if with_source:
        end = region_end(rec.annotations.get("sequence-region", []), contig)
        if end is None:
            return None
        lines.append(
            contig + "\t" + "source" + "\t" + "1.." + str(end) + "\t"
            + "ff_definition" + "\t" + "@@[organism]@@ DNA, contig: " + contig
        )
        lines.append("\t" + "\t" + "\t" + "note" + "\t" + "contig: " + contig)
    for gene in rec.features:
        for mrna in gene.sub_features:
            lines.extend(mrna_to_lines(mrna))
    return lines


def main(argv=None):
    """Convert the GFF3 file named in argv[1] and print MSS lines to stdout.

    argv: argument list including the program name; defaults to sys.argv.
    Returns 0 on success and 1 when the input path is missing.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        sys.stderr.write("usage: GFF2MSS.py genome.gff\n")
        return 1
    previous_contig = ""
    # The with-block closes the input file even if parsing raises.
    with open(args[1]) as in_handle:
        for rec in GFF.parse(in_handle):
            lines = record_to_lines(rec, with_source=(rec.id != previous_contig))
            if lines is None:
                sys.stderr.write(
                    "warning: no ##sequence-region for " + rec.id
                    + "; record skipped\n"
                )
                continue
            previous_contig = rec.id
            for line in lines:
                # A single parenthesised string prints identically with the
                # Python 2 print statement and the Python 3 print function.
                print(line)
    return 0


if __name__ == "__main__":
    sys.exit(main())
example.gff
##sequence-region unitig_0 1 307079
unitig_0    .   gene    1137    4305    .   -   .   ID=LOCUS_TAG_0000100;Note=g35293
unitig_0    .   mRNA    1137    4305    .   -   .   ID=g35293.t1;Note=LOCUS_TAG_0000100;Parent=LOCUS_TAG_0000100;product=hypothetical protein
unitig_0    .   CDS 1137    1462    .   -   1   ID=g35293.t1_1;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0    .   CDS 2201    2583    .   -   1   ID=g35293.t1_2;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0    .   CDS 2900    3031    .   -   1   ID=g35293.t1_3;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0    .   CDS 3381    3597    .   -   1   ID=g35293.t1_4;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0    .   CDS 3666    4073    .   -   1   ID=g35293.t1_5;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0    .   CDS 4278    4305    .   -   1   ID=g35293.t1_6;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0    .   gene    4844    5880    .   -   .   ID=LOCUS_TAG_0000200;Note=g35294
unitig_0    .   mRNA    4844    5880    .   -   .   ID=g35294.t1;Note=LOCUS_TAG_0000200;Parent=LOCUS_TAG_0000200;product=hypothetical protein
unitig_0    .   CDS 4844    5544    .   -   1   ID=g35294.t1_1;Name=LOCUS_TAG_0000200;Note=g35294.t1;Parent=g35294.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.317
unitig_0    .   CDS 5628    5880    .   -   1   ID=g35294.t1_2;Name=LOCUS_TAG_0000200;Note=g35294.t1;Parent=g35294.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.317
unitig_0    .   gene    6030    6836    .   -   .   ID=LOCUS_TAG_0000300;Note=g35295
unitig_0    .   mRNA    6030    6836    .   -   .   ID=g35295.t1;Note=LOCUS_TAG_0000300;Parent=LOCUS_TAG_0000300;product=hypothetical protein
unitig_0    .   CDS 6030    6335    .   -   1   ID=g35295.t1_1;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0    .   CDS 6427    6468    .   -   1   ID=g35295.t1_2;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0    .   CDS 6539    6739    .   -   1   ID=g35295.t1_3;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0    .   CDS 6819    6836    .   -   1   ID=g35295.t1_4;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0    .   gene    13014   14367   .   +   .   ID=LOCUS_TAG_0000400;Note=g35296
unitig_0    .   mRNA    13014   14367   .   +   .   ID=g35296.t1;Note=LOCUS_TAG_0000400;Parent=LOCUS_TAG_0000400;product=hypothetical protein
unitig_0    .   CDS 13014   13016   .   +   1   ID=g35296.t1_1;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0    .   CDS 13106   13201   .   +   1   ID=g35296.t1_2;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0    .   CDS 13271   13316   .   +   1   ID=g35296.t1_3;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0    .   CDS 13439   13638   .   +   1   ID=g35296.t1_4;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0    .   CDS 13716   13893   .   +   1   ID=g35296.t1_5;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0    .   CDS 13961   14119   .   +   1   ID=g35296.t1_6;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0    .   CDS 14186   14367   .   +   1   ID=g35296.t1_7;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287