LoginSignup
1
1

Convert GFF3 to MSS for DDBJ submission

Last updated at Posted at 2017-08-05

ゲノムアノテーション情報の記述によく使われるGFF3形式を、DDBJへのゲノムデータ登録に要求されるアノテーション形式(MSS)に変換するためのスクリプトを作成しました。

GFF2MSS.py genome.gff

という形で使って下さい。

###########
2023/05/17追記
tDNA, rDNA, N-gap に対応したバージョンをgithubにあげていますので、こちらをご利用ください。

############

バグ取りが完全ではないので使用注意です。
タンパク質コード領域に対応しています。
登録者情報などを記述する部分(COMMON領域)は手打ちして下さい。

想定しているGFF3ファイルの例(example.gff)を下記に置きます。
入力するGFF3ファイルは、あらかじめ gff3sort (https://github.com/billzt/gff3sort) で処理(ソート)しておくと良いです。

GFF2MSS.py
#!/usr/bin/python
# coding: UTF-8



import sys
from Bio import SeqIO
from Bio import Seq
from BCBio import GFF

def region_end(regions, contig, index):
    """Return the end coordinate of ``contig`` as a string, or None.

    ``regions`` is the record's "sequence-region" annotation; the end
    coordinate of an entry is read as ``entry[2]``.

    Entries are matched by name first.  This assumes ``entry[0]`` is the
    sequence ID, as in the ``##sequence-region <id> <start> <end>`` pragma
    (TODO confirm against the installed bcbio-gff).  When no name matches,
    the entry at position ``index`` is used instead, which is the lookup the
    script performed originally.  None is returned when neither works.
    """
    for region in regions:
        if region[0] == contig:
            return str(region[2])
    if index < len(regions):
        return str(regions[index][2])
    return None


def format_location(strand, spans):
    """Build the location string for a list of 1-based (start, end) spans.

    Two or more spans are wrapped in ``join(...)``; a strand of -1 wraps the
    whole expression in ``complement(...)``.  Spans are emitted in the order
    given.
    """
    location = ",".join("%d..%d" % (start, end) for start, end in spans)
    if len(spans) > 1:
        location = "join(" + location + ")"
    if strand == -1:
        location = "complement(" + location + ")"
    return location


def write_transcript(mrna, out):
    """Write the CDS entry of one transcript feature to ``out``.

    Only children whose type is "CDS" contribute to the location, so exon or
    UTR rows under the same parent are not merged into the join.  A feature
    without any CDS child is skipped instead of producing an entry with an
    empty location.

    Raises KeyError when the feature lacks the "ID" or "Note" attribute
    (the latter is used as the locus_tag).
    """
    cds_parts = [sub for sub in mrna.sub_features if sub.type == "CDS"]
    if not cds_parts:
        return

    # Parsed start positions are 0-based, hence the +1 for the output.
    spans = [(int(cds.location.start) + 1, int(cds.location.end))
             for cds in cds_parts]
    location = format_location(mrna.location.strand, spans)

    mrna_id = mrna.qualifiers["ID"]
    locus_tag = mrna.qualifiers["Note"]
    product = mrna.qualifiers.get("product", ["Unknown_product"])
    # As before, the value found on the last CDS row wins.
    transl_table = cds_parts[-1].qualifiers.get("transl_table", ["1"])

    # codon_start is always written as 1.
    out.write("\tCDS\t" + location + "\tcodon_start\t1\n")
    out.write("\t\t\tlocus_tag\t" + locus_tag[0] + "\n")
    out.write("\t\t\tnote\t" + mrna_id[0] + "\n")
    out.write("\t\t\tproduct\t" + product[0] + "\n")
    out.write("\t\t\ttransl_table\t" + transl_table[0] + "\n")


def convert(gff_path, out):
    """Read the GFF3 file at ``gff_path`` and write MSS feature rows to ``out``.

    For every record a "source" entry is written (when the contig differs
    from the previous record), followed by one CDS entry per transcript.
    The COMMON section is not generated.

    Exits with an error message when the length of a contig cannot be
    determined from the "sequence-region" annotation; previously the output
    was cut short without notice in that situation.
    """
    previous_contig = ""
    # The context manager closes the file even if parsing fails midway.
    with open(gff_path) as in_handle:
        for index, rec in enumerate(GFF.parse(in_handle)):
            contig = rec.id
            if previous_contig != contig:
                regions = rec.annotations.get("sequence-region", [])
                end = region_end(regions, contig, index)
                if end is None:
                    sys.exit("GFF2MSS: no ##sequence-region entry found for "
                             "contig '%s'" % contig)
                out.write(contig + "\tsource\t1.." + end
                          + "\tff_definition\t@@[organism]@@ DNA, contig: "
                          + contig + "\n")
                out.write("\t\t\tnote\tcontig: " + contig + "\n")
            previous_contig = contig
            for gene in rec.features:
                for mrna in gene.sub_features:
                    write_transcript(mrna, out)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: GFF2MSS.py genome.gff")
    convert(sys.argv[1], sys.stdout)
example.gff
##sequence-region unitig_0 1 307079
unitig_0	.	gene	1137	4305	.	-	.	ID=LOCUS_TAG_0000100;Note=g35293
unitig_0	.	mRNA	1137	4305	.	-	.	ID=g35293.t1;Note=LOCUS_TAG_0000100;Parent=LOCUS_TAG_0000100;product=hypothetical protein
unitig_0	.	CDS	1137	1462	.	-	1	ID=g35293.t1_1;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0	.	CDS	2201	2583	.	-	1	ID=g35293.t1_2;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0	.	CDS	2900	3031	.	-	1	ID=g35293.t1_3;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0	.	CDS	3381	3597	.	-	1	ID=g35293.t1_4;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0	.	CDS	3666	4073	.	-	1	ID=g35293.t1_5;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0	.	CDS	4278	4305	.	-	1	ID=g35293.t1_6;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0	.	gene	4844	5880	.	-	.	ID=LOCUS_TAG_0000200;Note=g35294
unitig_0	.	mRNA	4844	5880	.	-	.	ID=g35294.t1;Note=LOCUS_TAG_0000200;Parent=LOCUS_TAG_0000200;product=hypothetical protein
unitig_0	.	CDS	4844	5544	.	-	1	ID=g35294.t1_1;Name=LOCUS_TAG_0000200;Note=g35294.t1;Parent=g35294.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.317
unitig_0	.	CDS	5628	5880	.	-	1	ID=g35294.t1_2;Name=LOCUS_TAG_0000200;Note=g35294.t1;Parent=g35294.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.317
unitig_0	.	gene	6030	6836	.	-	.	ID=LOCUS_TAG_0000300;Note=g35295
unitig_0	.	mRNA	6030	6836	.	-	.	ID=g35295.t1;Note=LOCUS_TAG_0000300;Parent=LOCUS_TAG_0000300;product=hypothetical protein
unitig_0	.	CDS	6030	6335	.	-	1	ID=g35295.t1_1;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0	.	CDS	6427	6468	.	-	1	ID=g35295.t1_2;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0	.	CDS	6539	6739	.	-	1	ID=g35295.t1_3;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0	.	CDS	6819	6836	.	-	1	ID=g35295.t1_4;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0	.	gene	13014	14367	.	+	.	ID=LOCUS_TAG_0000400;Note=g35296
unitig_0	.	mRNA	13014	14367	.	+	.	ID=g35296.t1;Note=LOCUS_TAG_0000400;Parent=LOCUS_TAG_0000400;product=hypothetical protein
unitig_0	.	CDS	13014	13016	.	+	1	ID=g35296.t1_1;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0	.	CDS	13106	13201	.	+	1	ID=g35296.t1_2;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0	.	CDS	13271	13316	.	+	1	ID=g35296.t1_3;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0	.	CDS	13439	13638	.	+	1	ID=g35296.t1_4;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0	.	CDS	13716	13893	.	+	1	ID=g35296.t1_5;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0	.	CDS	13961	14119	.	+	1	ID=g35296.t1_6;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0	.	CDS	14186	14367	.	+	1	ID=g35296.t1_7;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
1
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
1