0
1

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

More than 1 year has passed since last update.

[Biopython/ChatGPT]アミノ酸配列を所定のGC含量に近いDNA配列へ逆翻訳する

Posted at

ChatGPT (GPT-4) にBiopythonを使ったスクリプトを書かせる試みとして、タンパク質をDNAに逆翻訳するプログラムを書かせた。
どうにか動くスクリプトを完成させることができたが、プロンプトの書き方にコツがあるようだ。
要件定義をしっかりすることが重要だと再認識した。

back_translate_proteins.py
#!/usr/bin/env python

import argparse
import sys
import os
from Bio import SeqIO, Seq
from Bio.Data import CodonTable
import tqdm
import logging

# Configure the root logger to emit INFO-and-above messages on stderr.
logging.basicConfig(level=logging.INFO, stream=sys.stderr)

def calculate_gc_content(sequence):
    """Return the fraction of 'G' and 'C' symbols in ``sequence``.

    Only uppercase 'G' and 'C' are counted. An empty sequence yields 0.0
    because the divisor is never allowed to drop below 1.
    """
    gc_total = sum(sequence.count(base) for base in ("G", "C"))
    divisor = len(sequence) or 1
    return gc_total / divisor


def heuristic_back_translate(sequence, protein_to_codon, desired_gc):
    """Greedily back-translate a protein sequence toward a target GC fraction.

    For each residue, the GC fraction of the DNA built so far is compared with
    ``desired_gc``: while it is below the target the residue's most GC-rich
    codon is appended, otherwise its least GC-rich codon is appended.

    GC totals are tracked incrementally and each residue's codon ranking is
    computed once, so the cost grows linearly with the protein length instead
    of re-scanning the whole growing DNA string for every residue.

    Args:
        sequence: Iterable of amino-acid symbols (typically a ``str``).
        protein_to_codon: Mapping from amino-acid symbol to a list of codons.
            Symbols missing from the mapping are skipped and contribute
            nothing to the output.
        desired_gc: Target GC content expressed as a fraction (e.g. 0.5).

    Returns:
        A tuple ``(nucleotide_seq, resulting_gc)`` where ``resulting_gc`` is
        the GC fraction of ``nucleotide_seq`` (0.0 for an empty result).
    """
    def gc_count(dna):
        # Same counting rule as calculate_gc_content: uppercase G and C only.
        return dna.count('G') + dna.count('C')

    def gc_fraction(dna):
        return gc_count(dna) / max(1, len(dna))

    ranked_codons = {}  # amino acid -> its codons sorted by ascending GC fraction
    chosen_codons = []
    running_gc = 0
    running_len = 0

    for aa in sequence:
        if aa not in protein_to_codon:
            continue

        codons = ranked_codons.get(aa)
        if codons is None:
            # sorted() is stable, so ties keep the order of the input list.
            codons = ranked_codons[aa] = sorted(protein_to_codon[aa], key=gc_fraction)

        current_gc = running_gc / max(1, running_len)
        codon = codons[-1] if current_gc < desired_gc else codons[0]

        chosen_codons.append(codon)
        running_gc += gc_count(codon)
        running_len += len(codon)

    resulting_gc = running_gc / max(1, running_len)
    return "".join(chosen_codons), resulting_gc
    
def calculate_min_max_gc(sequence, protein_to_codon):
    """Return the lowest and highest GC fractions reachable for ``sequence``.

    For every residue present in ``protein_to_codon`` the codon with the
    smallest (respectively largest) GC content is selected; residues absent
    from the mapping are ignored. The two resulting DNA strings are then
    scored with ``calculate_gc_content``.

    Returns:
        A tuple ``(min_gc, max_gc)`` of GC fractions.
    """
    def extreme_gc(prefer_high):
        # Pick, per residue, the first codon of the ranking in the requested direction.
        picks = []
        for residue in sequence:
            if residue in protein_to_codon:
                ranked = sorted(
                    protein_to_codon[residue],
                    key=calculate_gc_content,
                    reverse=prefer_high,
                )
                picks.append(ranked[0])
        return calculate_gc_content(''.join(picks))

    lowest = extreme_gc(False)
    highest = extreme_gc(True)
    return lowest, highest

def process_record(record, protein_to_codon, desired_gc, file_format, output_file, gc_adjusted_count, tsv_report):
    """Back-translate one record, append it to the output file and log a report row.

    Args:
        record: Sequence record whose ``seq`` holds the protein; its ``seq``
            and ``description`` are overwritten/extended in place.
        protein_to_codon: Mapping from amino-acid symbol to candidate codons.
        desired_gc: Target GC content as a fraction.
        file_format: Format name passed to ``SeqIO.write``.
        output_file: Path of the file the record is appended to.
        gc_adjusted_count: One-element list used as a mutable counter; its
            first item is incremented when the target had to be clamped.
        tsv_report: List receiving one row
            ``[id, length, min_gc%, max_gc%, actual_gc%, note]``.

    Returns:
        A tuple ``(min_gc, max_gc, resulting_gc)`` of GC fractions.
    """
    protein = str(record.seq)
    min_gc, max_gc = calculate_min_max_gc(protein, protein_to_codon)

    # Clamp the target into the range this protein can actually reach.
    target_gc, note = desired_gc, ""
    if desired_gc < min_gc:
        target_gc, note = min_gc, "desired GC% unreachable; adjusted to theoretical min"
    elif desired_gc > max_gc:
        target_gc, note = max_gc, "desired GC% unreachable; adjusted to theoretical max"
    if note:
        gc_adjusted_count[0] += 1

    translated_seq, resulting_gc = heuristic_back_translate(protein, protein_to_codon, target_gc)

    record.seq = Seq.Seq(translated_seq)
    record.description = record.description + f" | Resulting GC: {resulting_gc*100:.2f}%"

    with open(output_file, 'a') as outfile:
        SeqIO.write(record, outfile, file_format)

    report_row = [record.id, len(record.seq), min_gc * 100, max_gc * 100, resulting_gc * 100, note]
    tsv_report.append(report_row)

    return min_gc, max_gc, resulting_gc

def main(input_file, output_file, file_format, genetic_table_id, desired_gc, report_file):
    """Back-translate every protein in ``input_file`` and write DNA plus a TSV report.

    Args:
        input_file: Path of the protein sequence file to read.
        output_file: Path of the nucleotide sequence file to write. Any
            existing content is discarded at the start of the run.
        file_format: Format name used for both parsing and writing.
        genetic_table_id: Key into ``CodonTable.unambiguous_dna_by_id``.
        desired_gc: Target GC content as a percentage (it is divided by 100
            before being handed to ``process_record``).
        report_file: Path of the TSV report to write.

    Raises:
        KeyError: If ``genetic_table_id`` is not a known table ID.
    """
    table = CodonTable.unambiguous_dna_by_id[genetic_table_id]

    # Group codons by the amino acid they encode, in forward-table order.
    protein_to_codon = {}
    for codon, amino_acid in table.forward_table.items():
        protein_to_codon.setdefault(amino_acid, []).append(codon)

    records = list(SeqIO.parse(input_file, file_format))

    # process_record only ever appends, so start from an empty file; otherwise
    # re-running with the same output path would duplicate records. This is
    # done after the input has been fully read.
    with open(output_file, 'w'):
        pass

    desired_fraction = desired_gc / 100
    sum_min_gc = 0
    sum_max_gc = 0
    sum_actual_gc = 0
    total_proteins = 0
    gc_adjusted_count = [0]
    tsv_report = []

    for record in tqdm.tqdm(records, desc="Processing records"):
        min_gc, max_gc, actual_gc = process_record(record, protein_to_codon, desired_fraction, file_format, output_file, gc_adjusted_count, tsv_report)

        sum_min_gc += min_gc
        sum_max_gc += max_gc
        sum_actual_gc += actual_gc
        total_proteins += 1

    if total_proteins:
        avg_min_gc = (sum_min_gc / total_proteins) * 100
        avg_max_gc = (sum_max_gc / total_proteins) * 100
        avg_actual_gc = (sum_actual_gc / total_proteins) * 100

        logging.info(f"Summary:\nTotal proteins back translated: {total_proteins}\nAverage theoretical minimum GC: {avg_min_gc:.2f}%\nAverage theoretical maximum GC: {avg_max_gc:.2f}%\nAverage actual GC% of the backtranslated sequences: {avg_actual_gc:.2f}%")

        if gc_adjusted_count[0] > 0:
            percentage = (gc_adjusted_count[0] / total_proteins) * 100
            logging.warning(f"{gc_adjusted_count[0]} sequences ({percentage:.2f}%) were adjusted to fit the theoretical GC bounds.")
    else:
        # Averages are undefined without records; report it instead of dividing by zero.
        logging.warning("No records were found in %s; nothing was back translated.", input_file)

    with open(report_file, 'w') as rpt_file:
        rpt_file.write("id\tlength\ttheoretical_min_gc\ttheoretical_max_gc\tactual_gc\tnote\n")
        for line in tsv_report:
            rpt_file.write('\t'.join(map(str, line)) + '\n')

if __name__ == "__main__":
    # Command-line entry point. Output and report paths are optional; when
    # omitted they are derived after parsing, so the help text describes the
    # derivation rule instead of printing a literal "None".
    parser = argparse.ArgumentParser(description="Back-translate protein sequences into nucleotide sequences.")
    parser.add_argument('-o', '--output', default=None, help="Output nucleotide sequence file. Default: the input file's base name with a .fna extension.")
    parser.add_argument('-f', '--format', default="fasta", help='File format (default: fasta).')
    parser.add_argument('-t', '--table', type=int, default=1, help='Genetic table ID (default: 1).')
    parser.add_argument('--gc', type=float, default=50, help='Desired GC content (default: 50%%).')
    parser.add_argument('-r', '--report', default=None, help="Report file in TSV format detailing backtranslation results. Default: the output file's base name followed by _report.tsv.")
    parser.add_argument('-i', '--input', required=True, help='Input protein sequence file.')
    args = parser.parse_args()

    if not args.output:
        input_base = os.path.splitext(os.path.basename(args.input))[0]
        args.output = f"{input_base}.fna"
    if not args.report:
        # The report name follows the output file, whether explicit or derived.
        output_base = os.path.splitext(os.path.basename(args.output))[0]
        args.report = f"{output_base}_report.tsv"

    main(args.input, args.output, args.format, args.table, args.gc, args.report)
$ ./back_translate_proteins.py -h
usage: back_translate_proteins.py [-h] [-o OUTPUT] [-f FORMAT] [-t TABLE] [--gc GC] [-r REPORT] -i INPUT

Back-translate protein sequences into nucleotide sequences.

optional arguments:
  -h, --help            show this help message and exit
  -o OUTPUT, --output OUTPUT
                        Output nucleotide sequence file. Default: None
  -f FORMAT, --format FORMAT
                        File format (default: fasta).
  -t TABLE, --table TABLE
                        Genetic table ID (default: 1).
  --gc GC               Desired GC content (default: 50%).
  -r REPORT, --report REPORT
                        Report file in TSV format detailing backtranslation results. Default: None
  -i INPUT, --input INPUT
                        Input protein sequence file.
$ ./back_translate_proteins.py -i GCF_000005845.2_ASM584v2_protein.faa -o GCF_000005845.2_ASM584v2_protein.gc55.fna -t 11 --gc 55
Processing records: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4298/4298 [00:16<00:00, 260.77it/s]
INFO:root:Summary:
Total proteins back translated: 4298
Average theoretical minimum GC: 30.01%
Average theoretical maximum GC: 67.23%
Average actual GC% of the backtranslated sequences: 54.86%
WARNING:root:37 sequences (0.86%) were adjusted to fit the theoretical GC bounds

$ ./back_translate_proteins.py -i GCF_000005845.2_ASM584v2_protein.faa -o GCF_000005845.2_ASM584v2_protein.gc45.fna -t 11 --gc 45
Processing records: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4298/4298 [00:15<00:00, 283.89it/s]
INFO:root:Summary:
Total proteins back translated: 4298
Average theoretical minimum GC: 30.01%
Average theoretical maximum GC: 67.23%
Average actual GC% of the backtranslated sequences: 45.04%

$ ./back_translate_proteins.py -i GCF_000005845.2_ASM584v2_protein.faa -o GCF_000005845.2_ASM584v2_protein.gc35.fna -t 11 --gc 35
Processing records: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4298/4298 [00:14<00:00, 298.23it/s]
INFO:root:Summary:
Total proteins back translated: 4298
Average theoretical minimum GC: 30.01%
Average theoretical maximum GC: 67.23%
Average actual GC% of the backtranslated sequences: 35.34%
WARNING:root:77 sequences (1.79%) were adjusted to fit the theoretical GC bounds.
GCF_000005845.2_ASM584v2_protein.gc55.fna
>NP_414542.1 thr operon leader peptide [Escherichia coli str. K-12 substr. MG1655] | Resulting GC: 57.14%
ATGAAGCGGATTAGCACGACGATCACGACGACGATCACGATCACGACGGGGAATGGGGCT
GGT
>NP_414543.1 fused aspartate kinase/homoserine dehydrogenase 1 [Escherichia coli str. K-12 substr. MG1655] | Resulting GC: 55.00%
ATGCGGGTTTTAAAGTTCGGGGGGACGTCTGTGGCGAATGCGGAAAGATTCCTGCGGGTT
GCTGATATCCTGGAGAGCAATGCGCGGCAAGGTCAAGTTGCGACTGTTCTGAGCGCTCCT
GCTAAAATCACGAACCACCTGGTGGCGATGATTGAGAAGACGATCAGCGGGCAGGATGCG
TTACCGAATATCAGCGACGCGGAACGGATTTTCGCGGAACTGCTGACTGGGTTAGCGGCT
GCTCAACCTGGTTTTCCGTTAGCGCAATTAAAGACGTTCGTGGACCAGGAGTTCGCGCAG
ATTAAGCACGTGCTGCACGGGATTAGCCTGCTGGGGCAATGTCCGGATTCTATCAACGCG
GCTTTAATCTGCCGGGGGGAAAAAATGAGCATCGCGATCATGGCGGGGGTTCTGGAAGCG
AGAGGTCATAATGTGACGGTGATCGACCCGGTTGAGAAACTGCTGGCGGTTGGGCATTAT
CTGGAGAGCACTGTGGACATTGCGGAGTCTACGCGGAGAATTGCGGCTTCTCGGATTCCG
GCTGATCATATGGTGCTGATGGCGGGTTTTACGGCGGGTAATGAGAAAGGGGAGTTAGTG
GTGCTGGGGAGAAATGGGTCTGACTACAGCGCGGCTGTTTTAGCGGCTTGTCTGAGAGCG
GATTGTTGCGAAATCTGGACGGACGTGGATGGGGTTTATACGTGCGACCCGAGACAAGTG
CCTGATGCGAGATTACTGAAGAGCATGAGCTACCAGGAGGCGATGGAGCTGAGCTATTTC
GGGGCGAAAGTGCTGCACCCTAGAACGATTACGCCGATTGCGCAGTTTCAGATCCCGTGC
TTAATCAAGAACACGGGGAACCCGCAGGCGCCTGGTACTTTAATCGGGGCGTCTAGAGAC
GAAGACGAGTTACCGGTGAAAGGGATCAGCAACCTGAACAACATGGCGATGTTCAGCGTG
AGCGGGCCGGGTATGAAAGGGATGGTGGGTATGGCGGCTAGAGTTTTTGCGGCGATGTCT
AGAGCGAGAATTAGCGTGGTGCTGATCACGCAGAGCTCTAGCGAGTATAGCATCAGCTTC
TGCGTGCCGCAAAGCGACTGTGTGCGGGCTGAAAGAGCTATGCAGGAAGAGTTCTACCTG
GAGCTGAAGGAGGGGTTACTGGAGCCGTTAGCGGTTACTGAGCGGTTAGCGATTATCAGC
GTGGTGGGGGATGGTATGAGAACGCTGAGAGGGATTAGCGCGAAATTCTTCGCGGCGTTA
GCGAGAGCGAATATCAACATCGTGGCGATCGCGCAGGGTTCTAGCGAACGGTCTATTAGC
GTGGTGGTGAATAACGACGACGCGACGACTGGGGTTAGAGTGACGCATCAGATGCTGTTC
AACACGGACCAGGTGATCGAGGTGTTCGTGATCGGGGTTGGGGGTGTTGGTGGTGCTTTA
TTAGAGCAGCTGAAGCGGCAACAGAGCTGGTTAAAGAACAAGCACATCGACCTGCGGGTG
TGCGGGGTTGCTAATAGCAAGGCGCTGTTAACGAACGTGCACGGGTTAAACCTGGAGAAC
TGGCAGGAGGAGCTGGCGCAAGCTAAAGAGCCGTTTAACCTGGGGAGACTGATCCGGTTA
GTGAAGGAGTACCACCTGCTGAACCCGGTTATCGTGGACTGCACGTCTAGCCAGGCTGTT
GCTGATCAGTATGCGGACTTTCTGCGGGAAGGGTTTCACGTGGTGACTCCGAATAAGAAG
GCGAACACGAGCAGCATGGACTACTACCACCAGCTGCGGTATGCGGCTGAAAAGAGCCGG
AGAAAATTCCTGTACGACACGAACGTGGGGGCGGGTTTACCGGTTATTGAGAACCTGCAG
AACCTGCTGAACGCGGGGGATGAACTGATGAAGTTCAGCGGGATCCTGAGCGGGTCTTTA
AGCTACATCTTCGGGAAGCTGGACGAGGGGATGAGCTTTAGCGAGGCGACTACGTTAGCG
AGAGAGATGGGGTATACGGAGCCGGATCCTAGAGACGATCTGAGCGGTATGGACGTTGCG
AGAAAACTGCTGATCCTGGCGAGAGAGACTGGGAGAGAACTGGAGTTAGCGGACATTGAG
ATCGAGCCGGTTCTGCCGGCTGAATTTAACGCGGAGGGTGATGTGGCTGCTTTTATGGCG
AATCTGAGCCAGCTGGACGATCTGTTCGCGGCTAGAGTTGCGAAAGCGAGAGACGAAGGG
AAAGTGCTGAGATACGTGGGGAATATCGACGAGGACGGGGTTTGCAGAGTGAAGATCGCG
GAAGTGGACGGTAATGACCCGTTATTCAAGGTGAAGAACGGGGAGAACGCGCTGGCGTTT
TACAGCCACTACTACCAGCCGCTGCCGTTAGTGCTGAGAGGGTATGGGGCTGGTAATGAC
GTTACGGCGGCTGGTGTTTTTGCGGATTTACTGCGGACTCTGAGCTGGAAACTGGGGGTT
...
GCF_000005845.2_ASM584v2_protein.gc45.fna
>NP_414542.1 thr operon leader peptide [Escherichia coli str. K-12 substr. MG1655] | Resulting GC: 49.21%
ATGAAGCGGATTAGCACTACGATTACGACGACTATTACGATCACGACGGGTAATGGGGCT
GGT
>NP_414543.1 fused aspartate kinase/homoserine dehydrogenase 1 [Escherichia coli str. K-12 substr. MG1655] | Resulting GC: 45.04%
ATGCGGGTTTTAAAGTTCGGGGGTACTTCTGTTGCTAATGCGGAAAGATTTCTGCGGGTT
GCTGATATTCTGGAATCTAACGCGAGACAAGGTCAAGTTGCTACTGTTTTAAGCGCGCCT
GCTAAAATTACGAATCACCTGGTTGCTATGATTGAGAAGACGATTAGCGGGCAAGATGCT
TTACCGAATATCAGCGATGCGGAAAGAATTTTCGCGGAATTACTGACGGGTTTAGCGGCT
GCTCAACCTGGTTTTCCTTTAGCTCAATTAAAAACGTTCGTGGATCAGGAATTTGCGCAA
ATTAAGCACGTGCTGCATGGGATTTCTCTGTTAGGGCAATGTCCTGATTCTATTAACGCG
GCTTTAATCTGCCGGGGTGAAAAAATGAGCATTGCGATTATGGCGGGTGTTTTAGAGGCT
AGAGGTCATAATGTGACTGTGATTGACCCGGTTGAAAAACTGTTAGCGGTTGGTCATTAT
CTGGAGTCTACGGTTGATATCGCGGAATCTACTAGACGGATTGCTGCTTCTAGAATTCCG
GCTGATCATATGGTTTTAATGGCGGGTTTTACGGCTGGTAATGAAAAGGGGGAATTAGTG
GTTCTGGGTAGAAATGGGTCTGATTATAGCGCGGCTGTTTTAGCTGCTTGTTTACGGGCT
GATTGTTGTGAAATTTGGACGGATGTTGACGGTGTTTATACGTGTGACCCTAGACAAGTT
CCTGATGCTAGATTACTGAAGAGCATGAGCTATCAGGAGGCTATGGAATTAAGCTACTTC
GGGGCTAAAGTGTTACACCCGAGAACTATTACGCCTATTGCGCAATTTCAGATTCCGTGT
TTAATCAAGAACACGGGGAACCCGCAAGCTCCTGGTACTTTAATTGGGGCTTCTAGAGAT
GAAGATGAACTGCCTGTTAAAGGGATTAGCAATCTGAACAACATGGCGATGTTTAGCGTG
TCTGGGCCTGGTATGAAAGGTATGGTTGGTATGGCTGCTAGAGTTTTTGCTGCTATGTCT
AGAGCTAGAATTAGCGTTGTGTTAATCACGCAGTCTAGCTCTGAATACAGCATTAGCTTC
TGCGTGCCTCAATCTGATTGCGTTAGAGCGGAAAGAGCTATGCAAGAAGAATTTTACCTG
GAGCTGAAAGAGGGGTTACTGGAACCTTTAGCGGTTACTGAAAGACTGGCTATTATCAGC
GTTGTGGGTGATGGTATGAGAACTTTACGGGGTATTAGCGCTAAATTCTTCGCGGCTTTA
GCTAGAGCTAATATCAACATCGTGGCGATTGCGCAAGGTTCTTCTGAAAGATCTATCAGC
GTTGTGGTTAACAACGACGACGCTACTACTGGTGTTAGAGTTACTCACCAAATGCTGTTT
AACACGGACCAGGTTATTGAGGTGTTTGTGATCGGGGTTGGTGGTGTTGGTGGTGCTTTA
TTAGAACAACTGAAACGGCAACAATCTTGGTTAAAGAACAAGCACATCGACCTGCGGGTT
TGTGGTGTTGCTAATTCTAAGGCGTTACTGACTAATGTGCACGGTTTAAACCTGGAGAAT
TGGCAGGAAGAGTTAGCGCAAGCTAAAGAGCCTTTTAACCTGGGTAGATTAATCCGGTTA
GTGAAGGAGTACCACTTACTGAACCCGGTTATTGTGGACTGTACTAGCTCTCAAGCGGTT
GCTGATCAATATGCGGATTTTCTGAGAGAGGGTTTTCACGTTGTTACGCCTAATAAGAAG
GCGAATACGAGCTCTATGGACTATTACCACCAGTTACGGTATGCGGCTGAAAAAAGCAGA
AGAAAGTTCCTGTACGACACGAATGTGGGGGCTGGTTTACCTGTTATTGAAAATCTGCAG
AATCTGCTGAACGCGGGTGATGAATTAATGAAGTTCAGCGGGATTCTGTCTGGTTCTTTA
AGCTACATCTTCGGGAAACTGGACGAAGGGATGTCTTTTAGCGAAGCGACTACTTTAGCG
AGAGAAATGGGGTATACTGAGCCTGATCCTAGAGATGATTTAAGCGGGATGGATGTTGCT
AGAAAACTGCTGATTCTGGCGAGAGAAACTGGTAGAGAATTAGAGCTGGCTGATATTGAG
ATTGAGCCGGTTTTACCGGCTGAATTTAACGCGGAAGGTGATGTTGCTGCTTTTATGGCT
AATCTGTCTCAGTTAGACGACTTATTCGCGGCTAGAGTTGCTAAAGCGAGAGATGAAGGT
AAAGTGTTACGGTATGTGGGGAATATTGACGAGGATGGGGTTTGTAGAGTTAAAATCGCG
GAAGTGGATGGGAATGATCCGTTATTTAAGGTGAAGAACGGGGAAAACGCGTTAGCGTTT
TATAGCCACTACTACCAGCCGTTACCTTTAGTGTTACGGGGTTATGGGGCTGGTAATGAT
GTTACTGCTGCTGGTGTTTTTGCTGATTTACTGAGAACGTTAAGCTGGAAACTGGGGGTT
...
GCF_000005845.2_ASM584v2_protein.gc35.fna
>NP_414542.1 thr operon leader peptide [Escherichia coli str. K-12 substr. MG1655] | Resulting GC: 39.68%
ATGAAGCGGATTTCTACTACTATTACGACTACTATTACGATTACGACTGGTAATGGTGCT
GGT
>NP_414543.1 fused aspartate kinase/homoserine dehydrogenase 1 [Escherichia coli str. K-12 substr. MG1655] | Resulting GC: 35.08%
ATGCGGGTTTTAAAATTCGGGGGTACTTCTGTTGCTAATGCTGAAAGATTTTTAAGAGTT
GCTGATATTTTAGAGTCTAATGCGAGACAAGGTCAAGTTGCTACTGTTTTATCTGCTCCT
GCTAAAATTACTAATCATTTAGTTGCTATGATTGAAAAAACGATTAGCGGTCAAGATGCT
TTACCTAATATTAGCGATGCTGAAAGAATTTTTGCGGAATTATTAACGGGTTTAGCTGCT
GCTCAACCTGGTTTTCCTTTAGCTCAATTAAAAACTTTTGTTGATCAAGAATTTGCTCAA
ATTAAGCACGTTTTACACGGTATTTCTTTACTGGGTCAATGTCCTGATTCTATTAATGCT
GCTTTAATTTGCAGAGGTGAAAAAATGTCTATTGCGATTATGGCTGGTGTTTTAGAAGCT
AGAGGTCATAATGTTACTGTTATTGATCCTGTTGAAAAATTACTGGCTGTTGGTCATTAT
TTAGAGTCTACTGTTGATATTGCGGAATCTACTAGAAGAATTGCTGCTTCTAGAATTCCT
GCTGATCATATGGTTTTAATGGCTGGTTTTACTGCTGGTAATGAAAAAGGTGAATTAGTT
GTTTTAGGTAGAAATGGTTCTGATTATTCTGCTGCTGTTTTAGCTGCTTGTTTAAGAGCT
GATTGTTGTGAAATTTGGACTGATGTTGATGGTGTTTATACTTGTGATCCTAGACAAGTT
CCTGATGCTAGATTATTAAAATCTATGTCTTATCAAGAAGCTATGGAATTATCTTATTTC
GGGGCTAAAGTTTTACACCCTAGAACTATTACTCCTATTGCTCAATTTCAAATTCCGTGT
TTAATCAAGAACACGGGTAATCCTCAAGCTCCTGGTACTTTAATTGGTGCTTCTAGAGAT
GAAGATGAATTACCTGTTAAAGGTATTTCTAATTTAAATAACATGGCGATGTTTTCTGTT
TCTGGGCCTGGTATGAAAGGTATGGTTGGTATGGCTGCTAGAGTTTTTGCTGCTATGTCT
AGAGCTAGAATTTCTGTTGTTTTAATTACTCAATCTTCTTCTGAATATTCTATTTCTTTT
TGTGTTCCTCAATCTGATTGTGTTAGAGCTGAAAGAGCTATGCAAGAAGAATTTTATTTA
GAACTGAAAGAGGGTTTATTAGAGCCTTTAGCTGTTACTGAAAGATTAGCTATTATTAGC
GTTGTGGGTGATGGTATGAGAACTTTAAGAGGTATTTCTGCTAAATTTTTTGCGGCTTTA
GCTAGAGCTAATATTAATATCGTGGCTATTGCTCAAGGTTCTTCTGAAAGATCTATTTCT
GTTGTTGTTAATAACGACGATGCTACTACTGGTGTTAGAGTTACTCATCAAATGTTATTT
AATACGGACCAAGTTATTGAGGTTTTTGTGATTGGGGTTGGTGGTGTTGGTGGTGCTTTA
TTAGAACAATTAAAAAGACAACAATCTTGGTTAAAAAATAAGCACATTGACTTACGGGTT
TGTGGTGTTGCTAATTCTAAAGCTTTATTAACGAATGTGCATGGTTTAAATCTGGAAAAT
TGGCAGGAAGAATTAGCGCAAGCTAAAGAACCTTTTAATTTAGGGAGATTAATTCGGTTA
GTTAAGGAGTATCACTTACTGAATCCGGTTATTGTTGATTGTACTTCTTCTCAAGCTGTT
GCTGATCAATATGCTGATTTTTTACGGGAAGGTTTTCATGTTGTTACTCCTAATAAAAAA
GCGAATACGTCTTCTATGGATTATTACCACCAATTACGGTATGCTGCTGAAAAATCTAGA
AGAAAATTCCTGTATGACACTAATGTGGGTGCTGGTTTACCTGTTATTGAAAATTTACAA
AATCTGCTGAATGCGGGTGATGAATTAATGAAATTTAGCGGTATTTTAAGCGGTTCTTTA
TCTTACATCTTCGGGAAATTAGACGAAGGTATGTCTTTTTCTGAAGCTACTACTTTAGCT
AGAGAAATGGGTTATACTGAACCTGATCCTAGAGATGATTTATCTGGTATGGATGTTGCT
AGAAAATTATTAATCCTGGCTAGAGAAACTGGTAGAGAATTAGAATTAGCTGATATTGAA
ATTGAGCCGGTTTTACCTGCTGAATTTAATGCTGAAGGTGATGTTGCTGCTTTTATGGCT
AATTTATCTCAATTAGATGATTTATTCGCGGCTAGAGTTGCTAAAGCTAGAGATGAAGGT
AAAGTTTTAAGATATGTTGGTAATATTGACGAAGATGGTGTTTGTAGAGTTAAAATTGCG
GAAGTTGATGGTAATGATCCTTTATTTAAGGTGAAAAACGGGGAAAATGCTTTAGCTTTT
TATAGCCACTATTACCAGCCTTTACCTTTAGTTTTACGGGGTTATGGTGCTGGTAATGAT
GTTACTGCTGCTGGTGTTTTTGCTGATTTATTAAGAACTTTATCTTGGAAATTAGGTGTT
...
GCF_000005845.2_ASM584v2_protein.gc35_report.tsv
id      length  theoretical_min_gc      theoretical_max_gc      actual_gc       note
NP_414542.1     63      30.158730158730158      63.49206349206349       39.682539682539684
NP_414543.1     2460    30.48780487804878       68.21138211382114       35.081300813008134
NP_414544.1     930     33.01075268817204       70.0    35.16129032258065
NP_414545.1     1284    29.906542056074763      67.91277258566979       35.046728971962615
NP_414546.1     294     37.07482993197279       72.10884353741497       40.476190476190474      desired GC% unreachable; adjusted to theoretical min
NP_414547.1     774     25.839793281653744      64.08268733850129       35.012919896640824
NP_414548.1     1428    31.72268907563025       67.78711484593838       35.22408963585434
NP_414549.1     951     29.02208201892745       66.5615141955836        34.910620399579386
NP_414550.1     585     32.30769230769231       70.25641025641025       35.72649572649573
...
GCF_000005845.2_ASM584v2_protein.gc45_report.tsv
id      length  theoretical_min_gc      theoretical_max_gc      actual_gc       note
NP_414542.1     63      30.158730158730158      63.49206349206349       49.2063492063492
NP_414543.1     2460    30.48780487804878       68.21138211382114       45.040650406504064
NP_414544.1     930     33.01075268817204       70.0    44.946236559139784
NP_414545.1     1284    29.906542056074763      67.91277258566979       45.01557632398754
NP_414546.1     294     37.07482993197279       72.10884353741497       45.23809523809524
NP_414547.1     774     25.839793281653744      64.08268733850129       44.96124031007752
NP_414548.1     1428    31.72268907563025       67.78711484593838       45.02801120448179
NP_414549.1     951     29.02208201892745       66.5615141955836        44.900105152471085
NP_414550.1     585     32.30769230769231       70.25641025641025       44.95726495726496
...
GCF_000005845.2_ASM584v2_protein.gc55_report.tsv
id      length  theoretical_min_gc      theoretical_max_gc      actual_gc       note
NP_414542.1     63      30.158730158730158      63.49206349206349       57.14285714285714
NP_414543.1     2460    30.48780487804878       68.21138211382114       55.00000000000001
NP_414544.1     930     33.01075268817204       70.0    54.83870967741935
NP_414545.1     1284    29.906542056074763      67.91277258566979       54.82866043613706
NP_414546.1     294     37.07482993197279       72.10884353741497       55.44217687074829
NP_414547.1     774     25.839793281653744      64.08268733850129       55.03875968992248
NP_414548.1     1428    31.72268907563025       67.78711484593838       54.971988795518214
NP_414549.1     951     29.02208201892745       66.5615141955836        54.99474237644585
NP_414550.1     585     32.30769230769231       70.25641025641025       55.042735042735046
...
0
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do by signing up
0
1

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?