ChatGPT (GPT-4) にBiopythonを使ったスクリプトを書かせる試みとして、タンパク質をDNAに逆翻訳するプログラムを書かせた。
どうにか動くスクリプトを完成させることができたが、プロンプトの書き方にコツがあるようだ。
要件定義をしっかりすることが重要だと再認識した。
back_translate_proteins.py
#!/usr/bin/env python
import argparse
import sys
import os
from Bio import SeqIO, Seq
from Bio.Data import CodonTable
import tqdm
import logging
# Emit log records at INFO level and above on stderr.
logging.basicConfig(level=logging.INFO, stream=sys.stderr)
def calculate_gc_content(sequence):
    """Return the fraction of ``sequence`` made up of 'G' and 'C' characters.

    Only the uppercase letters 'G' and 'C' are counted. The denominator is
    floored at 1, so an empty sequence yields 0.0 instead of raising.
    """
    gc_total = sum(sequence.count(base) for base in ('G', 'C'))
    length = len(sequence)
    denominator = length if length > 1 else 1
    return gc_total / denominator
def heuristic_back_translate(sequence, protein_to_codon, desired_gc):
    """Greedily back-translate a protein sequence towards a target GC fraction.

    For each residue, the GC fraction of the nucleotides emitted so far is
    compared with ``desired_gc``: while it is below the target the codon with
    the highest GC fraction is appended, otherwise the codon with the lowest.
    Residues that are not keys of ``protein_to_codon`` are skipped and
    contribute no codon to the output.

    Args:
        sequence: Iterable of residue symbols (typically a protein string).
        protein_to_codon: Mapping from residue symbol to a list of codons.
        desired_gc: Target GC content as a fraction (e.g. 0.5 for 50%).

    Returns:
        A ``(nucleotide_seq, resulting_gc)`` tuple: the back-translated
        nucleotide string and its GC fraction (0.0 for an empty result).
    """
    def gc_count(codon):
        return codon.count('G') + codon.count('C')

    def gc_fraction(codon):
        return gc_count(codon) / max(1, len(codon))

    # residue -> (lowest-GC codon, highest-GC codon); ranked once per residue
    # instead of re-sorting the codon list at every position.
    extremes = {}
    chosen = []
    # Running totals replace re-counting the whole growing sequence on every
    # iteration, which made the original quadratic in sequence length.
    gc_total = 0
    length = 0
    for aa in sequence:
        if aa not in protein_to_codon:
            continue
        if aa not in extremes:
            ranked = sorted(protein_to_codon[aa], key=gc_fraction)
            extremes[aa] = (ranked[0], ranked[-1])
        low_codon, high_codon = extremes[aa]
        current_gc = gc_total / max(1, length)
        codon = high_codon if current_gc < desired_gc else low_codon
        chosen.append(codon)
        gc_total += gc_count(codon)
        length += len(codon)
    return ''.join(chosen), gc_total / max(1, length)
def calculate_min_max_gc(sequence, protein_to_codon):
    """Return the lowest and highest GC fractions reachable for ``sequence``.

    Every residue found in ``protein_to_codon`` is replaced once by its
    lowest-GC codon and once by its highest-GC codon; residues without an
    entry are ignored. The GC fractions of the two resulting nucleotide
    strings are returned as ``(min_gc, max_gc)``.
    """
    low_parts = []
    high_parts = []
    for aa in sequence:
        if aa not in protein_to_codon:
            continue
        codons = protein_to_codon[aa]
        low_parts.append(sorted(codons, key=calculate_gc_content)[0])
        high_parts.append(sorted(codons, key=calculate_gc_content, reverse=True)[0])
    min_gc = calculate_gc_content(''.join(low_parts))
    max_gc = calculate_gc_content(''.join(high_parts))
    return min_gc, max_gc
def process_record(record, protein_to_codon, desired_gc, file_format, output_file, gc_adjusted_count, tsv_report):
    """Back-translate one record, append it to the output file and log a report row.

    The requested GC fraction is clamped into the record's theoretical
    [min, max] range before back-translation. ``record`` is modified in
    place: its sequence becomes the nucleotide sequence and the resulting
    GC percentage is appended to its description.

    Args:
        record: Sequence record exposing ``seq``, ``id`` and ``description``.
        protein_to_codon: Mapping from residue symbol to a list of codons.
        desired_gc: Target GC content as a fraction.
        file_format: Format name handed to ``SeqIO.write``.
        output_file: Path the record is appended to.
        gc_adjusted_count: One-element list; element 0 is incremented when
            the target had to be clamped.
        tsv_report: List that receives one row (a list) for this record.

    Returns:
        ``(min_gc, max_gc, resulting_gc)`` as fractions.
    """
    protein = str(record.seq)
    min_gc, max_gc = calculate_min_max_gc(protein, protein_to_codon)

    target_gc, note = desired_gc, ""
    if desired_gc < min_gc:
        target_gc, note = min_gc, "desired GC% unreachable; adjusted to theoretical min"
    elif desired_gc > max_gc:
        target_gc, note = max_gc, "desired GC% unreachable; adjusted to theoretical max"
    if note:
        # A non-empty note is set exactly when the target was clamped.
        gc_adjusted_count[0] += 1

    translated_seq, resulting_gc = heuristic_back_translate(protein, protein_to_codon, target_gc)
    record.seq = Seq.Seq(translated_seq)
    record.description = record.description + f" | Resulting GC: {resulting_gc*100:.2f}%"

    with open(output_file, 'a') as outfile:
        SeqIO.write(record, outfile, file_format)

    # len(record.seq) is taken after reassignment, so it is the nucleotide length.
    row = [record.id, len(record.seq), min_gc * 100, max_gc * 100, resulting_gc * 100, note]
    tsv_report.append(row)
    return min_gc, max_gc, resulting_gc
def main(input_file, output_file, file_format, genetic_table_id, desired_gc, report_file):
    """Back-translate every protein record in ``input_file`` and write a TSV report.

    Args:
        input_file: Path of the protein sequence file to read.
        output_file: Path of the nucleotide sequence file to write. Any
            existing content is discarded before records are written.
        file_format: Format name used both for parsing the input and for
            writing the output.
        genetic_table_id: Key into ``CodonTable.unambiguous_dna_by_id``.
        desired_gc: Target GC content as a percentage (e.g. 50 for 50%).
        report_file: Path of the TSV report to write.
    """
    table = CodonTable.unambiguous_dna_by_id[genetic_table_id]
    # Invert codon -> amino acid into amino acid -> [codons] in one pass,
    # keeping the forward table's codon order.
    protein_to_codon = {}
    for codon, amino_acid in table.forward_table.items():
        protein_to_codon.setdefault(amino_acid, []).append(codon)

    desired_fraction = desired_gc / 100
    sum_min_gc = 0
    sum_max_gc = 0
    sum_actual_gc = 0
    gc_adjusted_count = [0]
    tsv_report = []

    # Materialised so the progress bar knows the total up front.
    records = list(SeqIO.parse(input_file, file_format))

    # process_record() appends to the output file; start from an empty file
    # so that re-running the script does not duplicate records.
    with open(output_file, 'w'):
        pass

    for record in tqdm.tqdm(records, desc="Processing records"):
        min_gc, max_gc, actual_gc = process_record(record, protein_to_codon, desired_fraction, file_format, output_file, gc_adjusted_count, tsv_report)
        sum_min_gc += min_gc
        sum_max_gc += max_gc
        sum_actual_gc += actual_gc
    total_proteins = len(records)

    if total_proteins:
        avg_min_gc = (sum_min_gc / total_proteins) * 100
        avg_max_gc = (sum_max_gc / total_proteins) * 100
        avg_actual_gc = (sum_actual_gc / total_proteins) * 100
        logging.info(f"Summary:\nTotal proteins back translated: {total_proteins}\nAverage theoretical minimum GC: {avg_min_gc:.2f}%\nAverage theoretical maximum GC: {avg_max_gc:.2f}%\nAverage actual GC% of the backtranslated sequences: {avg_actual_gc:.2f}%")
        if gc_adjusted_count[0] > 0:
            percentage = (gc_adjusted_count[0] / total_proteins) * 100
            logging.warning(f"{gc_adjusted_count[0]} sequences ({percentage:.2f}%) were adjusted to fit the theoretical GC bounds.")
    else:
        # Averages are undefined without records; avoid dividing by zero.
        logging.warning("No records found in %s; nothing was back translated.", input_file)

    with open(report_file, 'w') as rpt_file:
        rpt_file.write("id\tlength\ttheoretical_min_gc\ttheoretical_max_gc\tactual_gc\tnote\n")
        for line in tsv_report:
            rpt_file.write('\t'.join(map(str, line)) + '\n')
if __name__ == "__main__":
    # Command-line entry point: parse the options, derive any file names the
    # user did not supply, then run the back-translation.
    parser = argparse.ArgumentParser(description="Back-translate protein sequences into nucleotide sequences.")
    # The output/report defaults depend on other arguments, so they are
    # described in the help text and resolved after parsing rather than
    # interpolated here (which previously printed "Default: None").
    parser.add_argument('-o', '--output', default=None, help='Output nucleotide sequence file. Default: <input basename>.fna')
    parser.add_argument('-f', '--format', default="fasta", help='File format (default: fasta).')
    parser.add_argument('-t', '--table', type=int, default=1, help='Genetic table ID (default: 1).')
    parser.add_argument('--gc', type=float, default=50, help='Desired GC content (default: 50%%).')
    parser.add_argument('-r', '--report', default=None, help='Report file in TSV format detailing backtranslation results. Default: <output basename>_report.tsv')
    parser.add_argument('-i', '--input', required=True, help='Input protein sequence file.')
    args = parser.parse_args()

    if not args.output:
        input_base = os.path.splitext(os.path.basename(args.input))[0]
        args.output = f"{input_base}.fna"
    if not args.report:
        # Derived from the output name, whether that was given or defaulted.
        output_base = os.path.splitext(os.path.basename(args.output))[0]
        args.report = f"{output_base}_report.tsv"

    main(args.input, args.output, args.format, args.table, args.gc, args.report)
$ ./back_translate_proteins.py -h
usage: back_translate_proteins.py [-h] [-o OUTPUT] [-f FORMAT] [-t TABLE] [--gc GC] [-r REPORT] -i INPUT
Back-translate protein sequences into nucleotide sequences.
optional arguments:
-h, --help show this help message and exit
-o OUTPUT, --output OUTPUT
Output nucleotide sequence file. Default: None
-f FORMAT, --format FORMAT
File format (default: fasta).
-t TABLE, --table TABLE
Genetic table ID (default: 1).
--gc GC Desired GC content (default: 50%).
-r REPORT, --report REPORT
Report file in TSV format detailing backtranslation results. Default: None
-i INPUT, --input INPUT
Input protein sequence file.
$ ./back_translate_proteins.py -i GCF_000005845.2_ASM584v2_protein.faa -o GCF_000005845.2_ASM584v2_protein.gc55.fna -t 11 --gc 55
Processing records: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4298/4298 [00:16<00:00, 260.77it/s]
INFO:root:Summary:
Total proteins back translated: 4298
Average theoretical minimum GC: 30.01%
Average theoretical maximum GC: 67.23%
Average actual GC% of the backtranslated sequences: 54.86%
WARNING:root:37 sequences (0.86%) were adjusted to fit the theoretical GC bounds.
$ ./back_translate_proteins.py -i GCF_000005845.2_ASM584v2_protein.faa -o GCF_000005845.2_ASM584v2_protein.gc45.fna -t 11 --gc 45
Processing records: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4298/4298 [00:15<00:00, 283.89it/s]
INFO:root:Summary:
Total proteins back translated: 4298
Average theoretical minimum GC: 30.01%
Average theoretical maximum GC: 67.23%
Average actual GC% of the backtranslated sequences: 45.04%
$ ./back_translate_proteins.py -i GCF_000005845.2_ASM584v2_protein.faa -o GCF_000005845.2_ASM584v2_protein.gc35.fna -t 11 --gc 35
Processing records: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4298/4298 [00:14<00:00, 298.23it/s]
INFO:root:Summary:
Total proteins back translated: 4298
Average theoretical minimum GC: 30.01%
Average theoretical maximum GC: 67.23%
Average actual GC% of the backtranslated sequences: 35.34%
WARNING:root:77 sequences (1.79%) were adjusted to fit the theoretical GC bounds.
GCF_000005845.2_ASM584v2_protein.gc55.fna
>NP_414542.1 thr operon leader peptide [Escherichia coli str. K-12 substr. MG1655] | Resulting GC: 57.14%
ATGAAGCGGATTAGCACGACGATCACGACGACGATCACGATCACGACGGGGAATGGGGCT
GGT
>NP_414543.1 fused aspartate kinase/homoserine dehydrogenase 1 [Escherichia coli str. K-12 substr. MG1655] | Resulting GC: 55.00%
ATGCGGGTTTTAAAGTTCGGGGGGACGTCTGTGGCGAATGCGGAAAGATTCCTGCGGGTT
GCTGATATCCTGGAGAGCAATGCGCGGCAAGGTCAAGTTGCGACTGTTCTGAGCGCTCCT
GCTAAAATCACGAACCACCTGGTGGCGATGATTGAGAAGACGATCAGCGGGCAGGATGCG
TTACCGAATATCAGCGACGCGGAACGGATTTTCGCGGAACTGCTGACTGGGTTAGCGGCT
GCTCAACCTGGTTTTCCGTTAGCGCAATTAAAGACGTTCGTGGACCAGGAGTTCGCGCAG
ATTAAGCACGTGCTGCACGGGATTAGCCTGCTGGGGCAATGTCCGGATTCTATCAACGCG
GCTTTAATCTGCCGGGGGGAAAAAATGAGCATCGCGATCATGGCGGGGGTTCTGGAAGCG
AGAGGTCATAATGTGACGGTGATCGACCCGGTTGAGAAACTGCTGGCGGTTGGGCATTAT
CTGGAGAGCACTGTGGACATTGCGGAGTCTACGCGGAGAATTGCGGCTTCTCGGATTCCG
GCTGATCATATGGTGCTGATGGCGGGTTTTACGGCGGGTAATGAGAAAGGGGAGTTAGTG
GTGCTGGGGAGAAATGGGTCTGACTACAGCGCGGCTGTTTTAGCGGCTTGTCTGAGAGCG
GATTGTTGCGAAATCTGGACGGACGTGGATGGGGTTTATACGTGCGACCCGAGACAAGTG
CCTGATGCGAGATTACTGAAGAGCATGAGCTACCAGGAGGCGATGGAGCTGAGCTATTTC
GGGGCGAAAGTGCTGCACCCTAGAACGATTACGCCGATTGCGCAGTTTCAGATCCCGTGC
TTAATCAAGAACACGGGGAACCCGCAGGCGCCTGGTACTTTAATCGGGGCGTCTAGAGAC
GAAGACGAGTTACCGGTGAAAGGGATCAGCAACCTGAACAACATGGCGATGTTCAGCGTG
AGCGGGCCGGGTATGAAAGGGATGGTGGGTATGGCGGCTAGAGTTTTTGCGGCGATGTCT
AGAGCGAGAATTAGCGTGGTGCTGATCACGCAGAGCTCTAGCGAGTATAGCATCAGCTTC
TGCGTGCCGCAAAGCGACTGTGTGCGGGCTGAAAGAGCTATGCAGGAAGAGTTCTACCTG
GAGCTGAAGGAGGGGTTACTGGAGCCGTTAGCGGTTACTGAGCGGTTAGCGATTATCAGC
GTGGTGGGGGATGGTATGAGAACGCTGAGAGGGATTAGCGCGAAATTCTTCGCGGCGTTA
GCGAGAGCGAATATCAACATCGTGGCGATCGCGCAGGGTTCTAGCGAACGGTCTATTAGC
GTGGTGGTGAATAACGACGACGCGACGACTGGGGTTAGAGTGACGCATCAGATGCTGTTC
AACACGGACCAGGTGATCGAGGTGTTCGTGATCGGGGTTGGGGGTGTTGGTGGTGCTTTA
TTAGAGCAGCTGAAGCGGCAACAGAGCTGGTTAAAGAACAAGCACATCGACCTGCGGGTG
TGCGGGGTTGCTAATAGCAAGGCGCTGTTAACGAACGTGCACGGGTTAAACCTGGAGAAC
TGGCAGGAGGAGCTGGCGCAAGCTAAAGAGCCGTTTAACCTGGGGAGACTGATCCGGTTA
GTGAAGGAGTACCACCTGCTGAACCCGGTTATCGTGGACTGCACGTCTAGCCAGGCTGTT
GCTGATCAGTATGCGGACTTTCTGCGGGAAGGGTTTCACGTGGTGACTCCGAATAAGAAG
GCGAACACGAGCAGCATGGACTACTACCACCAGCTGCGGTATGCGGCTGAAAAGAGCCGG
AGAAAATTCCTGTACGACACGAACGTGGGGGCGGGTTTACCGGTTATTGAGAACCTGCAG
AACCTGCTGAACGCGGGGGATGAACTGATGAAGTTCAGCGGGATCCTGAGCGGGTCTTTA
AGCTACATCTTCGGGAAGCTGGACGAGGGGATGAGCTTTAGCGAGGCGACTACGTTAGCG
AGAGAGATGGGGTATACGGAGCCGGATCCTAGAGACGATCTGAGCGGTATGGACGTTGCG
AGAAAACTGCTGATCCTGGCGAGAGAGACTGGGAGAGAACTGGAGTTAGCGGACATTGAG
ATCGAGCCGGTTCTGCCGGCTGAATTTAACGCGGAGGGTGATGTGGCTGCTTTTATGGCG
AATCTGAGCCAGCTGGACGATCTGTTCGCGGCTAGAGTTGCGAAAGCGAGAGACGAAGGG
AAAGTGCTGAGATACGTGGGGAATATCGACGAGGACGGGGTTTGCAGAGTGAAGATCGCG
GAAGTGGACGGTAATGACCCGTTATTCAAGGTGAAGAACGGGGAGAACGCGCTGGCGTTT
TACAGCCACTACTACCAGCCGCTGCCGTTAGTGCTGAGAGGGTATGGGGCTGGTAATGAC
GTTACGGCGGCTGGTGTTTTTGCGGATTTACTGCGGACTCTGAGCTGGAAACTGGGGGTT
...
GCF_000005845.2_ASM584v2_protein.gc45.fna
>NP_414542.1 thr operon leader peptide [Escherichia coli str. K-12 substr. MG1655] | Resulting GC: 49.21%
ATGAAGCGGATTAGCACTACGATTACGACGACTATTACGATCACGACGGGTAATGGGGCT
GGT
>NP_414543.1 fused aspartate kinase/homoserine dehydrogenase 1 [Escherichia coli str. K-12 substr. MG1655] | Resulting GC: 45.04%
ATGCGGGTTTTAAAGTTCGGGGGTACTTCTGTTGCTAATGCGGAAAGATTTCTGCGGGTT
GCTGATATTCTGGAATCTAACGCGAGACAAGGTCAAGTTGCTACTGTTTTAAGCGCGCCT
GCTAAAATTACGAATCACCTGGTTGCTATGATTGAGAAGACGATTAGCGGGCAAGATGCT
TTACCGAATATCAGCGATGCGGAAAGAATTTTCGCGGAATTACTGACGGGTTTAGCGGCT
GCTCAACCTGGTTTTCCTTTAGCTCAATTAAAAACGTTCGTGGATCAGGAATTTGCGCAA
ATTAAGCACGTGCTGCATGGGATTTCTCTGTTAGGGCAATGTCCTGATTCTATTAACGCG
GCTTTAATCTGCCGGGGTGAAAAAATGAGCATTGCGATTATGGCGGGTGTTTTAGAGGCT
AGAGGTCATAATGTGACTGTGATTGACCCGGTTGAAAAACTGTTAGCGGTTGGTCATTAT
CTGGAGTCTACGGTTGATATCGCGGAATCTACTAGACGGATTGCTGCTTCTAGAATTCCG
GCTGATCATATGGTTTTAATGGCGGGTTTTACGGCTGGTAATGAAAAGGGGGAATTAGTG
GTTCTGGGTAGAAATGGGTCTGATTATAGCGCGGCTGTTTTAGCTGCTTGTTTACGGGCT
GATTGTTGTGAAATTTGGACGGATGTTGACGGTGTTTATACGTGTGACCCTAGACAAGTT
CCTGATGCTAGATTACTGAAGAGCATGAGCTATCAGGAGGCTATGGAATTAAGCTACTTC
GGGGCTAAAGTGTTACACCCGAGAACTATTACGCCTATTGCGCAATTTCAGATTCCGTGT
TTAATCAAGAACACGGGGAACCCGCAAGCTCCTGGTACTTTAATTGGGGCTTCTAGAGAT
GAAGATGAACTGCCTGTTAAAGGGATTAGCAATCTGAACAACATGGCGATGTTTAGCGTG
TCTGGGCCTGGTATGAAAGGTATGGTTGGTATGGCTGCTAGAGTTTTTGCTGCTATGTCT
AGAGCTAGAATTAGCGTTGTGTTAATCACGCAGTCTAGCTCTGAATACAGCATTAGCTTC
TGCGTGCCTCAATCTGATTGCGTTAGAGCGGAAAGAGCTATGCAAGAAGAATTTTACCTG
GAGCTGAAAGAGGGGTTACTGGAACCTTTAGCGGTTACTGAAAGACTGGCTATTATCAGC
GTTGTGGGTGATGGTATGAGAACTTTACGGGGTATTAGCGCTAAATTCTTCGCGGCTTTA
GCTAGAGCTAATATCAACATCGTGGCGATTGCGCAAGGTTCTTCTGAAAGATCTATCAGC
GTTGTGGTTAACAACGACGACGCTACTACTGGTGTTAGAGTTACTCACCAAATGCTGTTT
AACACGGACCAGGTTATTGAGGTGTTTGTGATCGGGGTTGGTGGTGTTGGTGGTGCTTTA
TTAGAACAACTGAAACGGCAACAATCTTGGTTAAAGAACAAGCACATCGACCTGCGGGTT
TGTGGTGTTGCTAATTCTAAGGCGTTACTGACTAATGTGCACGGTTTAAACCTGGAGAAT
TGGCAGGAAGAGTTAGCGCAAGCTAAAGAGCCTTTTAACCTGGGTAGATTAATCCGGTTA
GTGAAGGAGTACCACTTACTGAACCCGGTTATTGTGGACTGTACTAGCTCTCAAGCGGTT
GCTGATCAATATGCGGATTTTCTGAGAGAGGGTTTTCACGTTGTTACGCCTAATAAGAAG
GCGAATACGAGCTCTATGGACTATTACCACCAGTTACGGTATGCGGCTGAAAAAAGCAGA
AGAAAGTTCCTGTACGACACGAATGTGGGGGCTGGTTTACCTGTTATTGAAAATCTGCAG
AATCTGCTGAACGCGGGTGATGAATTAATGAAGTTCAGCGGGATTCTGTCTGGTTCTTTA
AGCTACATCTTCGGGAAACTGGACGAAGGGATGTCTTTTAGCGAAGCGACTACTTTAGCG
AGAGAAATGGGGTATACTGAGCCTGATCCTAGAGATGATTTAAGCGGGATGGATGTTGCT
AGAAAACTGCTGATTCTGGCGAGAGAAACTGGTAGAGAATTAGAGCTGGCTGATATTGAG
ATTGAGCCGGTTTTACCGGCTGAATTTAACGCGGAAGGTGATGTTGCTGCTTTTATGGCT
AATCTGTCTCAGTTAGACGACTTATTCGCGGCTAGAGTTGCTAAAGCGAGAGATGAAGGT
AAAGTGTTACGGTATGTGGGGAATATTGACGAGGATGGGGTTTGTAGAGTTAAAATCGCG
GAAGTGGATGGGAATGATCCGTTATTTAAGGTGAAGAACGGGGAAAACGCGTTAGCGTTT
TATAGCCACTACTACCAGCCGTTACCTTTAGTGTTACGGGGTTATGGGGCTGGTAATGAT
GTTACTGCTGCTGGTGTTTTTGCTGATTTACTGAGAACGTTAAGCTGGAAACTGGGGGTT
...
GCF_000005845.2_ASM584v2_protein.gc35.fna
>NP_414542.1 thr operon leader peptide [Escherichia coli str. K-12 substr. MG1655] | Resulting GC: 39.68%
ATGAAGCGGATTTCTACTACTATTACGACTACTATTACGATTACGACTGGTAATGGTGCT
GGT
>NP_414543.1 fused aspartate kinase/homoserine dehydrogenase 1 [Escherichia coli str. K-12 substr. MG1655] | Resulting GC: 35.08%
ATGCGGGTTTTAAAATTCGGGGGTACTTCTGTTGCTAATGCTGAAAGATTTTTAAGAGTT
GCTGATATTTTAGAGTCTAATGCGAGACAAGGTCAAGTTGCTACTGTTTTATCTGCTCCT
GCTAAAATTACTAATCATTTAGTTGCTATGATTGAAAAAACGATTAGCGGTCAAGATGCT
TTACCTAATATTAGCGATGCTGAAAGAATTTTTGCGGAATTATTAACGGGTTTAGCTGCT
GCTCAACCTGGTTTTCCTTTAGCTCAATTAAAAACTTTTGTTGATCAAGAATTTGCTCAA
ATTAAGCACGTTTTACACGGTATTTCTTTACTGGGTCAATGTCCTGATTCTATTAATGCT
GCTTTAATTTGCAGAGGTGAAAAAATGTCTATTGCGATTATGGCTGGTGTTTTAGAAGCT
AGAGGTCATAATGTTACTGTTATTGATCCTGTTGAAAAATTACTGGCTGTTGGTCATTAT
TTAGAGTCTACTGTTGATATTGCGGAATCTACTAGAAGAATTGCTGCTTCTAGAATTCCT
GCTGATCATATGGTTTTAATGGCTGGTTTTACTGCTGGTAATGAAAAAGGTGAATTAGTT
GTTTTAGGTAGAAATGGTTCTGATTATTCTGCTGCTGTTTTAGCTGCTTGTTTAAGAGCT
GATTGTTGTGAAATTTGGACTGATGTTGATGGTGTTTATACTTGTGATCCTAGACAAGTT
CCTGATGCTAGATTATTAAAATCTATGTCTTATCAAGAAGCTATGGAATTATCTTATTTC
GGGGCTAAAGTTTTACACCCTAGAACTATTACTCCTATTGCTCAATTTCAAATTCCGTGT
TTAATCAAGAACACGGGTAATCCTCAAGCTCCTGGTACTTTAATTGGTGCTTCTAGAGAT
GAAGATGAATTACCTGTTAAAGGTATTTCTAATTTAAATAACATGGCGATGTTTTCTGTT
TCTGGGCCTGGTATGAAAGGTATGGTTGGTATGGCTGCTAGAGTTTTTGCTGCTATGTCT
AGAGCTAGAATTTCTGTTGTTTTAATTACTCAATCTTCTTCTGAATATTCTATTTCTTTT
TGTGTTCCTCAATCTGATTGTGTTAGAGCTGAAAGAGCTATGCAAGAAGAATTTTATTTA
GAACTGAAAGAGGGTTTATTAGAGCCTTTAGCTGTTACTGAAAGATTAGCTATTATTAGC
GTTGTGGGTGATGGTATGAGAACTTTAAGAGGTATTTCTGCTAAATTTTTTGCGGCTTTA
GCTAGAGCTAATATTAATATCGTGGCTATTGCTCAAGGTTCTTCTGAAAGATCTATTTCT
GTTGTTGTTAATAACGACGATGCTACTACTGGTGTTAGAGTTACTCATCAAATGTTATTT
AATACGGACCAAGTTATTGAGGTTTTTGTGATTGGGGTTGGTGGTGTTGGTGGTGCTTTA
TTAGAACAATTAAAAAGACAACAATCTTGGTTAAAAAATAAGCACATTGACTTACGGGTT
TGTGGTGTTGCTAATTCTAAAGCTTTATTAACGAATGTGCATGGTTTAAATCTGGAAAAT
TGGCAGGAAGAATTAGCGCAAGCTAAAGAACCTTTTAATTTAGGGAGATTAATTCGGTTA
GTTAAGGAGTATCACTTACTGAATCCGGTTATTGTTGATTGTACTTCTTCTCAAGCTGTT
GCTGATCAATATGCTGATTTTTTACGGGAAGGTTTTCATGTTGTTACTCCTAATAAAAAA
GCGAATACGTCTTCTATGGATTATTACCACCAATTACGGTATGCTGCTGAAAAATCTAGA
AGAAAATTCCTGTATGACACTAATGTGGGTGCTGGTTTACCTGTTATTGAAAATTTACAA
AATCTGCTGAATGCGGGTGATGAATTAATGAAATTTAGCGGTATTTTAAGCGGTTCTTTA
TCTTACATCTTCGGGAAATTAGACGAAGGTATGTCTTTTTCTGAAGCTACTACTTTAGCT
AGAGAAATGGGTTATACTGAACCTGATCCTAGAGATGATTTATCTGGTATGGATGTTGCT
AGAAAATTATTAATCCTGGCTAGAGAAACTGGTAGAGAATTAGAATTAGCTGATATTGAA
ATTGAGCCGGTTTTACCTGCTGAATTTAATGCTGAAGGTGATGTTGCTGCTTTTATGGCT
AATTTATCTCAATTAGATGATTTATTCGCGGCTAGAGTTGCTAAAGCTAGAGATGAAGGT
AAAGTTTTAAGATATGTTGGTAATATTGACGAAGATGGTGTTTGTAGAGTTAAAATTGCG
GAAGTTGATGGTAATGATCCTTTATTTAAGGTGAAAAACGGGGAAAATGCTTTAGCTTTT
TATAGCCACTATTACCAGCCTTTACCTTTAGTTTTACGGGGTTATGGTGCTGGTAATGAT
GTTACTGCTGCTGGTGTTTTTGCTGATTTATTAAGAACTTTATCTTGGAAATTAGGTGTT
...
GCF_000005845.2_ASM584v2_protein.gc35_report.tsv
id length theoretical_min_gc theoretical_max_gc actual_gc note
NP_414542.1 63 30.158730158730158 63.49206349206349 39.682539682539684
NP_414543.1 2460 30.48780487804878 68.21138211382114 35.081300813008134
NP_414544.1 930 33.01075268817204 70.0 35.16129032258065
NP_414545.1 1284 29.906542056074763 67.91277258566979 35.046728971962615
NP_414546.1 294 37.07482993197279 72.10884353741497 40.476190476190474 desired GC% unreachable; adjusted to theoretical min
NP_414547.1 774 25.839793281653744 64.08268733850129 35.012919896640824
NP_414548.1 1428 31.72268907563025 67.78711484593838 35.22408963585434
NP_414549.1 951 29.02208201892745 66.5615141955836 34.910620399579386
NP_414550.1 585 32.30769230769231 70.25641025641025 35.72649572649573
...
GCF_000005845.2_ASM584v2_protein.gc45_report.tsv
id length theoretical_min_gc theoretical_max_gc actual_gc note
NP_414542.1 63 30.158730158730158 63.49206349206349 49.2063492063492
NP_414543.1 2460 30.48780487804878 68.21138211382114 45.040650406504064
NP_414544.1 930 33.01075268817204 70.0 44.946236559139784
NP_414545.1 1284 29.906542056074763 67.91277258566979 45.01557632398754
NP_414546.1 294 37.07482993197279 72.10884353741497 45.23809523809524
NP_414547.1 774 25.839793281653744 64.08268733850129 44.96124031007752
NP_414548.1 1428 31.72268907563025 67.78711484593838 45.02801120448179
NP_414549.1 951 29.02208201892745 66.5615141955836 44.900105152471085
NP_414550.1 585 32.30769230769231 70.25641025641025 44.95726495726496
...
GCF_000005845.2_ASM584v2_protein.gc55_report.tsv
id length theoretical_min_gc theoretical_max_gc actual_gc note
NP_414542.1 63 30.158730158730158 63.49206349206349 57.14285714285714
NP_414543.1 2460 30.48780487804878 68.21138211382114 55.00000000000001
NP_414544.1 930 33.01075268817204 70.0 54.83870967741935
NP_414545.1 1284 29.906542056074763 67.91277258566979 54.82866043613706
NP_414546.1 294 37.07482993197279 72.10884353741497 55.44217687074829
NP_414547.1 774 25.839793281653744 64.08268733850129 55.03875968992248
NP_414548.1 1428 31.72268907563025 67.78711484593838 54.971988795518214
NP_414549.1 951 29.02208201892745 66.5615141955836 54.99474237644585
NP_414550.1 585 32.30769230769231 70.25641025641025 55.042735042735046
...