アライメントをスクラッチで実装
遺伝子解析をする際、短い遺伝子断片をつなぎ合わせて長い一つの遺伝子につなぎ直す。
この作業をアライメントという。
今回はその配列をつなぐ処理をスクラッチで実装した。
なお、この記事はCourseraのBioinformatics IIという講座を参考にしている。
また、グラフ理論に基づいたオイラー路の探索などはYoutubeなどを参考にした。
目次
- Solve string spell
- Overlap graph
- DeBruijin Graph
- Check Euler
- Euler Path
- Euler Ordering
- Solve Spell
- Pairのパターン
- Paired DeBruijin
- Paired DeBruijin check euler & euler path
- Convert
- String Spell By Gapped
Solve String Spell
一つづつずれている複数の配列が得られた時、つなぎ直す処理。
また、その配列は順番に並んでいる。
text = "ACCGA CCGAA CGAAG GAAGC AAGCT"
text = text.split()
def SolveStringSpell(text):
pattern = ""
for pos, kmer in enumerate(text):
if pos == 0:
pattern += kmer
else:
pattern += kmer[-1]
return pattern
SolveStringSpell(text)
>'ACCGAAGCT'
Overlap graph
一つづつズレている複数の配列で、順番に並んでいない時の処理。
どの配列がどの配列と繋がるのかを、表で返す関数
text = "ATGCG GCATG CATGC AGGCA GGCAT GGCAC"
text = text.split()
def Prefix(pattern):
return pattern[:-1]
def Suffix(pattern):
return pattern[1:]
def OverLap(text):
OverLap = {}
for pattern in text:
for kmer in text:
if pattern == kmer:
continue
if Suffix(pattern) == Prefix(kmer):
if pattern in OverLap:
OverLap[pattern].append(kmer)
else:
OverLap[pattern] = [kmer]
return OverLap
OverLap(text)
>{'GCATG': ['CATGC'],
'CATGC': ['ATGCG'],
'AGGCA': ['GGCAT', 'GGCAC'],
'GGCAT': ['GCATG']}
DeBruijin Graph
Overlap graphのように、配列同士の繋ぎ方を返す処理。
data = "CTTA ACCA TACC GGCT GCTT TTAC"
data = data.split()
def sort_dict(data):
sort_data = sorted(data.items())
submit = {}
for i in range(len(sort_data)):
submit[sort_data[i][0]] = sort_data[i][1]
return submit
def Bruijin(text):
Bruijn_Graph = {}
for pattern in text:
kmer = pattern[:-1]
if kmer in Bruijn_Graph:
Bruijn_Graph[kmer].append(pattern[1:])
else:
Bruijn_Graph[kmer] = [pattern[1:]]
Bruijn_Graph = sort_dict(Bruijn_Graph)
return Bruijn_Graph
graph = Bruijin(data)
graph
>{'ACC': ['CCA'],
'CTT': ['TTA'],
'GCT': ['CTT'],
'GGC': ['GCT'],
'TAC': ['ACC'],
'TTA': ['TAC']}
Check Euler
オイラーグラフになっているかどうか返す関数
もしstart,endがあればそのノードも返す
def Check_Euler_Path(graph):
import copy
from collections import Counter
in_g = {}
out_g = {}
start_node = None
end_node = None
for key, values in graph.items():
out_g[key] = len(values)
d = Counter(values)
for k, v in d.items():
if k in in_g:
in_g[k] += 1
else:
in_g[k] = 1
keys = set(list(in_g.keys()) + list(out_g.keys()))
for k in keys:
if (k in out_g) and (k in in_g):
if in_g[k] != out_g[k]:
if out_g[k] - in_g[k] == 1:
start_node = k
elif in_g[k] - out_g[k] == 1:
end_node = k
else:
print("False")
elif (k not in out_g) and (k in in_g):
end_node = k
elif (k in out_g) and (k not in in_g):
start_node = k
else:
print("debug")
return start_node, end_node
start_node, end_node = Check_Euler_Path(graph)
start_node, end_node
>('GGC', 'CCA')
Euler Path
オイラーパスを求める関数
def Euler_Path(graph, start_node, end_node):
import copy
import random
from collections import Counter
if start_node != None and end_node != None:
if end_node in graph:
graph[end_node].append(start_node)
else:
graph[end_node] = [start_node]
original_graph = copy.deepcopy(graph)
stuck = []
circuit = []
location = random.choice(list(graph.keys()))
while True:
edge_list = graph[location]
if len(edge_list) == 0:
v = stuck.pop()
circuit.append(str(v))
location = v
else:
edge = edge_list[0]
graph[location].pop(0)
stuck.append(location)
location = edge
if len(stuck) == 0:
rest_of_edges = sum(len(v) for v in graph.values())
if rest_of_edges == 0:
circuit.reverse()
return circuit
else:
graph = copy.deepcopy(original_graph)
location = random.choice(list(graph.keys()))
stuck = []
circuit = []
euler_path = Euler_Path(graph, start_node, end_node)
euler_path
>['GCT', 'CTT', 'TTA', 'TAC', 'ACC', 'CCA', 'GGC']
Euler Ordering
オイラーパスをstartからendまで順番に並べる
def EulerOrdering(result, start_node, end_node):
new_list = result
for i in range(len(result)):
if result[i] == str(end_node) and result[i+1] == str(start_node):
new_list = result[i+1:] + result[:i+1]
return new_list
euler_orderd = EulerOrdering(euler_path, start_node, end_node)
euler_orderd
>['GGC', 'GCT', 'CTT', 'TTA', 'TAC', 'ACC', 'CCA']
Solve Spell
先ほどの関数で文字列を並べる
result = SolveStringSpell(euler_orderd)
result
>'GGCTTACCA'
Pairのパターン
得られた配列の断片がpairのパターン
Paired DeBruijin
sample_pair_data = "GAGA|TTGA TCGT|GATG CGTG|ATGT TGGT|TGAG GTGA|TGTT GTGG|GTGA TGAG|GTTG GGTC|GAGA GTCG|AGAT".split()
input_data = []
for pair_data in sample_pair_data:
input_data.append(pair_data.split("|"))
def Paired_Bruijin(input_data):
Bruijin_Graph = {}
for pattern in input_data:
left_key = pattern[0][:-1]
right_key = pattern[1][:-1]
left_value = pattern[0][1:]
right_value = pattern[1][1:]
keys = left_key + "|" + right_key
values = left_value + "|" + right_value
if keys in Bruijin_Graph:
Bruijin_Graph[keys].append(values)
else:
Bruijin_Graph[keys] = [values]
Bruijin_Graph = sort_dict(Bruijin_Graph)
return Bruijin_Graph
graph = Paired_Bruijin(input_data)
graph
>{'CGT|ATG': ['GTG|TGT'],
'GAG|TTG': ['AGA|TGA'],
'GGT|GAG': ['GTC|AGA'],
'GTC|AGA': ['TCG|GAT'],
'GTG|GTG': ['TGG|TGA'],
'GTG|TGT': ['TGA|GTT'],
'TCG|GAT': ['CGT|ATG'],
'TGA|GTT': ['GAG|TTG'],
'TGG|TGA': ['GGT|GAG']}
Paired DeBruijin Check Euler & Euler Path
start_node, end_node = Check_Euler_Path(graph)
start_node, end_node
>('GTG|GTG', 'AGA|TGA')
euler_path = Euler_Path(graph, start_node, end_node)
euler_path
>['TCG|GAT',
'CGT|ATG',
'GTG|TGT',
'TGA|GTT',
'GAG|TTG',
'AGA|TGA',
'GTG|GTG',
'TGG|TGA',
'GGT|GAG',
'GTC|AGA']
euler_orderd = EulerOrdering(euler_path, start_node, end_node)
euler_orderd
>['GTG|GTG',
'TGG|TGA',
'GGT|GAG',
'GTC|AGA',
'TCG|GAT',
'CGT|ATG',
'GTG|TGT',
'TGA|GTT',
'GAG|TTG',
'AGA|TGA']
Convert
表示形式の変更
def Convert(circuit):
strings = []
for c in circuit:
strings.append(c.split("|"))
return strings
euler_converted = Convert(euler_orderd)
euler_converted
>[['GTG', 'GTG'],
['TGG', 'TGA'],
['GGT', 'GAG'],
['GTC', 'AGA'],
['TCG', 'GAT'],
['CGT', 'ATG'],
['GTG', 'TGT'],
['TGA', 'GTT'],
['GAG', 'TTG'],
['AGA', 'TGA']]
String Spell By Gapped
ギャップがある場合の文字列の再構成
# kmer
k = 4
# gap
d = 2
def StringSpelledByGappedPatterns(text, k, d, l):
n = len(text)
read_1 = ""
read_2 = ""
for i in range(n - 1):
read_1 += text[i][0][0]
read_2 += text[i][1][0]
read_1 += text[-1][0]
read_2 += text[-1][1]
str_length = k + d + k + l - 1
read_length = len(read_1)
unmatch_length = str_length - read_length
if read_1[unmatch_length:] == read_2[:read_length-unmatch_length]:
result = read_1 + read_2[read_length-unmatch_length:]
return result
else:
return "UnMatch!"
result = StringSpelledByGappedPatterns(euler_converted, k, d, len(input_data))
result
>'GTGGTCGTGAGATGTTGA'