Motif Finding
遺伝子の中で既知の配列を見つけ出す際には、その配列が遺伝子の中にあるかどうか判定すれば良い。
しかし、パターンが未知である場合や変異が含まれている場合は別の方法を使う必要がある。
その方法の一つにMotifFindingがある。
これは、複数の配列を比較しながら類似しているパターンを見つけ出す方法である。
そこで今回はMotifiFindigをスクラッチで実装した。
なお、これらコード等を作成するにあたりCourseraのBioinformatics Iという講座を参考にした。
目次
- Pattern Match
- Hamming Distance
- Approximate Pattern Match
- Median String
- その他のMotif Findings
- Profile関数
- Greedy Motif Finding
- Randomized Motif Finding
Pattern Match
まず初めに既知の配列を見つけ出すパターンマッチを実装した。
# サンプルデータ
sample_genome = "GATATATGCATATACTT"
sample_pattern = "ATAT"
# パターンマッチ関数
def PatternMatch(text, pattern):
n = len(text)
k = len(pattern)
positions = []
for i in range(n-k+1):
ex = text[i:i+k]
if ex == pattern:
positions.append(i)
return positions
PatternMatch(sample_genome, sample_pattern)
>[1, 3, 9]
Hamming Distance
2つの配列を比較し、その配列の類似度の定義にHamming Distanceがある
これは単純に2つ配列を比較した時異なる文字の数である。
# サンプルデータ
p = "GGGCCGTTGGT"
q = "GGACCGTTGAC"
# Hamming Distance関数
def HammingDistance(p, q):
hamming_distance = 0
for i in range(len(p)):
if p[i] != q[i]:
hamming_distance += 1
return hamming_distance
HammingDistance(p, q)
>3
Approximate Pattern Match
配列同士を比較した際、任意の数一致しないことを許容した上でパターンを探す
# サンプルデータ
text = "CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAAT"
# パターン
pattern = "ATTCTGGA"
# 一致しないことを許容する文字数
d = 3
def ApproximatePatternMatching(text, pattern, d):
k = len(pattern)
n = len(text)
positions = []
for i in range(n - k + 1):
ex = text[i:i+k]
h_distance = HammingDistance(ex, pattern)
if h_distance <= d:
positions.append(i)
return positions
ApproximatePatternMatching(text, pattern ,d)
>[6, 7, 26, 27]
Median String
Hamming Distanceが最小になるように、複数配列の中で一致するパターンを見つける。
k = 3
Dna = "AAATTGACGCAT GACGACCACGTT CGTCAGCGCCTG GCTGAGCACCGG AGTTCGGGACAG"
Dna = Dna.split()
def median_string(DNAs,k):
kmerlist = []
for string in DNAs:
for i in range(0,len(string)-k+1):
pattern = string[i:i+k]
if pattern not in kmerlist:
kmerlist.append(pattern)
for kmer in kmerlist:
score = 0
for string in DNAs:
min_distance=1000
for i in range(0,len(string)-k+1):
pattern = string[i:i+k]
distance = HammingDistance(kmer,pattern)
if min_distance >= distance:
min_distance = distance
score += min_distance
if kmer == kmerlist[0]:
bestscore = score
median = kmer
if bestscore >= score:
bestscore = score
median = kmer
return median,bestscore
median_string(Dna, 3)
>('GAC', 2)
その他のMotif Findings
Profile関数
複数配列の中で、文字の出現率を表にする
Dna = "GGCGTTCAGGCA AAGAATCAGTCA CAAGGAGTTCGC CACGTCAATCAC CAATAATATTCG"
Dna = Dna.split()
def CountWithPseudocounts(Motifs):
k = len(Motifs[0])
count = {'A':[1]*k,'C':[1]*k,'G':[1]*k,'T':[1]*k}
t = len(Motifs)
for i in range(t):
for j in range(k):
count[Motifs[i][j]][j] += 1
return count
def ProfileWithPseudocounts(Motifs):
t = len(Motifs)
k = len(Motifs[0])
profile = CountWithPseudocounts(Motifs)
for i in range(k):
su=0
for symbol in "ACGT":
su += profile[symbol][i]
for symbol in "ACGT":
profile[symbol][i] = round(profile[symbol][i]/su, 2)
return profile
ProfileWithPseudocounts(Dna)
>{'A': [0.22, 0.56, 0.33, 0.22, 0.33, 0.33, 0.22, 0.56, 0.11, 0.11, 0.22, 0.33],
'C': [0.44, 0.11, 0.33, 0.11, 0.11, 0.22, 0.33, 0.11, 0.11, 0.33, 0.44, 0.33],
'G': [0.22, 0.22, 0.22, 0.44, 0.22, 0.11, 0.22, 0.11, 0.33, 0.22, 0.22, 0.22],
'T': [0.11, 0.11, 0.11, 0.22, 0.33, 0.33, 0.22, 0.22, 0.44, 0.33, 0.11, 0.11]}
Greedy Motif Search
k = 3
t = 5
Dna = "GGCGTTCAGGCA AAGAATCAGTCA CAAGGAGTTCGC CACGTCAATCAC CAATAATATTCG"
Dna = Dna.split()
def GreedyMotifSearchWithPseudocounts(Dna, k, t):
BestMotifs = []
for i in range(0, t):
BestMotifs.append(Dna[i][0:k])
n = len(Dna[0])
for m in range(n-k+1):
Motifs = []
Motifs.append(Dna[0][m:m+k])
for j in range(1, t):
P = ProfileWithPseudocounts(Motifs[0:j])
Motifs.append(ProfileMostProbablePattern(Dna[j], k, P))
if Score(Motifs) < Score(BestMotifs):
BestMotifs = Motifs
return BestMotifs
def CountWithPseudocounts(Motifs):
k = len(Motifs[0])
count = {'A':[1]*k,'C':[1]*k,'G':[1]*k,'T':[1]*k}
t = len(Motifs)
for i in range(t):
for j in range(k):
count[Motifs[i][j]][j] += 1
return count
def ProfileWithPseudocounts(Motifs):
t = len(Motifs)
k = len(Motifs[0])
profile = CountWithPseudocounts(Motifs)
for i in range(k):
su=0
for symbol in "ACGT":
su += profile[symbol][i]
for symbol in "ACGT":
profile[symbol][i] = profile[symbol][i]/su
return profile
def Pr(Text, Profile):
# insert your code here
count = 1
for i, key in enumerate(Text):
count *= Profile.get(key)[i]
return count
def Score(Motifs):
consensus = Consensus(Motifs)
count = 0
for motif in Motifs:
for idx in range(len(motif)):
if motif[idx] != consensus[idx]:
count += 1
return count
def ProfileMostProbablePattern(Text, k, Profile):
most_prob = Text[0:k]
p_max = Pr(Text[0:k], Profile)
for i in range(1, len(Text) - k + 1):
if Pr(Text[i:i+k], Profile) > p_max:
p_max = Pr(Text[i:i+k], Profile)
most_prob = Text[i:i+k]
return most_prob
def Consensus(Motifs):
k = len(Motifs[0])
count = CountWithPseudocounts(Motifs)
consensus = ""
for j in range(k):
m = 0
frequentSymbol = ""
for symbol in "ACGT":
if count[symbol][j] > m:
m = count[symbol][j]
frequentSymbol = symbol
consensus += frequentSymbol
return consensus
GreedyMotifSearchWithPseudocounts(Dna, k, t)
>TTC ATC TTC ATC TTC
Randomized Motif Finding
import random
Dna = \
"CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA \
GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG \
TAGTACCGAGACCGAAAGAAGTATACAGGCGT \
TAGATCAAGTTTCAGGTGCACGTCGGTGAACC \
AATCCACCAGCTCCACGTGCAATGTTGGCCTA"
Dna = Dna.split()
k = 8
t = 5
def ProfileProbMotif(Dna, k, profile):
profile_motif = []
for i in range(len(Dna)):
profile_motif.append(ProfileMostProbablePattern(Dna[i], k, profile))
return profile_motif
def RandomMotif(Dna, k, t):
Motif = []
Upper = len(Dna[0]) - k
for i in range(t):
pos = random.randint(0, Upper)
Motif.append(Dna[i][pos:pos+k])
return Motif
def RandomizedMotifsSearch(Dna, k, t):
M = RandomMotif(Dna, k, t)
BestMotifs = M
while True:
current_profile = ProfileWithPseudocounts(M)
M = ProfileProbMotif(Dna, k, current_profile)
if Score(M) < Score(BestMotifs):
BestMotifs = M
else:
return BestMotifs
def RepeatedRandomizedMotifSearch(Dna, k, t):
BestScore = float("inf")
BestMotifs = []
for i in range(1000):
Motif = RandomizedMotifsSearch(Dna, k, t)
current_score = Score(Motif)
if current_score < BestScore:
BestScore = current_score
BestMotifs = Motif
return BestMotifs
RepeatedRandomizedMotifSearch(Dna, k, t)
>['TCTCGGGG', 'CCAAGGTG', 'TACAGGCG', 'TTCAGGTG', 'TCCACGTG']