More than 1 year has passed since last update.

遺伝子塩基配列の検索をスクラッチで実装1

バイオインフォマティクス

Posted at 2022-05-22

Motif Finding

遺伝子の中で既知の配列を見つけ出す際には、その配列が遺伝子の中にあるかどうか判定すれば良い。
しかし、パターンが未知である場合や変異が含まれている場合は別の方法を使う必要がある。
その方法の一つにMotifFindingがある。
これは、複数の配列を比較しながら類似しているパターンを見つけ出す方法である。
そこで今回はMotifiFindigをスクラッチで実装した。
なお、これらコード等を作成するにあたりCourseraのBioinformatics Iという講座を参考にした。

# サンプルデータ
sample_genome = "GATATATGCATATACTT"
sample_pattern = "ATAT"

# パターンマッチ関数
def PatternMatch(text, pattern):
    n = len(text)
    k = len(pattern)
    positions = []
    for i in range(n-k+1):
        ex = text[i:i+k]
        if ex == pattern:
            positions.append(i)
    return positions
PatternMatch(sample_genome, sample_pattern)
>[1, 3, 9]

Hamming Distance

2つの配列を比較し、その配列の類似度の定義にHamming Distanceがある
これは単純に2つ配列を比較した時異なる文字の数である。

# サンプルデータ
p = "GGGCCGTTGGT"
q = "GGACCGTTGAC"

# Hamming Distance関数
def HammingDistance(p, q):
    hamming_distance = 0
    for i in range(len(p)):
        if p[i] != q[i]:
            hamming_distance += 1
    return hamming_distance
HammingDistance(p, q)
>3

Approximate Pattern Match

配列同士を比較した際、任意の数一致しないことを許容した上でパターンを探す

# サンプルデータ
text = "CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAAT"
# パターン
pattern = "ATTCTGGA"
# 一致しないことを許容する文字数
d = 3

def ApproximatePatternMatching(text, pattern, d):
    k = len(pattern)
    n = len(text)
    positions = []
    for i in range(n - k + 1):
        ex = text[i:i+k]
        h_distance = HammingDistance(ex, pattern)
        if h_distance <= d:
            positions.append(i)
    return positions
ApproximatePatternMatching(text, pattern ,d)
>[6, 7, 26, 27]

Median String

Hamming Distanceが最小になるように、複数配列の中で一致するパターンを見つける。

k = 3
Dna = "AAATTGACGCAT GACGACCACGTT CGTCAGCGCCTG GCTGAGCACCGG AGTTCGGGACAG"
Dna = Dna.split()

def median_string(DNAs,k):
    kmerlist = []
    for string in DNAs:
        for i in range(0,len(string)-k+1):
            pattern = string[i:i+k]
            if pattern not in kmerlist:
                kmerlist.append(pattern)
    for kmer in kmerlist:
        score = 0
        for string in DNAs:
            min_distance=1000
            for i in range(0,len(string)-k+1):
                pattern = string[i:i+k]
                distance = HammingDistance(kmer,pattern)
                if min_distance >= distance:
                    min_distance = distance
            score += min_distance
            
        if kmer == kmerlist[0]:
            bestscore = score
            median = kmer
        if bestscore >= score:
            bestscore = score
            median = kmer
    return median,bestscore
median_string(Dna, 3)
>('GAC', 2)

その他のMotif Findings

Profile関数

複数配列の中で、文字の出現率を表にする

Dna = "GGCGTTCAGGCA AAGAATCAGTCA CAAGGAGTTCGC CACGTCAATCAC CAATAATATTCG"
Dna = Dna.split()


def CountWithPseudocounts(Motifs):
    k = len(Motifs[0])
    count = {'A':[1]*k,'C':[1]*k,'G':[1]*k,'T':[1]*k}

    t = len(Motifs)
    for i in range(t):
        for j in range(k):
            count[Motifs[i][j]][j] += 1
    return count

def ProfileWithPseudocounts(Motifs):
    t = len(Motifs)
    k = len(Motifs[0])
    profile = CountWithPseudocounts(Motifs)
    
    for i in range(k):
        su=0
        for symbol in "ACGT":
            su += profile[symbol][i]
        for symbol in "ACGT":
            profile[symbol][i] = round(profile[symbol][i]/su, 2)
            
    return profile
ProfileWithPseudocounts(Dna)
>{'A': [0.22, 0.56, 0.33, 0.22, 0.33, 0.33, 0.22, 0.56, 0.11, 0.11, 0.22, 0.33],
 'C': [0.44, 0.11, 0.33, 0.11, 0.11, 0.22, 0.33, 0.11, 0.11, 0.33, 0.44, 0.33],
 'G': [0.22, 0.22, 0.22, 0.44, 0.22, 0.11, 0.22, 0.11, 0.33, 0.22, 0.22, 0.22],
 'T': [0.11, 0.11, 0.11, 0.22, 0.33, 0.33, 0.22, 0.22, 0.44, 0.33, 0.11, 0.11]}

Greedy Motif Search

k = 3
t = 5
Dna = "GGCGTTCAGGCA AAGAATCAGTCA CAAGGAGTTCGC CACGTCAATCAC CAATAATATTCG"
Dna = Dna.split()

def GreedyMotifSearchWithPseudocounts(Dna, k, t):
    BestMotifs = []
    for i in range(0, t):
        BestMotifs.append(Dna[i][0:k])
    n = len(Dna[0])
    for m in range(n-k+1):
        Motifs = []
        Motifs.append(Dna[0][m:m+k])
        for j in range(1, t):
            P = ProfileWithPseudocounts(Motifs[0:j])
            Motifs.append(ProfileMostProbablePattern(Dna[j], k, P))
        if Score(Motifs) < Score(BestMotifs):
            BestMotifs = Motifs
    return BestMotifs

def CountWithPseudocounts(Motifs):
    k = len(Motifs[0])
    count = {'A':[1]*k,'C':[1]*k,'G':[1]*k,'T':[1]*k}

    t = len(Motifs)
    for i in range(t):
        for j in range(k):
            count[Motifs[i][j]][j] += 1
    return count

def ProfileWithPseudocounts(Motifs):
    t = len(Motifs)
    k = len(Motifs[0])
    profile = CountWithPseudocounts(Motifs)
    
    for i in range(k):
        su=0
        for symbol in "ACGT":
            su += profile[symbol][i]
        for symbol in "ACGT":
            profile[symbol][i] = profile[symbol][i]/su
            
    return profile

def Pr(Text, Profile):
    # insert your code here
    count = 1
    for i, key in enumerate(Text):
        count *= Profile.get(key)[i]
    return count

def Score(Motifs):
    consensus = Consensus(Motifs)
    count = 0
    for motif in Motifs:
        for idx in range(len(motif)):
            if motif[idx] != consensus[idx]:
                count += 1
    return count

def ProfileMostProbablePattern(Text, k, Profile):
    most_prob = Text[0:k] 
    p_max = Pr(Text[0:k], Profile)
    for i in range(1, len(Text) - k + 1):
         if Pr(Text[i:i+k], Profile) > p_max:
                p_max = Pr(Text[i:i+k], Profile)
                most_prob = Text[i:i+k]        
    return most_prob

def Consensus(Motifs):
    k = len(Motifs[0])
    count = CountWithPseudocounts(Motifs)
    consensus = ""
    for j in range(k):
        m = 0
        frequentSymbol = ""
        for symbol in "ACGT":
            if count[symbol][j] > m:
                m = count[symbol][j]
                frequentSymbol = symbol
        consensus += frequentSymbol
        
    return consensus
GreedyMotifSearchWithPseudocounts(Dna, k, t)
>TTC ATC TTC ATC TTC

Randomized Motif Finding

import random 
Dna = \
"CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA \
GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG \
TAGTACCGAGACCGAAAGAAGTATACAGGCGT \
TAGATCAAGTTTCAGGTGCACGTCGGTGAACC \
AATCCACCAGCTCCACGTGCAATGTTGGCCTA"
Dna = Dna.split()
k = 8
t = 5

def ProfileProbMotif(Dna, k, profile):
    profile_motif = []
    for i in range(len(Dna)):
        profile_motif.append(ProfileMostProbablePattern(Dna[i], k, profile))
    return profile_motif

def RandomMotif(Dna, k, t):
    Motif = []
    Upper = len(Dna[0]) - k
    for i in range(t):
        pos = random.randint(0, Upper)
        Motif.append(Dna[i][pos:pos+k])
    return Motif

def RandomizedMotifsSearch(Dna, k, t):
    M = RandomMotif(Dna, k, t)
    BestMotifs = M
    while True:
        current_profile = ProfileWithPseudocounts(M)
        M = ProfileProbMotif(Dna, k, current_profile)
        if Score(M) < Score(BestMotifs):
            BestMotifs = M
        else:
            return BestMotifs
        
def RepeatedRandomizedMotifSearch(Dna, k, t):
    BestScore = float("inf")
    BestMotifs = []
    for i in range(1000):
        Motif = RandomizedMotifsSearch(Dna, k, t)
        current_score = Score(Motif)
        if current_score < BestScore:
            BestScore = current_score
            BestMotifs = Motif
    return BestMotifs
RepeatedRandomizedMotifSearch(Dna, k, t)
>['TCTCGGGG', 'CCAAGGTG', 'TACAGGCG', 'TTCAGGTG', 'TCCACGTG']

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up