LoginSignup
0
1

More than 1 year has passed since last update.

Python 編集距離 O(NP)とビットパラレル法

Last updated at Posted at 2021-03-27

編集距離を算出するONPの実装をしたので備忘録

間違ってたので修正したついでに、処理速度も意識し修正した備忘

多くの情報でsnake関数として外だしされていたけどPythonだと関数呼び出しがバカにならないので
処理本体に組み込んだ。ただfor loopがそもそも遅いのだろう。
しょうがないのでC++で今は実装しなおしする。

O(NP)アルゴリズム

def distance_onp(_a, _b):
    # # thank you https://susisu.hatenablog.com/entry/2017/10/09/134032
    # # required: a.size() >= b.size()
    _A, _B = len(_a), len(_b)
    if _A < _B:
        A, B, a, b = (_A, _B, _a, _b)
    else:
        A, B, a, b = (_B, _A, _b, _a)
    D = B - A
    O = A + 1
    fp = [-1] * (A + B + 3)

    for P in range(B + 1):
        for k in range(-P, D):
            y = max(fp[O + k - 1] + 1, fp[O + k + 1])
            while y - k < A and y < B and a[y - k] == b[y]:
                y += 1
            fp[O + k] = y
        for k in range(D + P, D, -1):
            y = max(fp[O + k - 1] + 1, fp[O + k + 1])
            while y - k < A and y < B and a[y - k] == b[y]:
                y += 1
            fp[O + k] = y
        y = max(fp[O + D - 1] + 1, fp[O + D + 1])
        while y - D < A and y < B and a[y - D] == b[y]:
            y += 1
        fp[O + D] = y

        if fp[O + D] == B:
            break
    return D + 2 * P

########### 修正前はこうだった ##############
def distance_onp(_a, _b):
    # # thank you https://susisu.hatenablog.com/entry/2017/10/09/134032
    # # required: a.size() >= b.size()
    _A, _B = len(_a), len(_b)
    if _A < _B:
        A, B, a, b = (_A, _B, _a, _b)
    else:
        A, B, a, b = (_B, _A, _b, _a)
    D = B - A
    O = A + 1
    fp = [-1] * (A + B + 3)

    def snake(k, y):
        while y - k < A and y < B and a[y - k] == b[y]:
            y += 1
        return y

    for P in range(B + 1):
        for k in range(-P, D):
            fp[O + k] = snake(k, max(fp[O + k - 1] + 1, fp[O + k + 1]))
        for k in range(D + P, D, -1):
            fp[O + k] = snake(k, max(fp[O + k - 1] + 1, fp[O + k + 1]))
        fp[O + D] = snake(D, max(fp[O + D - 1] + 1, fp[O + D + 1]))

        if fp[O + D] == B:
            break
    return D + 2 * P
############################################

ビットパラレル法

ONPよりも文字数制限があるものの速いといわれるビットパラレル法を実装してみた。
置き換えをコスト1と計算するかしないかで2種類の関数を書いた。基本的にWebでよく見られるのは置き換えコスト=1にする方で以下の例だとnoweightの方

ただ自分が欲しいのはONPとも結果が同じになるweightの方でなかなか参考が見つからなかった。

from collections import defaultdict

def levenshtein_distance_bitparallel_weight(a, b):
    # # thank you https://stackoverflow.com/questions/65363769/bitparallel-weighted-levenshtein-distance */
    # # required: a.size() >= b.size()
    N = 64
    A = len(a)
    B = len(b)
    if a == b:
        return 0
    if B == 0:
        return A
    if A == 0:
        return B
    if A == 1 and B == 1:
        return 0 if a == b else 2

    fp=defaultdict(int)
    for i in range(B):
        fp[b[i]] |= 1 << i

    pv = ~0
    dh0 = 0
    dh1 = 0
    dist = A
    ret = 0
    i = 0
    j = 0
    length = 0

    while i < A:
        # Complement Matches
        eq = fp[a[i]] if a[i] in fp else 0 # eq = fp.find(a[i]) != fp.end() ? fp[a[i]] : 0
        ne = ~eq

        # Finding the vertical values. #Find 1s
        initp = (pv & eq)
        dv1 = (((initp + pv) ^ pv) ^ initp)

        # set RemainingDHneg1
        dhN = (pv ^ (dv1 >> 1))
        # combine 1s and Matches
        dv1sm = (dv1 | eq)

        # Find 0s
        i0 = (dh0 & dv1sm)
        dv0 = (((i0 << 1) + dhN) ^ dhN)

        # Find -1s
        dvN = ~(dv1 | dv0)
        dh0 &= ne

        # combine 1s and Matches
        dh1eq = (dh1 | eq)
        # Find 0s
        dh0 = ((dv0 & dh1eq) | (dvN & dh0))
        # Find 1s
        dh1 = (dvN & dh1eq)
        # Find -1s
        pv = ~(dh0 | dh1)

        i += 1

        if i % N == N - 1 or i == A:
            length = B % N if (i == A or B < i + N) else N
            for j in range(length):
                bitmask = 1 << j
                dist -= ((dh0 & bitmask) >> j) * 1 + ((dh1 & bitmask) >> j) * 2 - 1

            if i == A and j < B - 1:
                ret += dist
                for j in range(j, B):
                    k = j % N
                    bitmask = 1 << k
                    dist -= ((dh0 & bitmask) >> k) * 1 + ((dh1 & bitmask) >> k) * 2 - 1
            ret += dist
            dist = A
            dh0 = 0
            dh1 = 0
    return ret

def levenshtein_distance_bitparallel_noweight(a, b):
    A = len(a)
    B = len(b)
    if a == b:
        return 0
    if B == 0:
        return A
    if A == 0:
        return B
    if A == 1 and B == 1:
        return 0 if a == b else 1

    ONE = 1
    OB = ONE << (B - 1)
    fp = defaultdict(int)
    Mv = 0
    Score = B
    Pv = 0

    for i in range(B):
        fp[b[i]] |= ONE << i
        Pv = Pv | (ONE << i)

    for i in range(A):
        Eq = fp[a[i]]
        Xv = Eq | Mv
        Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq
        Ph = Mv | ~(Xh | Pv)
        Mh = Pv & Xh
        if Ph & OB:
            Score += 1
        elif Mh & OB:
            Score -= 1
        Ph <<= ONE
        Pv = (Mh << ONE) | ~(Xv | Ph)
        Mv = Ph & Xv
    return Score


print(levenshtein_distance_bitparallel_weight("cafe", "coffee"))   # -> 4
print(levenshtein_distance_bitparallel_noweight("cafe", "coffee")) # -> 3

0
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
1