編集距離を算出するONPの実装をしたので備忘録
間違ってたので修正したついでに、処理速度も意識し修正した備忘
多くの情報でsnake関数として外だしされていたけどPythonだと関数呼び出しがバカにならないので
処理本体に組み込んだ。ただfor loopがそもそも遅いのだろう。
しょうがないのでC++で今は実装しなおしする。
O(NP)アルゴリズム
def distance_onp(_a, _b):
# # thank you https://susisu.hatenablog.com/entry/2017/10/09/134032
# # required: a.size() >= b.size()
_A, _B = len(_a), len(_b)
if _A < _B:
A, B, a, b = (_A, _B, _a, _b)
else:
A, B, a, b = (_B, _A, _b, _a)
D = B - A
O = A + 1
fp = [-1] * (A + B + 3)
for P in range(B + 1):
for k in range(-P, D):
y = max(fp[O + k - 1] + 1, fp[O + k + 1])
while y - k < A and y < B and a[y - k] == b[y]:
y += 1
fp[O + k] = y
for k in range(D + P, D, -1):
y = max(fp[O + k - 1] + 1, fp[O + k + 1])
while y - k < A and y < B and a[y - k] == b[y]:
y += 1
fp[O + k] = y
y = max(fp[O + D - 1] + 1, fp[O + D + 1])
while y - D < A and y < B and a[y - D] == b[y]:
y += 1
fp[O + D] = y
if fp[O + D] == B:
break
return D + 2 * P
########### 修正前はこうだった ##############
def distance_onp(_a, _b):
# # thank you https://susisu.hatenablog.com/entry/2017/10/09/134032
# # required: a.size() >= b.size()
_A, _B = len(_a), len(_b)
if _A < _B:
A, B, a, b = (_A, _B, _a, _b)
else:
A, B, a, b = (_B, _A, _b, _a)
D = B - A
O = A + 1
fp = [-1] * (A + B + 3)
def snake(k, y):
while y - k < A and y < B and a[y - k] == b[y]:
y += 1
return y
for P in range(B + 1):
for k in range(-P, D):
fp[O + k] = snake(k, max(fp[O + k - 1] + 1, fp[O + k + 1]))
for k in range(D + P, D, -1):
fp[O + k] = snake(k, max(fp[O + k - 1] + 1, fp[O + k + 1]))
fp[O + D] = snake(D, max(fp[O + D - 1] + 1, fp[O + D + 1]))
if fp[O + D] == B:
break
return D + 2 * P
############################################
ビットパラレル法
ONPよりも文字数制限があるものの速いといわれるビットパラレル法を実装してみた。
置き換えをコスト1と計算するかしないかで2種類の関数を書いた。基本的にWebでよく見られるのは置き換えコスト=1にする方で以下の例だとnoweightの方
ただ自分が欲しいのはONPとも結果が同じになるweightの方でなかなか参考が見つからなかった。
from collections import defaultdict
def levenshtein_distance_bitparallel_weight(a, b):
# # thank you https://stackoverflow.com/questions/65363769/bitparallel-weighted-levenshtein-distance */
# # required: a.size() >= b.size()
N = 64
A = len(a)
B = len(b)
if a == b:
return 0
if B == 0:
return A
if A == 0:
return B
if A == 1 and B == 1:
return 0 if a == b else 2
fp=defaultdict(int)
for i in range(B):
fp[b[i]] |= 1 << i
pv = ~0
dh0 = 0
dh1 = 0
dist = A
ret = 0
i = 0
j = 0
length = 0
while i < A:
# Complement Matches
eq = fp[a[i]] if a[i] in fp else 0 # eq = fp.find(a[i]) != fp.end() ? fp[a[i]] : 0
ne = ~eq
# Finding the vertical values. #Find 1s
initp = (pv & eq)
dv1 = (((initp + pv) ^ pv) ^ initp)
# set RemainingDHneg1
dhN = (pv ^ (dv1 >> 1))
# combine 1s and Matches
dv1sm = (dv1 | eq)
# Find 0s
i0 = (dh0 & dv1sm)
dv0 = (((i0 << 1) + dhN) ^ dhN)
# Find -1s
dvN = ~(dv1 | dv0)
dh0 &= ne
# combine 1s and Matches
dh1eq = (dh1 | eq)
# Find 0s
dh0 = ((dv0 & dh1eq) | (dvN & dh0))
# Find 1s
dh1 = (dvN & dh1eq)
# Find -1s
pv = ~(dh0 | dh1)
i += 1
if i % N == N - 1 or i == A:
length = B % N if (i == A or B < i + N) else N
for j in range(length):
bitmask = 1 << j
dist -= ((dh0 & bitmask) >> j) * 1 + ((dh1 & bitmask) >> j) * 2 - 1
if i == A and j < B - 1:
ret += dist
for j in range(j, B):
k = j % N
bitmask = 1 << k
dist -= ((dh0 & bitmask) >> k) * 1 + ((dh1 & bitmask) >> k) * 2 - 1
ret += dist
dist = A
dh0 = 0
dh1 = 0
return ret
def levenshtein_distance_bitparallel_noweight(a, b):
A = len(a)
B = len(b)
if a == b:
return 0
if B == 0:
return A
if A == 0:
return B
if A == 1 and B == 1:
return 0 if a == b else 1
ONE = 1
OB = ONE << (B - 1)
fp = defaultdict(int)
Mv = 0
Score = B
Pv = 0
for i in range(B):
fp[b[i]] |= ONE << i
Pv = Pv | (ONE << i)
for i in range(A):
Eq = fp[a[i]]
Xv = Eq | Mv
Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq
Ph = Mv | ~(Xh | Pv)
Mh = Pv & Xh
if Ph & OB:
Score += 1
elif Mh & OB:
Score -= 1
Ph <<= ONE
Pv = (Mh << ONE) | ~(Xv | Ph)
Mv = Ph & Xv
return Score
print(levenshtein_distance_bitparallel_weight("cafe", "coffee")) # -> 4
print(levenshtein_distance_bitparallel_noweight("cafe", "coffee")) # -> 3