関連 http://qiita.com/mpyw/items/a4495d476ea9ffe54e16
関連 http://qiita.com/7of9/items/ca66015e2933202decc9
関連 http://qiita.com/7of9/items/f2513d05e65eb3b201c2
Damerau–Levenshtein distance関連
v0.2
参考 https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
stackoverflowのコードを変更して、4つの操作において特定の文字列に対するコストの緩急をつけようとしている。
damlev.c
/*
v0.2 2016 Oct. 21
- use cost from functions given below
- add calc_cost_transposition()
- add calc_cost_substitution()
- add calc_cost_deletion()
- add calc_cost_insertion()
- increase size of DA[] for 128x128
v0.1 2016 Oct. 19
- branched from stackoverflow post(*1)
[*1] http://stackoverflow.com/questions/10727174/damerau-levenshtein-distance-edit-distance-with-transposition-c-implementation
*/
# define USE_COST_FUNC
# define d(i,j) dd[(i) * (m+2) + (j) ]
# define min(x,y) ((x) < (y) ? (x) : (y))
# define min3(a,b,c) ((a)< (b) ? min((a),(c)) : min((b),(c)))
# define min4(a,b,c,d) ((a)< (b) ? min3((a),(c),(d)) : min3((b),(c),(d)))
int dprint(int* dd, int n,int m){
int i,j;
for (i=0; i < n+2;i++){
for (j=0;j < m+2; j++){
printf("%02d ",d(i,j));
}
printf("\n");
}
printf("\n");
return 0;
}
int calc_cost_insertion(char s)
{
if (s == 'c' || s == 'r') {
return 334;
}
return 1;
}
int calc_cost_deletion(char s)
{
if (s == 'c' || s == 'r') {
return 334;
}
return 1;
}
int calc_cost_substitution(char s, char t)
{
if (s == 'c' && t == 'r') {
return 334;
}
return 1;
}
int calc_cost_transposition(char s, char t)
{
if (s == 'c' && t == 'r') {
return 334;
}
return 1;
}
int dldist2(char *s, char* t, int n, int m) {
int *dd;
int i, j, i1,j1,cost,DB;
int result_cost;
int INFINITY = n + m;
// int DA[256 * sizeof(int)];
int DA[128 * 128 * sizeof(int)];
int cost_ins; // insert_costs
int cost_del; // delete_costs
int cost_sub; // substitute_costs
int cost_tra; // transpose_costs
memset(DA, 0, sizeof(DA));
if (!(dd = (int*) malloc((n+2)*(m+2)*sizeof(int)))) {
return -1;
}
d(0,0) = INFINITY;
for(i = 0; i < n+1; i++) {
d(i+1,1) = i ;
d(i+1,0) = INFINITY;
}
for(j = 0; j<m+1; j++) {
d(1,j+1) = j ;
d(0,j+1) = INFINITY;
}
// dprint(dd,n,m);
for(i = 1; i< n+1; i++) {
DB = 0;
for(j = 1; j< m+1; j++) {
i1 = DA[t[j-1]];
j1 = DB;
cost = ((s[i-1]==t[j-1])?0:1);
cost_ins = calc_cost_insertion(s[i+1-1]); // or t[]???
cost_del = calc_cost_deletion(s[i-1]); // or t[]???
cost_sub = calc_cost_substitution(s[i-1], t[j-1]);
cost_tra = calc_cost_transposition(s[i1 - 1], t[j1 - 1]);
if(cost==0) DB = j;
# ifdef USE_COST_FUNC
d(i+1,j+1) =
min4(
d(i,j) + cost_sub + cost, // substitution
d(i+1,j) + cost_ins, // insertion
d(i,j+1) + cost_del, // deletion
d(i1,j1) + cost_tra // transposition
);
# else // USE_COST_FUNC
d(i+1,j+1) =
min4(d(i,j)+cost, // substitution
d(i+1,j) + 1, // insertion
d(i,j+1)+1, // deletion
d(i1,j1) + (i-i1-1) + 1 + (j-j1-1)); // transposition
# endif // USE_COST_FUNC
}
DA[s[i-1]] = i;
// dprint(dd,n,m);
}
result_cost = d(n+1,m+1);
free(dd);
return result_cost;
}
int main()
{
char *inpstr = "composr"; // input
char *expstr = "composer"; // expected
int dist;
dist = dldist2(inpstr, expstr, strlen(inpstr), strlen(expstr));
printf("%d\n", dist);
return 0;
}
実装の確かさは未確認。
疑問点
- insertionのcostチェックするのはinput文字列か、expected文字列か?
- deletionのcostチェックするのはinput文字列か、expected文字列か?
-
d(i,j) + cost_sub + cost, // substitution
において+ cost
は余分か?