SACA-Kとは線形時間で接尾辞配列を構築するprogramです。しかし最新のsaisほど高速ではないし、使い勝手も悪いという有り様…。今更こんな骨董品紹介せんでも…。
どこが使いにくいかと言うと、接尾辞配列の構築元となる文字列にNULL文字が1個だけ末尾に必要という点です。この点を強引に何とかします。
実装編
本家はC++で書かれていますが、これは例によってJavaScript版です。ついでなのでBWT構築関数も搭載。
/*
saca_k1_127(A)
@A: Array/Int8Array. item must be number 1-127
@return: suffix array
saca_k0_255(A)
@A: Array/Int16Array. item must be number 0-255
@return: suffix array
saca_k.BWTe(T,U)
@T: Array. item must be number 0-255
@U: Array/TypedArray. the length must be the same as T
@return:
U is Array/Typed Array: primary index
otherwise: bwt array(last several items are primary index)
*/
const EMPTY=1<<31;
function calculate_bucket_start(T,B,K,n){
B.fill(0,0,K);
for(var i=n,b,c=0;i;)B[T[--i]]++;
for(;i<K;c+=b)b=B[i],B[i++]=c
}
//Used for calculating the end of the bucket
function calculate_bucket_end(T,B,K,n){
B.fill(0,0,K);
for(var i=n,c=-1;i;)B[T[--i]]++;
for(;i<K;B[i++]=c)c+=B[i]
}
//Used for inducing the sort of the L type suffixes
function induced_sort_L_0(T,SA,B,n){
for(let i=0,j,c,s;i<n;)
if((s=SA[i++])>0){
c=T[j=s-1];
if(c>=T[s])SA[B[c]++]=j
}
}
//Used for inducing the sort of the S type suffixes
function induced_sort_S_0(T,SA,B,n){
for(let i=n,j,c,d,s;i;)
if((s=SA[--i])>0){
c=T[j=s-1];d=T[s];
if(c<d||c===d&&B[c]<i)SA[B[c]--]=j
}
}
//Used for shifting the elements in given suffix array from the start in the oposite of given directon
function shift(SA,start,direction){
for(var i=start,p=SA[i],a;a=SA[i-=direction],SA[i]=p,a>=0;)p=a;
SA[start]=EMPTY;
return i
}
//Used for storing of the suffix.Type of suffix is determined by direction
function store_suffix(SA,n,i,c,d){
let s=SA[c];
if(s===EMPTY){d+=c;
if(d<n&&d>0&&SA[d]===EMPTY)SA[d]=i,SA[c]=-1;
else SA[c]=i;
return-1
}
if(s>=0){
s=shift(SA,c,d);d+=c;
if(d<n&&d>0&&SA[d]===EMPTY)SA[d]=i,SA[c]=-1;
else SA[c]=i;
return s
}
if(SA[s=c-s*d+d]===EMPTY){
SA[s]=i;--SA[c];
return-1
}
n=shift(SA,s-=d,d);SA[s]=i;
return n
}
//Used for inducing the sort of the S type suffixes
function induced_sort_S_1(T,SA,n){
for(let i=n,s;i;)
if((s=SA[--i])>0){
let j=SA[i]-1,c=T[j],d=T[s];
if(c<d||c===d&&c>i)s=store_suffix(SA,n,j,c,-1),s^-1&&s>i&&i++
}
}
//Used for inducing the sort of the L type suffixes
function induced_sort_L_1(T,SA,n){
for(var i=0,j,s;i<n;++i)
if((s=SA[i])>0){
let c=T[j=s-1],d=T[s];
if(c>=d){
c=store_suffix(SA,n,j,c,1);
if((s>n||(d<(s=T[s+1])||d===s&&d>=i))&&i)SA[i]=EMPTY;
c^-1&&c<i&&i--
}
}
for(i=0;++i<n;)
if((s=SA[i])<0&&s^EMPTY){
for(j=i,s=i-s;j<s;)SA[j]=SA[++j];
SA[s]=EMPTY
}
}
//lms naming
function reduce_string(SA,T,n,lms_count){
let a=n-(n>>1),b=n-1,i=a,l,m=1,p,r=0,s,u=1;
for(;i<n;)SA[i++]=EMPTY;
for(SA[SA[b]=0]=i=1;i<lms_count;b=s){
for(l=2,p=s=SA[i++];T[++p]>0;)l++;
p=a+(s-(n&1)>>1);
if(m===l)for(;m--&&T[m+b]===T[m+s];);
if(m<0)SA[SA[p]=r]++;
else SA[SA[p]=r+=SA[r]]=1,u++;
m=l
}
for(l=r=0,i=n-1;i>a;)SA[--i]^EMPTY?SA[i+r]=SA[i]:r++;
for(p=SA[i=n-2],n-=lms_count;i>n;p=s){
s=SA[--i];
if(l=s<p||s===p&&l)SA[i]+=SA[SA[i]]-1
}return u
}
function count_and_set_lms(T,n){
for(var i=n-2,c=1,u,t;i>0;u=t){
t=T[i]>T[--i]||T[i]===T[i+1]&&u;
if(!t&&u)T[i+1]=-T[i+1],c++
}return c
}
function unset_lms(T,n){
for(let i=n-2;i>0;)if(T[--i]<0)T[i]=-T[i]
}
function compact_lms(SA,T,n){
SA[0]=n-1;
for(let i=0,j=1,s;++i<n;)if(T[s=SA[i]]<0)SA[j++]=s
}
function induce_lms(SA,U,lms_count,T,n){
let i=n-2,l=lms_count-2,u,c=T[i];
for(;i>0;){
let d=T[--i],t=d<c||d===c&&u;
if(!t&&u)U[l--]=i+1;
u=t;c=d
}
for(i=1;i<lms_count;)SA[i]=U[SA[i++]]
}
//saca-k
function saca_k(T,SA,K,n){
let B=new Int32Array(K);
calculate_bucket_end(T,B,K,n);
//Used for inducing the sort of the LMS substrings
SA.fill(EMPTY,0,n);SA[0]=n-1;
for(let i=n-2,c=T[i],d,u,t;i>0;c=d){
d=T[--i],t=d<c||d===c&&u;
if(!t&&u)SA[B[c]--]=i+1;
u=t
}
calculate_bucket_start(T,B,K,n);
induced_sort_L_0(T,SA,B,n);
calculate_bucket_end(T,B,K,n);
induced_sort_S_0(T,SA,B,n);
let c=count_and_set_lms(T,n);
compact_lms(SA,T,n);
let r=reduce_string(SA,T,n,c);
unset_lms(T,n);
let U=SA.subarray(n-c);
if(c>r)saca_k_1(U,SA,c);
else for(;r;)SA[U[--r]]=r;
induce_lms(SA,U,c,T,n);
calculate_bucket_end(T,B,K,n);
//Used for putting the sorted LMS-substrings in ther correct bucket in suffix array
SA.fill(EMPTY,c,n);SA[0]=n-1;
for(let i=c,j,b;i>1;SA[B[b]--]=j)b=T[j=SA[--i]],SA[i]=EMPTY;
calculate_bucket_start(T,B,K,n);
induced_sort_L_0(T,SA,B,n);
calculate_bucket_end(T,B,K,n);
induced_sort_S_0(T,SA,B,n)
}
//Used for deeper recursion calls in generating suffix array.
function saca_k_1(T,SA,n){
//Used for inducing the sort of the LMS substrings
SA.fill(EMPTY,0,n);
let i=n-2,c=T[i],d,s,t;
for(SA[0]=n-1;i>0;c=d)
d=T[--i],t=d<c||d===c&&s,
!t&&s&&store_suffix(SA,n,i+1,c,-1),s=t
for(i=0;++i<n;)
if((s=SA[i])<0&&s^EMPTY){
for(s+=c=i;c>s;)SA[c]=SA[--c];
SA[c]=EMPTY
}
induced_sort_L_1(T,SA,n);
induced_sort_S_1(T,SA,n);
c=count_and_set_lms(T,n);
compact_lms(SA,T,n);
d=reduce_string(SA,T,n,c);
unset_lms(T,n);
t=SA.subarray(n-c);
if(c>d)saca_k_1(t,SA,c);
else for(;d;)SA[t[--d]]=d;
induce_lms(SA,t,c,T,n);
//Used for putting the sorted LMS-substrings in ther correct bucket in suffix array
SA.fill(EMPTY,i=c,n);SA[0]=n-1;
for(d=t=0;i>1;SA[c^d?t=d=c:--t]=s)
c=T[s=SA[--i]],SA[i]=EMPTY;
induced_sort_L_1(T,SA,n);
induced_sort_S_1(T,SA,n)
}
function saca_k1_127(A){
for(var z=A.length,a=z++,T=new Int8Array(z),S=new Int32Array(z);a;)T[--a]=A[a];
saca_k(T,S,128,z);
return S.subarray(1)
}
function saca_k0_255(A){
for(var z=A.length,a=z++,T=new Int16Array(z),S=new Int32Array(z);a;)T[--a]=A[a]+1;
saca_k(T,S,257,z);
return S.subarray(1)
}
saca_k.BWTe=function(T,U){
var a=U instanceof Array||U&&U.buffer,i=0,n=T.length,p,SA=saca_k0_255(T),B=a?U:[];
for(B[0]=T[n-1];p=SA[i++];)B[i]=T[--p];
for(p=i;i<n;)B[i]=T[--SA[i++]];
if(a)return p;--p;
for(a=n<257?1:n<65537?2:3;a--;p>>=8)B[i++]=p&255;
return B
};
saca_k.BWTd=function(A,p,k){
k=k>>>0||255;
var a=0,b=++k,c,B=A.length,s=0,C=[],P=[];
if(!(p>-1))p=B<258?A[--B]:B<65539?A[--B]<<8|A[--B]:A[--B]<<16|A[--B]<<8|A[--B],A.length=B,p++;
while(b)C[--b]=0;
while(b<B)P[b]=C[A[b++]]++;
for(B=Array(b);s<b;s+=c)c=C[a],C[a++]=s;
for(c=0;b;c<p&&c++)c=P[c]+C[B[--b]=A[c]];
return B
}
とりあえず接尾辞配列が欲しければsaca_k0_255の第一引数に文字列を数値配列化したArrayもしくはTypedArrayを与えておけば良い。
test
let T=Array.from("That that is is that that is not is not is that it it is",a=>a.charCodeAt()),
bwt=[],
sa=saca_k0_255(T),
idx=saca_k.BWTe(T,bwt),
unbwt=saca_k.BWTd(bwt,idx);
document.write("SA ",sa,"<br>idx",idx,"<br>BWT ",bwt,"<br>unBWT ",unbwt)