0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

接尾辞配列召喚program SACA-K

0
Posted at

SACA-Kとは線形時間で接尾辞配列を構築するprogramです。しかし最新のsaisほど高速ではないし、使い勝手も悪いという有り様…。今更こんな骨董品紹介せんでも…。
どこが使いにくいかと言うと、接尾辞配列の構築元となる文字列にNULL文字が1個だけ末尾に必要という点です。この点を強引に何とかします。

実装編

本家はC++で書かれていますが、これは例によってJavaScript版です。ついでなのでBWT構築関数も搭載。

/*
saca_k1_127(A)
@A: Array/Int8Array. item must be number 1-127
@return: suffix array

saca_k0_255(A)
@A: Array/Int16Array. item must be number 0-255
@return: suffix array

saca_k.BWTe(T,U)
@T: Array. item must be number 0-255
@U: Array/TypedArray. the length must be the same as T
@return:
	U is Array/Typed Array: primary index
	otherwise: bwt array(last several items are primary index)
*/
const EMPTY=1<<31;
function calculate_bucket_start(T,B,K,n){
	B.fill(0,0,K);
	for(var i=n,b,c=0;i;)B[T[--i]]++;
	for(;i<K;c+=b)b=B[i],B[i++]=c
}
//Used for calculating the end of the bucket
function calculate_bucket_end(T,B,K,n){
	B.fill(0,0,K);
	for(var i=n,c=-1;i;)B[T[--i]]++;
	for(;i<K;B[i++]=c)c+=B[i]
}
//Used for inducing the sort of the L type suffixes
function induced_sort_L_0(T,SA,B,n){
	for(let i=0,j,c,s;i<n;)
		if((s=SA[i++])>0){
			c=T[j=s-1];
			if(c>=T[s])SA[B[c]++]=j
		}
}
//Used for inducing the sort of the S type suffixes
function induced_sort_S_0(T,SA,B,n){
	for(let i=n,j,c,d,s;i;)
		if((s=SA[--i])>0){
			c=T[j=s-1];d=T[s];
			if(c<d||c===d&&B[c]<i)SA[B[c]--]=j
		}
}
//Used for shifting the elements in given suffix array from the start in the oposite of given directon
function shift(SA,start,direction){
	for(var i=start,p=SA[i],a;a=SA[i-=direction],SA[i]=p,a>=0;)p=a;
	SA[start]=EMPTY;
	return i
}
//Used for storing of the suffix.Type of suffix is determined by direction
function store_suffix(SA,n,i,c,d){
	let s=SA[c];
	if(s===EMPTY){d+=c;
		if(d<n&&d>0&&SA[d]===EMPTY)SA[d]=i,SA[c]=-1;
		else SA[c]=i;
		return-1
	}
	if(s>=0){
		s=shift(SA,c,d);d+=c;
		if(d<n&&d>0&&SA[d]===EMPTY)SA[d]=i,SA[c]=-1;
		else SA[c]=i;
		return s
	}
	if(SA[s=c-s*d+d]===EMPTY){
		SA[s]=i;--SA[c];
		return-1
	}
	n=shift(SA,s-=d,d);SA[s]=i;
	return n
}
//Used for inducing the sort of the S type suffixes
function induced_sort_S_1(T,SA,n){
	for(let i=n,s;i;)
		if((s=SA[--i])>0){
			let j=SA[i]-1,c=T[j],d=T[s];
			if(c<d||c===d&&c>i)s=store_suffix(SA,n,j,c,-1),s^-1&&s>i&&i++
		}
}
//Used for inducing the sort of the L type suffixes
function induced_sort_L_1(T,SA,n){
	for(var i=0,j,s;i<n;++i)
		if((s=SA[i])>0){
			let c=T[j=s-1],d=T[s];
			if(c>=d){
				c=store_suffix(SA,n,j,c,1);
				if((s>n||(d<(s=T[s+1])||d===s&&d>=i))&&i)SA[i]=EMPTY;
				c^-1&&c<i&&i--
			}
		}
	for(i=0;++i<n;)
		if((s=SA[i])<0&&s^EMPTY){
			for(j=i,s=i-s;j<s;)SA[j]=SA[++j];
			SA[s]=EMPTY
		}
}
//lms naming
function reduce_string(SA,T,n,lms_count){
	let a=n-(n>>1),b=n-1,i=a,l,m=1,p,r=0,s,u=1;
	for(;i<n;)SA[i++]=EMPTY;
	for(SA[SA[b]=0]=i=1;i<lms_count;b=s){
		for(l=2,p=s=SA[i++];T[++p]>0;)l++;
		p=a+(s-(n&1)>>1);
		if(m===l)for(;m--&&T[m+b]===T[m+s];);
		if(m<0)SA[SA[p]=r]++;
		else SA[SA[p]=r+=SA[r]]=1,u++;
		m=l
	}
	for(l=r=0,i=n-1;i>a;)SA[--i]^EMPTY?SA[i+r]=SA[i]:r++;
	for(p=SA[i=n-2],n-=lms_count;i>n;p=s){
		s=SA[--i];
		if(l=s<p||s===p&&l)SA[i]+=SA[SA[i]]-1
	}return u
}
function count_and_set_lms(T,n){
	for(var i=n-2,c=1,u,t;i>0;u=t){
		t=T[i]>T[--i]||T[i]===T[i+1]&&u;
		if(!t&&u)T[i+1]=-T[i+1],c++
	}return c
}
function unset_lms(T,n){
	for(let i=n-2;i>0;)if(T[--i]<0)T[i]=-T[i]
}
function compact_lms(SA,T,n){
	SA[0]=n-1;
	for(let i=0,j=1,s;++i<n;)if(T[s=SA[i]]<0)SA[j++]=s
}
function induce_lms(SA,U,lms_count,T,n){
	let i=n-2,l=lms_count-2,u,c=T[i];
	for(;i>0;){
		let d=T[--i],t=d<c||d===c&&u;
		if(!t&&u)U[l--]=i+1;
		u=t;c=d
	}
	for(i=1;i<lms_count;)SA[i]=U[SA[i++]]
}
//saca-k
function saca_k(T,SA,K,n){
	let B=new Int32Array(K);
	calculate_bucket_end(T,B,K,n);
	//Used for inducing the sort of the LMS substrings
	SA.fill(EMPTY,0,n);SA[0]=n-1;
	for(let i=n-2,c=T[i],d,u,t;i>0;c=d){
		d=T[--i],t=d<c||d===c&&u;
		if(!t&&u)SA[B[c]--]=i+1;
		u=t
	}
	calculate_bucket_start(T,B,K,n);
	induced_sort_L_0(T,SA,B,n);
	calculate_bucket_end(T,B,K,n);
	induced_sort_S_0(T,SA,B,n);
	let c=count_and_set_lms(T,n);
	compact_lms(SA,T,n);
	let r=reduce_string(SA,T,n,c);
	unset_lms(T,n);
	let U=SA.subarray(n-c);
	if(c>r)saca_k_1(U,SA,c);
	else for(;r;)SA[U[--r]]=r;
	induce_lms(SA,U,c,T,n);
	calculate_bucket_end(T,B,K,n);
	//Used for putting the sorted LMS-substrings in ther correct bucket in suffix array
	SA.fill(EMPTY,c,n);SA[0]=n-1;
	for(let i=c,j,b;i>1;SA[B[b]--]=j)b=T[j=SA[--i]],SA[i]=EMPTY;

	calculate_bucket_start(T,B,K,n);
	induced_sort_L_0(T,SA,B,n);
	calculate_bucket_end(T,B,K,n);
	induced_sort_S_0(T,SA,B,n)
}
//Used for deeper recursion calls in generating suffix array.
function saca_k_1(T,SA,n){
	//Used for inducing the sort of the LMS substrings
	SA.fill(EMPTY,0,n);
	let i=n-2,c=T[i],d,s,t;
	for(SA[0]=n-1;i>0;c=d)
		d=T[--i],t=d<c||d===c&&s,
		!t&&s&&store_suffix(SA,n,i+1,c,-1),s=t
	for(i=0;++i<n;)
		if((s=SA[i])<0&&s^EMPTY){
			for(s+=c=i;c>s;)SA[c]=SA[--c];
			SA[c]=EMPTY
		}

	induced_sort_L_1(T,SA,n);
	induced_sort_S_1(T,SA,n);
	c=count_and_set_lms(T,n);
	compact_lms(SA,T,n);
	d=reduce_string(SA,T,n,c);
	unset_lms(T,n);
	t=SA.subarray(n-c);
	if(c>d)saca_k_1(t,SA,c);
	else for(;d;)SA[t[--d]]=d;
	induce_lms(SA,t,c,T,n);
	//Used for putting the sorted LMS-substrings in ther correct bucket in suffix array
	SA.fill(EMPTY,i=c,n);SA[0]=n-1;
	for(d=t=0;i>1;SA[c^d?t=d=c:--t]=s)
		c=T[s=SA[--i]],SA[i]=EMPTY;

	induced_sort_L_1(T,SA,n);
	induced_sort_S_1(T,SA,n)
}
function saca_k1_127(A){
	for(var z=A.length,a=z++,T=new Int8Array(z),S=new Int32Array(z);a;)T[--a]=A[a];
	saca_k(T,S,128,z);
	return S.subarray(1)
}
function saca_k0_255(A){
	for(var z=A.length,a=z++,T=new Int16Array(z),S=new Int32Array(z);a;)T[--a]=A[a]+1;
	saca_k(T,S,257,z);
	return S.subarray(1)
}
saca_k.BWTe=function(T,U){
	var a=U instanceof Array||U&&U.buffer,i=0,n=T.length,p,SA=saca_k0_255(T),B=a?U:[];
	for(B[0]=T[n-1];p=SA[i++];)B[i]=T[--p];
	for(p=i;i<n;)B[i]=T[--SA[i++]];
	if(a)return p;--p;
	for(a=n<257?1:n<65537?2:3;a--;p>>=8)B[i++]=p&255;
	return B
};
saca_k.BWTd=function(A,p,k){
	k=k>>>0||255;
	var a=0,b=++k,c,B=A.length,s=0,C=[],P=[];
	if(!(p>-1))p=B<258?A[--B]:B<65539?A[--B]<<8|A[--B]:A[--B]<<16|A[--B]<<8|A[--B],A.length=B,p++;
	while(b)C[--b]=0;
	while(b<B)P[b]=C[A[b++]]++;
	for(B=Array(b);s<b;s+=c)c=C[a],C[a++]=s;
	for(c=0;b;c<p&&c++)c=P[c]+C[B[--b]=A[c]];
	return B
}

とりあえず接尾辞配列が欲しければsaca_k0_255の第一引数に文字列を数値配列化したArrayもしくはTypedArrayを与えておけば良い。

test
let T=Array.from("That that is is that that is not is not is that it it is",a=>a.charCodeAt()),
	bwt=[],
	sa=saca_k0_255(T),
	idx=saca_k.BWTe(T,bwt),
	unbwt=saca_k.BWTd(bwt,idx);
document.write("SA ",sa,"<br>idx",idx,"<br>BWT ",bwt,"<br>unBWT ",unbwt)

参考文献

ge-nong

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?