0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

minimally-sorted BWT variant

Posted at

新しい可逆なBWT風変換を紹介します。BWTと同様に、この変換は全ての最大context(接尾辞木node)をgroup化するため、[1]の意味での接尾辞木によって実現されます。BWTとは異なり、context間の順序付けに辞書式順序は使用しません。全順序は2つの制約から導かれます。

  1. 全ての接尾辞は、末尾のcontextによって最も長い一致の少なくとも1つの隣にあります
  2. 前の接尾辞は後の接尾辞より前に来る

変換の出力はBWTと同様に、変換後の文字列、逆変換用添字の組です。添字は、変換後の文字列の最後の記号の位置を示します。入力文字列の最初の記号はnull contextを持ち、null contextが先頭に配置されるため、入力と変換の最初の記号は常に同じになり、復号は先頭から開始されます。最初の記号は2番目のcontext(null contextの後)を決定するため、入力と変換の2番目の記号も常に同じになります。
変換処理は現状では入力に対して16倍以上の空間を必要とします。圧縮に利用する場合、元祖BWTに比べて圧縮率は1%前後良かったり悪かったりするっぽい。

実装編

例によってJavaScript

/*
Build an enhanced suffix array of a given string in linear time.
For an input text T, it builds an enhancd suffix array in linear time. i-th internal node is represented as a triple (L[i], R[i], D[i]);
L[i] and R[i] is the left/right boundary of the suffix array as SA[L[i]....R[i]-1]
D[i] is the depth of the internal node

@T[0...n-1]  input string.
@SA[0...n-1] input suffix array
@L[0...n-1]  output left boundary of internal
@R[0...n]  output right boundary of internal(R[n] is sentinel)
@D[0...n-1]  output depth of internal
@n The length of the input
*/
function suffixtree(T,SA,L,R,D,n){
	for(var h=0,i=n,p,r=0,s,t,S=[[-1,-1]];i--;)L[SA[i]]=i;
	for(R[0]=R[n]=i;++i<n;)
		if(s=L[i]){for(p=SA[s-1];T[i+h]===T[p+h]&&i+h<n&&p+h<n;)h++;R[s]=h&&h--}
	for(s=S[i=p=0];;s=S[++p]=[i,n-SA[i++]+1]){
		for(h=[i,t=R[i]];s[1]>t;s=S[--p]){
			if(i-s[0]>1)L[r]=s[0],R[r]=i,D[r++]=s[1];
			h[0]=s[0]}
		if(s[1]<t)S[++p]=h;
		if(i===n)break}
	return r
}
// Associate each suffix with a node, and connect nodes to parents. Nodes are in post-order
function findNodesAndParents(SA,L,R,n,D,P,W){
	for(var i=0,p=-1,s,t;i<n;W[i]=R[i]-L[p=i++]){
		for(s=R[i]-1;p>-1&&L[i]<=L[p]&&R[i]>=R[p];p=t){
			for(;s>=R[p];)D[SA[s--]]=i;
			s=L[p]-1;t=P[p];P[p]=i}
		for(;s>=L[i];)D[SA[s--]]=i;
		P[i]=p}
	P[n-1]=-1
}
function computeTransform(T,i,D,N,P,W){
	for(var a,o,p,s;i--;T[o]=i){
		for(a=p=D[i];a>-1&&!N[a];)a=P[a];
		for(s=1,o=N[a]||0;p^a;p=P[p])N[p]=o+s,s=W[p];
		if(a>-1)N[a]+=s}
}
/* minimally-sorted BWT variant
@T{TypedArray |Array} input
@U{TypedArray |Array |null} output
@A{TypedArray |Array |null} suffix array
@l{Number |null} T.length
@k{Number |null} alphabet size
@sorter{function |null} suffix sorter
	sorter(t,s,l,k)
	@t{TypedArray |Array} input
	@s{TypedArray |Array |null} output
	@l{Number |null} t.length
	@k{Number |null} alphabet size
@return: first position for decoding
*/
function minSorting(T,U,A,l,k,sorter){
	l=l||T.length;k=k>>>0||256;T.reverse();
	var a=l<256?1:l<65536?2:l>>24?4:3,SA=A||new Int32Array(l),D=new Int32Array(l),L=new Int32Array(U?l:l+a),R=new Int32Array(l+1),n;
	if(sorter)sorter(T,SA,l,k);
	else{for(n=l;n;)SA[--n]=n;
		SA.sort((a,b,c,d)=>{
			for(c=a<b?l-b:l-a;c--;)if(d=T[a++]-T[b++])return d;return b-a
		})
	}
	if(n=suffixtree(T,SA,L,R,D,l)){
		var P=new Int32Array(n*3),N=P.subarray(n),W=N.subarray(n);
		findNodesAndParents(SA,L,R,n,D,P,W);R=null;
		computeTransform(SA,l,D,N,P,W);D=N=P=W=null;
		for(L=U||L,L[k=n=0]=T[l-1];k<l;)if(P=SA[k++])L[++n]=T[--P],P||(D=n)}
	else L=U||L,L[0]=T[D=0];
	L.p=D;return U?D:L;
}
function bsearch(C,I,p,c){
	for(var l,n=C[c],m=c=C[c+1];m>n;I[l]<p?n=++l:m=--l)
		if(I[l=n+m>>1]===p)return l;
	return n<c&&I[n]<p?n+1:n
}
/* decoder
@A input
@x primary index
@k alphabet size
*/
function minSorted(In,x,k){
	k=k>>>0||255;
	var z=In.length,c=z,d,e=In[x>>>=0],f=0,h,i=0,m,n=0,o=0,p,s,t=1,nn,nc,C=new Uint32Array(++k),I=[],O=[],T=[],P=[-1],S=[0],E=[],M=[],N=[1];
	for(;c;)++C[In[--c]];
	for(;c<z;c+=s)s=C[n],T[n]=C[n++]=c;
	for(C[n]=c;f<z;)I[T[In[f]]++]=f++;
	for(E[T.length=0]=z;o<z;m>0?N[P[m]=d]+=nc:N[d]++){
		O[o++]=c=In[i];h=c*0x100000000;
		for(n=M[i]|=d=0;n>-1;n=P[n])
			if(T[m=n+h]){d=T[m];break}
		s=n;p=N[d];m=nc=0;
		for(n=M[i];n^s;n=P[n]){
			i=S[n];k=E[n];
			if(k-i<32)for(f=0;i<k;)In[i++]^c||f++;
			else f=bsearch(C,I,k,c)-bsearch(C,I,i,c);
			c^e||S[n]>x||E[n]<=x||f--;
			if(!m&&f>1||m>0&&f-nc>0){
				for(i=nc;i<f;)M[p+i++]=t;
				S[t]=p;E[t]=p+f;N[T[n+h]=nn=t++]=m>0?p+nc:p+1;
				if(m>0)P[m]=nn;
				m=nn;nc=f
			}
		}i=N[d]
	}return O
}
sample
let T=Uint8Array.from("That that is is that that is not is not is that it it is",a=>a.charCodeAt()),
	l=T.length, U=new Uint8Array(l),
	p=minSorting(T,U),//encode
	t=minSorted(U,p);//decode
document.write(String.fromCharCode(...U),"<hr>",String.fromCharCode(...t))

参考文献

某掲示板
論文

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?