BWT系圧縮でかつて猛威を振るわなかったGRZipIIというprogramの紹介。と言うか模倣しようかなという企画第1弾。
原理
BlockSort -> MTF -> RLE -> Range符号 という流れで圧縮。MTFにより小さい値だらけになったら、まず記号を2以下か3以上かで分けます。2以下ならそのまま符号化。3以上なら3を符号化した後、その記号を0~126に変換してGamma符号化。記号の後は連長をGamma符号化。本家はまだLZP等で加工していますが省略。
実装
function compare(R,a,b,i,n,r){
for(n-=a<b?b:a;i<n&&!(r=R[a+i]-R[b+i]);)i+=2;return r||b-a
}
function subsort(R,A,l,h,d,n){
if(10<h-l)
for(var i=l+2,j,m=i,t;i<h;A[j]=t)
for(t=A[j=++i];m<j&&compare(R,t,A[j-3],d,n)<0;)A[j]=A[j-=3];
for(i=l;i<h;A[j]=t)
for(t=A[j=++i];l<j&&compare(R,t,A[j-1],d,n)<0;)A[j--]=A[j]
}
function median(R,A,a,b,c,d,z){
var r=A[a]+d,s=A[b]+d;
r=r<z?R[r]:-1,s=s<z?R[s]:-1,d+=A[c],d=d<z?R[d]:-1;
return r<s?s<d?b:r<d?c:a:r<d?a:s<d?c:b
}
function getPivot(R,A,r,l,h,d,z){
if(r<65)var m=l+(r>>1),p;
else r>>=3,
l=median(R,A,l,p=l+r,p+=r,d,z),
m=median(R,A,p+=r,p+=r,p+=r,d,z),
h=median(R,A,p+=r,p+=r,h,d,z);
p=A[m=median(R,A,l,m,h,d,z)];A[m]=A[l];A[l]=p;
return R[p+d]
}
function mqsort(R,I,l,h,d,n,L,H,D){
for(var a,b,i,j,o,p,r,s=0,t;;){
if(h-l<20){
for(l<h&&subsort(R,I,l,h,d,n);l<=h;)R[I[l]]=l++;
if(!s)break;
l=L[--s];h=H[s];d=D[s];
continue}
p=getPivot(R,I,h-l+1,l,h,d,n);
for(i=a=l,j=b=h;;I[j--]=t){
for(;i<=j&&(r=((t=I[i]+d)<n?R[t]:-1))<=p;i++)
r^p||(I[i]=I[a],I[a++]=t-d);
for(;i<=j&&(r=((t=I[j]+d)<n?R[t]:-1))>=p;j--)
r^p||(I[j]=I[b],I[b--]=t-d);
if(i>j)break;
t=I[i],I[i++]=I[j]}
if((p=a-l)<(r=i-a))r=p;o=j;
for(p=l;r--;I[o--]=t)t=I[p],I[p++]=I[o];
if((p=h-b)<(r=b-j))r=p;o=h;
for(p=i;r--;I[o--]=t)t=I[p],I[p++]=I[o];
i+=l-a;j-=~h+b;
if(j<=h)L[s]=j,H[s]=h,D[s++]=d;
if(i<j)L[s]=i,H[s]=j-1,D[s++]=d+2;
h=i-1}
}
//sort type B*
function sortTypeBstr(B,I,C,R,n){
for(var A=C.slice(),p=1,r,i=n-1,j=0|Math.sqrt(n*2)+1,L=new Int32Array(j*3+3),H=L.subarray(j+1),D=H.subarray(j+1);i--;)
if(r=B[i]-B[i+1])
r<0&&p>0&&(I[A[B[i]<<8|B[i+1]]++]=i),p=r;
for(;i<255;)
for(j=++i;j<255;p-r>1&&mqsort(R,I,r,--p,2,n,L,H,D))
p=A[r=i<<8|++j],r=C[r]
}
//set type A,B
function setTypeAB(B,I,C,n){
var i=255,j,p,A=new Uint32Array(256),E=A.slice();
for(A[i]=n;j=i;A[i]=E[i]){
for(;j;)E[--j]=C[j<<8|i];
for(j=C[i--<<8];j>E[i];)
if((p=I[--j])>0&&B[p]>=B[--p])
I[--E[B[p]]]=p}
for(I[C[B[n-1]<<8]++]=n-1;j<n;)
if((i=I[j++])>0&&B[i]<=B[--i]&&C[p=B[i]<<8|B[i+1]]<A[B[i]])
I[C[p]++]=i
}
/* suffix sort
@T: input (Uint8Array / Array)
@A: suffix array (Int32Array / Array / null)
@n: input size (Number / null)
*/
function I2SRsort(T,A,n){n=n||T.length;
var I=A||new Int32Array(n);
if(n<2)return n?I[0]=0:0,I;
if(n<3)return I[n=0|T[0]<T[1]]=1,I[1-n]=0,I;
if(n<512&&!A)return I.map((a,b)=>b).sort((a,b,c,d)=>{for(c=a<b?n-b:n-a;c--;)if(d=T[a++]-T[b++])return d;return b-a});
for(var i=n,c,C=new Uint32Array(65537),R=I.slice();i;)C[c|(c=T[--i])<<8]++;
for(n=0;i<65537;n+=c)c=C[i],C[i++]=n;
for(i=n,c=0;i;)R[--i]=C[(c|(c=T[i])<<8)+1]-1;
sortTypeBstr(T,I,C,R,n);setTypeAB(T,I,C,n);
return I
}
/* BWT
@T: input (Uint8Array / Array)
@U: output (Uint8Array / Array / null)
@n: input size (Number / null)
*/
function BWTe(T,U,n){n=n||T.length;
var i=0,a=n<257?1:n<65537?2:n<16777217?3:4,p,SA=I2SRsort(T,0,n),B=U||new Uint8Array(n+a);
for(B[0]=T[n-1];p=SA[i++];)B[i]=T[--p];
for(p=i;i<n;)B[i]=T[SA[i++]-1];
if(U)return p;--p;
for(;a--;p>>=8)B[i++]=p&255;
return B
}
//setImmediateもどき
(function(a){
var F=[],hit="\0";
a.wait0=function(f){
F[F.length]=f;a.postMessage(hit,"*")
};
a.onmessage=function(e){
if(e.source===a&&e.data===hit)
e.stopPropagation(),F[0]&&F.shift()()
}
})(this);
const R2N=new Uint8Array(256),R2M=new Uint8Array(256),R2P=new Uint8Array(257);
// initialize
!function(N,M,P,a,b,c){for(P[4]=1;b<255;N[++b]=c-2)for(c=0;b>>++c;);for(;--b>1;M[b+1]=1<<c-2)for(c=0;b>>++c;);for(;b<7;)for(c=256>>b++;c;)P[a--]=--c}(R2N,R2M,R2P,256,4);
function update03(M,V,U,r){
M[r]+=2,U[r]+=2,V[r]+=2;
if((M[4]+=2)>58)M[4]=(M[0]-=M[0]>>1)+(M[1]-=M[1]>>1)+(M[2]-=M[2]>>1)+(M[3]-=M[3]>>1);
if((V[4]+=2)>62)V[4]=(V[0]>>=1)+(V[1]>>=1)+(V[2]>>=1)+(V[3]>>=1);
if((U[4]+=2)>204)U[4]=(U[0]>>=1)+(U[1]>>=1)+(U[2]>>=1)+(U[3]>>=1)
}
/* compressor
@A: input(Uint8Array)
@bs: block size(32768 - 0x7fffffff)
@done: call back of last process.
done(A,a,z)
@A: compressed array of input.
@a: input size
@z: compressed size
@rate: call back of progress
rate(a,z)
@a: current position
@z: last position
*/
function GRZ2MTFenc(A,bs,done=a=>a,rate){
function encode(t,f,c){
L+=c*(R=R/t>>>0);c=L>0xffffffff;
for(R*=f;R<16777216;R*=256,L=L<<8>>>0)
if(255>L>>>24||c){
for(O[o++]=255&c--+B,B=L>>>24;N;N--)O[o++]=255&c;c=0
}else++N
}
A instanceof Uint8Array||(A=new Uint8Array(A));
var E="function"!=typeof rate?0:4096,bp,cb=6,a=1024,o=256,x,y=1<<15,z=A.length,M=new Uint8Array(o),
R1=new Uint16Array(8192).fill(1024),R0=[],R2=[],S=new Uint8Array([1,1,1,1,4]),S0=[],S1=[],S2=R1.slice(-6),S3=[],S4=[],
L=0,R=-1>>>0,N=0,B=0,O=[],
c=0,c0,c1=0,cr,max=2048,wait=wait0;
if(!E)rate=a=>a,wait=a=>a();
bs&=-1>>>1;if(bs<y)bs=y;if(bs>z)bs=z;y=0;
// init models
for(;o--;)S0[M[o]=o]=new Uint8Array(5);
for(;a;)S1[--a]=new Uint8Array(5);
for(;a<8;S3[a++]=R1.slice(-6))S4[a]=R1.slice(-128);
for(a=64;a;)R0[--a]=R1.slice(-32);
for(a=24;a;)R2[--a]=R1.slice(-32);
// EndOfMark
function EOM(){
var V=S0[c],U=S1[c0<<2|cr&3],i=S[3]+U[3]+V[3],j=S[4]+U[4]+V[4],n;
encode(j,i,j-i),update03(S,V,U,3),
V=S2,U=S3[c1];
for(i=j=0;i<cb;)
encode(max,max-(n=V[i]+U[i]>>1),n),
V[i]-=V[i]>>4,U[i]-=U[i++]>>6;
for(V=S4[i];j<1<<cb;V[j]-=V[j]>>7)
encode(max,max-V[j+=++j],V[j])
}
function block(){
var b,e=E,i,j,l,n,m,p,r,d=Date.now(),U,V;
for(rate(a,z);a<y;){
if(!--e&&Date.now(e=E)-d>200)return wait(block);
V=S0[c],U=S1[c0<<2|cr&3],c=A[a];
for(l=1;A[++a]===c&&a<y;)l++;
for(r=0;M[r]^c;)r++;
if(i=r--){for(;i;)M[i]=M[--i];M[0]=c}
// encode rank
if((r&=255)<3){
for(n=0;i<r;)n+=U[i]+V[i]+S[i++];
encode(S[4]+U[4]+V[4],S[r]+U[r]+V[r],n),update03(S,V,U,r)}
else{
encode(j=S[4]+U[4]+V[4],i=S[3]+U[3]+V[3],j-i),update03(S,V,U,3),
V=S2,U=S3[c1],
m=R2M[r],n=c1=R2N[r],p=R2P[r];
for(i=0;i<n;)
encode(max,max-(j=V[i]+U[i]>>1),j),
V[i]-=V[i]>>4,U[i]-=U[i++]>>6;
if(n<cb)
encode(max,V[i]+U[i]>>1,0),
V[i]+=max-V[i]>>4,U[i]+=max-U[i]>>6;
V=S4[n];
for(j=1;m;m>>=1)
if(p&m)
encode(max,max-V[j],V[j]),
V[j]-=V[j]>>7,j+=j+1;
else
encode(max,V[j],0),
V[j]+=max-V[j]>>7,j+=j
}
if(r>3)r=3;
c0=c0<<2&255|r;
if(l<2)b=0;
else for(b=m=1;l>>b>1;b++)m<<=1;
// encode run length
V=R0[cr|r<<4],U=R1,j=c*32;
for(i=0;i<b;)
encode(max,max-(n=V[i]+U[j]>>1),n),
V[i]-=V[i++]>>6,U[j]-=U[j++]>>3;
encode(max,V[i]+U[j]>>1,0),
V[i]+=max-V[i]>>6,U[j]+=max-U[j]>>3,
V=R2[b],cr=cr<<1&15|b>1;
for(i=0;m;m>>=1)
if(l&m)encode(max,max-V[i],V[i]),
V[i]-=V[i++]>>6;
else encode(max,V[i],0),
V[i]+=max-V[i++]>>6
}
EOM();
// write pidx
for(i=c=c0=c1=cr=0;x>>++i;);
if(x>1)for(;i;)encode(max,1024,(bp>>--i&1)<<10);
return wait(head)
}
function head(){
if(a>=z){EOM();
for(L+=R>>>1,c=L>0xffffffff,x=6;--x;L=L<<8>>>0)
if(255>L>>>24||c){
for(O[o++]=255&c--+B,B=L>>>24;N;N--)O[o++]=255&c;c=0
}else++N;
done(O,z,o);
return O
}
var U=A.subarray(a,a+bs);y+=x=U.length;
bp=BWTe(U.slice(),U);
return wait(block)
}return head()
}
/* decompressor
@A: input (Uint8Array)
@done: call back of last process.
done(A,a,z)
@A: decompressed array of input.
@a: input size
@z: decompressed size
@rate: call back of progress.
rate(a,z)
@a: current position
@z: last position
*/
function GRZ2MTFdec(A,done=a=>a,rate){
function decode(f,c){
for(L-=R*c,R*=f;R<16777216;R*=256)L=(L<<8|A[a++])>>>0
}
function block(){
var b,e=E,f,i,l,n,r,U,V,d=Date.now();
for(rate(a,z);;){
if(!--e&&Date.now(e=E)-d>200)return wait(block);
V=S0[c],U=S1[t<<2|s&3],
b=L/(R=R/(S[4]+U[4]+V[4])>>>0)>>>0;
for(n=r=0;b>=n;)n+=S[r]+U[r]+V[r++];
decode(b=S[--r]+U[r]+V[r],n-b),
S[r]+=2,U[r]+=2,V[r]+=2;
if((S[4]+=2)>58)S[4]=(S[0]-=S[0]>>1)+(S[1]-=S[1]>>1)+(S[2]-=S[2]>>1)+(S[3]-=S[3]>>1);
if((V[4]+=2)>62)V[4]=(V[0]>>=1)+(V[1]>>=1)+(V[2]>>=1)+(V[3]>>=1);
if((U[4]+=2)>204)U[4]=(U[0]>>=1)+(U[1]>>=1)+(U[2]>>=1)+(U[3]>>=1);
if(r>2){// additional rank decoding
V=S2,U=S3[u];
for(i=r=0;i<cb;){
if(L/(R>>>=11)>>>0<(c=V[i]+U[i]>>1)){
decode(c,0);
V[i]+=2048-V[i]>>4;
U[i]+=2048-U[i]>>6;
break}
decode(2048-c,c);
V[i]-=V[i]>>4;
U[i]-=U[i++]>>6
}
V=S4[u=i],i=2<<i;
for(f=1;f<i;)
if(L/(R>>>=11)>>>0<(c=V[f]))
decode(c,0),
V[f]+=2048-c>>7,
f+=f,r+=r;
else
decode(2048-c,c),
V[f]-=c>>7,
f+=f+1,r+=r+1;
if(u===cb&&r>cs)break;
r+=2<<u|1
}
// move to front
for(n=r,c=M[i=++r&255];i;)M[i]=M[--i];M[i]=c;
if(n>3)n=3;
// decode run length
t=t<<2&255|n,V=R0[s|n<<4],U=R1;
for(f=c*32;L/(R>>>=11)>>>0>=(n=V[i]+U[f]>>1);U[f]-=U[f++]>>3)
decode(2048-n,n),V[i]-=V[i++]>>6;
decode(n,0),
V[i]+=2048-V[i]>>6,
U[f]+=2048-U[f]>>3;
V=R2[i],s=s<<1&15|i>1;
for(f=l=0;f<i;)
if(L/(R>>>=11)>>>0<(n=V[f]))
decode(n,0),
V[f++]+=2048-n>>6,l+=l;
else decode(2048-n,n),
V[f++]-=n>>6,l+=l+1;
for(l+=1<<f;l--;)P[p]=F[T[p++]=c]++
}
if(!p)return done(O,z,o);
for(i=r=f=0;r<p;r+=c)c=F[f],F[f++]=r;
for(c=0;p>>++c;);
if(p>1)for(;c--;)
if(L/(R>>>=11)>>>0<1024)decode(1024,0);
else decode(1024,1024),i|=1<<c;
for(c=0;p;c<i&&c++)c=P[c]+F[O[o+--p]=T[c]];
o+=r;F.fill(s=t=u=c=0);
return wait(block)
}
var E="function"!=typeof rate?0:4096,cb=6,cs=(2<<cb)-2,a=256,o=1024,p=0,z=A.length,M=[],P=[],O=[],T=[],F=new Uint32Array(a),
R1=new Uint16Array(8192).fill(1024),R0=[],R2=[],S=new Uint8Array([1,1,1,1,4]),S0=[],S1=[],S2=R1.slice(-6),S3=[],S4=[],
L,R=-1>>>0,c=0,s,t,u=0,wait=wait0;
if(!E)rate=a=>a,wait=a=>a();
// init models
for(;o;)S1[--o]=new Uint8Array(5);
for(;a;)S0[M[--a]=a]=new Uint8Array(5);
for(;a<8;S3[a++]=R1.slice(-6))S4[a]=R1.slice(-128);
for(a=64;a;)R0[--a]=R1.slice(-32);
for(a=24;a;)R2[--a]=R1.slice(-32);
for(;a<4;)L=(L<<8|A[a++])>>>0;
return block()
}
使用例
let A=Array.from("That that is is that that is not is not is that it it is",a=>a.charCodeAt());
let c=GRZ2MTFe(A,A.length),d=GRZ2MTFd(c)
console.log(c.length, String.fromCharCode(...d))
file爆縮実験
See the Pen GRZipII(mtf) compressor by xezz (@xezz) on CodePen.