圧縮の時間です。byte単位の文脈混合法に二次記号推定(Secondary Symbol Estimation)を組み合わせていきます。
原理
PPMに似ています。文脈木を構築し、設定した次数までの文脈に現れる各記号の頻度を足し合わせます(条件によってその倍率を変えます)。
そして最高頻度の記号が符号化したい記号と一致するかどうか、1 bit算術符号化。不一致なら記号頻度と合計頻度を調整して、追加の算術符号化。いわゆるSSE。
計算量が莫大なので非常に低速。はっきり言って使い物にならない程度に便利な役立たずです。
実装編
// for context tree
function Leaf(s,a){this.s=s,this.n=a<<1|1}
Leaf.prototype={c:1,e:0,o:0};
//model table
const CNUM=256,SCALE=1<<22,MX=8,MY=10,MET=999999,
ETT=new Int32Array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,550,0,MET,MET,MET,MET,MET,MET,110,0,MET,MET,MET,MET,MET,MET,300,150,MET,MET,MET,MET,MET,MET,330,130,0,MET,MET,MET,MET,MET,700,180,0,MET,MET,MET,MET,MET,750,190,0,MET,MET,MET,MET,MET,2600,160,0,MET,MET,MET,MET,MET]),
EWW=new Float32Array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,.15,1,0,0,0,0,0,3,.44,3.8,7,0,0,0,0,7,1.75,99,7,7,0,0,0,14,3.5,.7,99,7,15,0,0,16,4,.8,99,99,7,99,0,25,4.5,.75,99,99,7,99,99,99,4.5,.75,99,99,99,99,99]),
EWW1=new Float32Array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,6,0,0,0,0,0,3,1,3,6,0,0,0,0,7,2,2,4,6,0,0,0,14,4,2,4,4,8,0,0,16,6,2,4,4,4,8,0,25,6,2,4,4,4,4,12,35,6,2,4,4,4,4,12]),
OWW=new Float32Array([0,0,0,0,0,0,0,0,.28,0,0,0,0,0,0,0,.28,.28,0,0,0,0,0,0,1,.2,.25,0,0,0,0,0,1,.2,.25,1/3.5,0,0,0,0,0,0,.15,.44,.44,0,0,0,0,0,0,.15,.3,.5,0,0,0,0,0,0,.15,.5,.15,0,0,0,0,0,.15,.5,.15,.15,.15,.15,.15,.15,.15,.15,.15,.15]),
OWW1=new Float32Array([0,0,0,0,0,0,0,0,.28,0,0,0,0,0,0,0,.28,.28,0,0,0,0,0,0,1,.2,.25,0,0,0,0,0,1,.2,.25,1/3.5,0,0,0,0,.4,.4,.4,.4,.45,0,0,0,.5,.5,.5,.5,.5,.4,0,0,.5,.5,.5,.5,.5,.3,.2,0,.5,.5,.5,.5,.5,.5,.2,.3,.5,.5,.5,.5,999,.5,.3,.25]);
/* compressor
@A :input(Array / Uint8Array)
@mo :context order(0-1023)
@ml :limit of total nodes=4e9/ml(@ml:0-4095)
@done: call back of last process.
done(A,a,z)
@A: (de)compressed array of input
@a: last position
@z: (de)compressed size
@rate: call back of progress
rate(a,z)
@a: current position
@z: last position
@return:
call with await: compressed array of @A
without await: Promise object
*/
async function PPMYenc(A,mo,ml,done,rate){
function enc(c,f,t,n){
L+=n=c*R/t+1>>>0;
for(R=R*(c+f)/t-n>>>0;R<16777216;R*=256,L=L<<8>>>0)
if((f=L>0xffffffff)||255>L>>>24)
for(O[o++]=255&f--+B,B=L>>>24;C;C--)O[o++]=255&f;
else++C
}
mo&=1023;ml&=4095;
var b,c=A.length,e,f,i=128,j,p,t,u,w,z=c,
L=0,R=-1>>>0,B,C=0,o=2,O=[mo&255,mo>>8|ml<<2&255,ml>>6], // coder
F=new Float64Array(CNUM),sseLastHit=0,SSE=[],
root={c:1,e:0,n:0,o:0},x=new Leaf(0,0),Cx=[root], // context tree
m=1,n,OMOpt=1,
OMTotal=CNUM,OMProb,ix=rate?4095:-1,fn=b=>wait0(c=>b(rate(i,z))),st=+new Date;
for(mo+=2;O[++o]=B=c&255,c>>>=8;)O[2]+=64; // write size
for(ml=4e9/ml;i;)for(p=SSE[--i]=[],j=8;j;)p[--j]=64|i<<7;
for(;i<z;i&ix||Date.now()-st<200||await new Promise(fn,st=+new Date)){
if(m>mo)m=mo;c=A[i++];
//mix in order-(-1)
OMProb=.25/OMTotal*OMOpt/i;
for(j=CNUM;j;)F[--j]=OMProb;
//mix context probabilities
for(n=0;n<m;){
p=Cx[n];b=p.n;
if(!b.c&&b&1)b=p.n=new Leaf(A[b>>=1],b+1),p.e++,ml--;
t=p.c-1;u=1/t;
e=p.e,j=Math.min(m,MY-1)*MX+Math.min(n++,MX-1);
f=e^1?OWW[j]:OWW1[j];
if(m<3)e=1;
else{
w=e^1?EWW[j]:EWW1[j];j=ETT[j];
e=1+w*(t<j?t-e:j-e)*u;
if(e<0)e=0;e/=1+w
}
for(e*=(f+p.o)/(f+t)*u;b;b=b.p)F[b.s]+=b.c*e
}
//count inverse total for renormalization
for(j=CNUM,t=u=w=0;j--;)u+=F[j];
var maxF=j,maxS;
for(u=u>0?SCALE/u:0;++j<CNUM;t+=f){
f=F[j]*u+1|0;
j^c||(e=f,w=t);
if(f>maxF)maxF=f,maxS=j;
}
p=SSE[4096*maxF/t>>5];
f=p[j=n>1?(Cx[j=n-1].n.s===maxS)<<1|(Cx[j].o*2+2>Cx[j].c)<<2|sseLastHit:0];
if(c==maxS)
enc(0,f,0x4000),
p[j]+=0x3fff-f>>8,sseLastHit=1;
else{
enc(f,0x4000-f,0x4000);
p[j]-=f>>8,sseLastHit=0;
t-=maxF;
if(c>maxS)w-=maxF;
enc(w,e,t)
}
//detect which context has the biggest probability for current sample
for(f=OMProb,e=-1;n;){
p=Cx[--n];t=p.c;
u=t>1?1/--t:0;
for(p=p.n;p;p=p.p)
if(p.s===c){
p=p.c*u;
if(p>f)f=p,e=n;
break
}
}
//increment optimality frequency
~e?Cx[e].o++:++OMOpt;
up:for(j=++m;--j;Cx[j]=p){
for(e=0,n=Cx[j-1],p=n.n;p;e=p,p=p.p)
if(p.s===c){
p.c++,f=p.p,e?e.p=f:n.n=f,p.p=n.n,n.n=p;
continue up
}
if(ml>0)
e=n.n,n.n=p=new Leaf(c,i),n.e++,ml--,p.p=e;
else p=x;m=j
}root.c++ // order 0
}
for(i=5;i--;L=L<<8>>>0)
if((f=L>0xffffffff)||L>>>24<255)
for(O[o++]=255&f--+B,B=L>>>24;C;C--)O[o++]=255&f;
else++C;
done&&done(O,z,o);return O
}
/* decompressor
@A :input(Array / Uint8Array)
@done: call back of last process.
done(A,a,z)
@A: (de)compressed array of input
@a: last position
@z: (de)compressed size
@rate: call back of progress
rate(a,z)
@a: current position
@z: last position
@return:
call with await: decompressed array of @A
without await: Promise object
*/
async function PPMYdec(A,done,rate){
function dec(c,f,t,n){L-=n=c*R/t+1>>>0;
for(R=R*(c+f)/t-n>>>0;R<16777216;R*=256)L=(L<<8|A[a++])>>>0
}
var a=3,b,c=A[2],e,f,i=128,j=1,p=A[1],t,u,v,w,y=A.length,z=0,
L,R=-1>>>0,O=[], // coder
sseLastHit=0,SSE=[],
root={c:1,e:0,n:0,o:0},x=new Leaf(0,0),Cx=[root],// context tree
F=new Float64Array(CNUM),C=new Uint32Array(CNUM),
m=1,n,mo=(A[0]|p<<8&1023)+2,ml=4e9/(p>>2&255|c<<6&4095),OMOpt=1, // int
OMTotal=CNUM,OMProb,ix=rate?4095:-1,fn=b=>wait0(c=>b(rate(a,y))),st=+new Date;
for(c>>=6;z+=A[a++]*j,c--;)j*=256;
for(j=5;--j;)L=(L<<8|A[a++])>>>0;
for(;i;)for(p=SSE[--i]=[],j=8;j;)p[--j]=64|i<<7;
for(;i<z;i&ix||Date.now()-st<200||await new Promise(fn,st=+new Date)){
if(m>mo)m=mo;
OMProb=.25/OMTotal*OMOpt/-~i;
for(j=CNUM;j;)F[--j]=OMProb;
for(n=0;n<m;){
p=Cx[n];b=p.n;
if(!b.c&&b&1)b=p.n=new Leaf(O[b>>=1],b+1),p.e++,ml--;
t=p.c-1;u=1/t;
e=p.e,j=Math.min(m,MY-1)*MX+Math.min(n++,MX-1);
f=e^1?OWW[j]:OWW1[j];
if(m<3)e=1;
else{
w=e^1?EWW[j]:EWW1[j];j=ETT[j];
e=1+w*(t<j?t-e:j-e)*u;
if(e<0)e=0;e/=1+w
}
for(e*=(f+p.o)/(f+t)*u;b;b=b.p)F[b.s]+=b.c*e
}
for(j=CNUM,t=u=0;j--;)u+=F[j];
var maxF=j,maxS;
for(u=u>0?SCALE/u:0;++j<CNUM;){
t+=C[j]=p=F[j]*u+1|0;
if(p>maxF)maxF=p,maxS=j
}
p=SSE[4096*maxF/t>>5];
f=p[j=n>1?(Cx[j=n-1].n.s===maxS)<<1|(Cx[j].o*2+2>Cx[j].c)<<2|sseLastHit:0];
if(L*0x4000/R>>>0<f)
dec(0,f,0x4000),
p[j]+=0x3fff-f>>8,
sseLastHit=1,c=maxS;
else{
dec(f,0x4000-f,0x4000);
p[j]-=f>>8,sseLastHit=c=w=0,t-=maxF;
for(v=L*t/R>>>0;c<CNUM;c++)if(c^maxS){
if(v<w+C[c])break;
w+=C[c]
}
dec(w,C[c],t)
}
for(f=OMProb,e=-1;n;){
p=Cx[--n];t=p.c;
u=t>1?1/--t:0;
for(p=p.n;p;p=p.p)
if(p.s===c){
p=p.c*u;
if(p>f)f=p,e=n;
break
}
}
~e?Cx[e].o++:++OMOpt;O[i++]=c;
up:for(j=++m;--j;Cx[j]=p){
for(e=0,n=Cx[j-1],p=n.n;p;e=p,p=p.p)
if(p.s===c){
p.c++,f=p.p,e?e.p=f:n.n=f,p.p=n.n,n.n=p;
continue up
}
if(ml>0)
e=n.n,n.n=p=new Leaf(c,i),n.e++,ml--,p.p=e;
else p=x;m=j
}root.c++
}done&&done(O,y,i);return O
}
PPMYencの引数の説明は上記にも書いてある通り。@A
は圧縮元配列、@mo
は最大次数(0-1023)、@ml
は作成できる文脈木の葉の最大数(0-4095)で、小さい方が多くなる天邪鬼な性格、@done
は処理終了後のcallback関数、@rate
は処理途中のcallback関数。
PPMYdecの@A
は圧縮配列、他はPPMYenc同様。
使用例
(async()=>{
let A=Array.from("That that is is that that is not is not is that it it is",a=>a.charCodeAt());
let rate=(a,b)=>{console.log((a/b*1e4|0)/100,"%")}, done=(a,b,c)=>{console.log(b,"->",c,a)};
let e=await PPMYenc(A,14,111,done,rate), d=await PPMYdec(e,done,rate);
console.log(e.length, String.fromCharCode(...d))
// call without await
PPMYenc(A,14,111,done,rate).then(a=>{
PPMYdec(a,done,rate).then(b=>{
console.log(b.length,"->",a.length, String.fromCharCode(...b))
})
})
})()
codepen様到来
See the Pen PPMY compressor by xezz (@xezz) on CodePen.