RePairを拡張し、2語以上の共通部分(規則)を1語に置換しまくって圧縮するという企画。やっている事はMR-RePair
と同じですが、少し原理が違います。
文脈自由文法の構築にはB+木を利用。本家MR-RePairほど高速ではありませんが、規則の選択順の優位性から圧縮率は良くなる傾向があります。
犯行の手口
abcdabcd
という文字列をRePairで圧縮すると、以下のように規則3個とCC
という集合(仮)に変換できますが…
S = abcdabcd
A = ab
B = cd
C = AB
S' = CC
MR-RePairの場合、上記の規則AとBを併合して規則の数を減らします。
S = abcdabcd
C = abcd
S' = CC
その処理をするために以下の関数を使うわけです。
function growRule(c,rule,word){
for(word.len+=rule.len-1;word;word=word.n)
if(word.c===c){
let n=word.n;
word.c=rule.c;
for(word.n=rule.n;word=rule.n;)rule=word;
rule.n=n;break
}
delete Word[c]
}
growRule("A","ab","AB")
によりC
はabB
になり、growRule("B","cd","abB")
によりC
はabcd
となる感じです。実際には引数は数値と線形listで実現しています。
ちなみに本家MR-RePairはもっと賢く、省memoryかつ高速に規則を減らしています。
実装
B+木と文脈自由文法構築部
{const arraycopy=(A,a,B,b,n)=>{
if(n)if(A!==B||a+n<=b)for(;n--;)B[b++]=A[a++];
else for(A=A.slice(a,a+n),a=0;n--;)B[b++]=A[a++]
},
read32=(A,a)=>A[a]<<24|A[a+1]<<16|A[a+2]<<8|A[a+3],
read53=(A,a)=>A[a++]*281474976710656+A[a++]*1099511627776+A[a++]*4294967296+A[a++]*16777216+A[a++]*65536+A[a++]*256+A[a],
write32=(A,v,a)=>{A[a++]=v>>>24,A[a++]=255&v>>16,A[a++]=255&v>>8,A[a++]=255&v},
write53=(A,v,a)=>{
A[a++]=255&v/281474976710656,A[a++]=255&v/1099511627776,A[a++]=255&v/4294967296,
A[a++]=v>>>24,A[a++]=255&v>>16,A[a++]=255&v>>8,A[a++]=255&v
},
cmpKey=(a,o,b,l)=>{
for(var i=0,c;i<l;)if(c=a[o+i]-b[i++])return c;
return 0
},
binSearch=(A,n,seek)=>{
if(!n)return-1;
var z=n-1,l=seek.length,a=0,b=cmpKey(A,a,seek,l),c;
if(b>=0)return b&&-1;
c=cmpKey(A,z*l,seek,l);
if(c<1)return c?~n:z;
for(;a<=z;c>0?z=b-1:a=b+1){
b=a+(z-a>>>1),c=cmpKey(A,b*l,seek,l);
if(!c)return b
}return~a
},
hash32=a=>{
a=~(a/16777216)*~(a&16777215)%16777259;
return((a>>>16)*52503+(a&=65535)*1883<<16)+a*52503
};
class BplusTree{
constructor(fs){
this.root=new Leaf(this.fs=fs);
this.count=0
}
insert(k){
if(this.root.isOverflow()){
var bros=this.root.split(this),n=new Joint(this.fs);
n.joinSon(0,this.root,this.fs);
n.joinSon(1,bros,this.fs);
this.root=n
}this.root.insert(k,this)&&this.count++
}
kill(k){this.root.kill(k,this)&&--this.count}
pull(){return this.root.pull(this,--this.count)}
pop(){return this.root.pop(this,--this.count)}
}
class Joint{
constructor(fs){
this.son=Array(this.limit=512);this.kc=0
this.keys=new Uint8Array(this.limit*fs);
}
kill(k,t){
var i=this.getSon(k),c=this.son[i],r=c.kill(k,t);
arraycopy(c.keys,0,this.keys,i*t.fs,t.fs);
if(c.isUnderflow()){
this.killSon(i,t.fs);
if(t.root.kc==1&&t.root instanceof Joint)
t.root=t.root.son[0]
}return r
}
insert(k,t){
var i=this.getSon(k),c=this.son[i];
if(c.isOverflow()){
this.joinSon(i+1,c.split(t),t.fs);
if(cmpKey(this.keys,(i+1)*t.fs,k,k.length)<=0)
c=this.son[++i]
}
k=c.insert(k,t);
arraycopy(c.keys,0,this.keys,i*t.fs,t.fs);
return k
}
merge(n,fs){
arraycopy(n.keys,0,this.keys,this.kc*fs,n.kc*fs);
arraycopy(n.son,0,this.son,this.kc,n.kc);
this.kc+=n.kc
}
split(t){
var bros=new Joint(t.fs),l=this.kc>>1;
bros.kc=this.kc-l;
arraycopy(this.keys,l*t.fs,bros.keys,0,bros.kc*t.fs);
arraycopy(this.son,l,bros.son,0,bros.kc);
this.kc=l;
return bros
}
isOverflow(){return this.kc===this.limit}
isUnderflow(){return this.kc<this.limit>>1}
pull(t){
var c=this.son[0],r=c.pull(t);
arraycopy(c.keys,0,this.keys,0,t.fs);
if(c.isUnderflow()){
this.killSon(0,t.fs);
if(t.root.kc==1&&t.root instanceof Joint)
t.root=t.root.son[0]
}return r
}
pop(t){
var c=this.son[this.kc-1],r=c.pop(t);
if(!c.kc){
this.son[--this.kc]=null;
if(t.root.kc==1&&t.root instanceof Joint)
t.root=t.root.son[0]
}return r
}
getSon(k){return(k=binSearch(this.keys,this.kc,k))<0?~k?-k-2:~k:k}
killSon(p,fs){
var c=this.son[p];
if(p>0){
var b=this.son[p-1],hasMerged=(b.kc+c.kc)*fs<=b.keys.length;
hasMerged&&b.merge(c,fs)
}
if(!hasMerged&&p+1<this.kc){
b=this.son[p+1];
if((b.kc+c.kc)*fs<=c.keys.length)
c.merge(b,fs),p++,hasMerged=1
}
if(hasMerged){
arraycopy(this.keys,(p+1)*fs,this.keys,p*fs,(--this.kc-p)*fs);
arraycopy(this.son,p+1,this.son,p,this.kc-p);
this.son[this.kc]=null
}
}
joinSon(i,c,fs){
if(i<this.kc)
arraycopy(this.keys,i*fs,this.keys,(i+1)*fs,(this.kc-i)*fs),
arraycopy(this.son,i,this.son,i+1,this.kc-i);
arraycopy(c.keys,0,this.keys,i*fs,fs);
this.son[i]=c;
this.kc++
}}
class Leaf{
constructor(fs){
this.keys=new Uint8Array((this.limit=128)*fs);
this.kc=0
}
kill(k,t){
k=binSearch(this.keys,this.kc,k);
if(k>=0){
arraycopy(this.keys,~k*-t.fs,this.keys,k*t.fs,(--this.kc-k)*t.fs);
return 1
}
}
insert(k,t){
var p=binSearch(this.keys,this.kc,k);
if(p<0){
p=~p;p<this.kc++&&arraycopy(this.keys,p*t.fs,this.keys,~p*-t.fs,(this.kc+~p)*t.fs);
arraycopy(k,0,this.keys,p*t.fs,t.fs);
return 1
}
}
merge(bros,fs){
arraycopy(bros.keys,0,this.keys,this.kc*fs,bros.kc*fs);
this.kc+=bros.kc;
this.next=bros.next
}
split(t){
var bros=new Leaf(t.fs),l=this.kc>>1;
bros.kc=this.kc-l;
arraycopy(this.keys,l*t.fs,bros.keys,0,bros.kc*t.fs);
this.kc=l;
bros.next=this.next;
return this.next=bros
}
isOverflow(){return this.kc===this.limit}
isUnderflow(){return this.kc<this.limit>>1}
pull(t,r){
r=this.keys.slice(0,t.fs);
arraycopy(this.keys,t.fs,this.keys,0,--this.kc*t.fs);
return r
}
pop(t,r){return this.keys.subarray(r=--this.kc*t.fs,r+t.fs)}
}
///// hash
class HashTable{
constructor(bs){
this.dummy={w:-1};
this.values=Array(bs=1<<bs);
this.mask=bs-1
}
add(o){
for(var h=o.w,i=hash32(h),m=this.mask,p=this.values[i&m],d=this.dummy;p!==d&&p&&p.w-h;)p=this.values[++i&m];
if(p&&p!=d)return;
this.values[i&m]=o
}
get(h){
for(var i=hash32(h),m=this.mask,p=this.values[i&m],d=this.dummy;p===d||p&&p.w-h;)p=this.values[++i&m];
return p
}
kill(o){
for(var h=o.w,i=hash32(h),m=this.mask,p=this.values[i&m],d=this.dummy;p===d||p&&p.w-h;)p=this.values[++i&m];
if(p)this.values[i&m]=d
}}
class CNode{
constructor(s,i){
this.w=s;this.met=1;
this.hit=this.vary=0;
this.top=this.end=i
}
add(L,R,i){this.end=~this.end?R[L[i]=this.end]=i:this.top=i}
clear(L,R){
for(var i=this.top,n;i>-1;i=n)n=R[i],L[i]=R[i]=-1;
this.top=this.end=-1
}
cut(L,R,i){
var l=L[i],r=R[i];
if(l>-1)R[l]=r;
if(r>-1)L[r]=l;
L[i]=R[i]=-1;
if(i==this.top)this.top=r;
if(i==this.end)this.end=l
}}
function l2b(a,b,c){
c=(65535<a)<<4,c|=b=(255<(a>>>=c))<<3,c|=b=(15<(a>>=b))<<2;return(c|=b=(3<(a>>=b))<<1)|a>>b>>1
}
/*Context Free Grammar constructor
@size: input size
@low: minimum occurrence of common symbol pair = 2+@low
@grow: if !grow==false, grow rule
*/
function Pairing(size,low,grow){
this.sn=1;this.low=(low&3)+2;
var A=this.Hit=new Int32Array(size=this.limit=Math.min(256+size/this.low>>>0,1<<24));
this.Rc=grow||A.slice(),this.Lc=grow||A.slice(),this.wide=A.slice();
this.hasEndSymbol=new Int8Array(size)
}
Pairing.prototype={
map:function(A,eos,putList){
for(var a=A.length,b=0,c=a,n=257-!eos,B=new Int16Array(n),S=new Int32Array(eos?c+1:c);a--;)B[A[a]]=1;
putList&&putList(B);
for(B[256]=1;++a<n;)B[a]&&(this.wide[this.Lc[b]=B[a]=b++]=1);
for(this.cs=b,this.sn=S[eos?c:-1]=b-1;c;)S[--c]=B[A[c]];
return S
},
symbolize:async function(S,grow,rate=a=>a){
/*
1.Find most common symbol pair
2.Make it a new symbol
3.Repeat
@S: input(Array/Uint8Array)
@grow: if !grow==false, grow rule
@rate: callback of progress
@return: sequence size
*/
function growRule(c,rule,word){
for(word.len+=rule.len-1;word;word=word.n)
if(word.c===c){
let n=word.n;word.c=rule.c;
for(word.n=rule.n;word=rule.n;)rule=word;
rule.n=n;break
}
delete Word[c]
}
var t=+new Date,a,b=-1,c,e,f,i=0,k,l=S.length,m,n=l2b(l)-3,o,p,r,s=this.sn,d=s,u,v,w,x=l,mn=this.low-1,mx=this.limit,
fs2s=new HashTable(n<20?20:n),Bt=new BplusTree(12),
Lcode=this.Lc,Rcode=this.Rc,L=new Int32Array(l--).fill(b),R=L.slice(),
Word=[],met=[],E=this.hasEndSymbol,Hit=this.Hit,W=this.wide,h=r=>setTimeout(a=>r(rate(l-x,l)),0);
for(this.symbols=S;i<l;b=c-b?c:-1)
Hit[v=S[i++]]++,
this.addCode(c=v*16777216+S[i],i-1,c-b,L,R,fs2s,met);
for(Hit[S[l++]]++;this.update(L,R,fs2s,met,Bt),++s<mx&&Bt.count;){
w=fs2s.get(k=read53(Bt.pop(),4));
E[s]=E[v=w.w];
o=W[u=0|v/16777216];W[s]=o+W[v&=16777215];
fs2s.kill(w);
for(i=w.top,c=-1,f=0;i>-1;i=r)
if(r=R[i],~S[p=i]){
for(m=n=i+o;p--&&S[p]<0;);
for(~p&&this.cutCode(b=S[p]*16777216+S[i],p,L,R,fs2s,met);++n<l&&S[n]<0;);
if(n<l)
b=v*16777216+S[n],
b-k&&this.cutCode(b,m,L,R,fs2s,met);
w.cut(L,R,i);f++;
Hit[S[i]=s]++;S[m]=-1;
if(~p)
a=S[p],m=a*16777216+s,Hit[a]>mn&&!E[a]&&this.addCode(m,p,m-c,L,R,fs2s,met),
c=m-c&&a==s?m:-1;
n<l&&Hit[a=S[n]]>mn&&!E[s]&&this.addCode(s*16777216+a,i,1,L,R,fs2s,met)
}
Hit[u]-=f,Hit[v]-=f,x-=f;
if(grow){
e={c:u,n:{c:v},hit:f--,len:2};
if(u>d&&(Word[u].hit-=f)<2)growRule(u,Word[u],e);
if(v>d&&(Word[v].hit-=f)<2)growRule(v,Word[v],e);
Word[s]=e
}else Lcode[s]=u,Rcode[s]=v;
f<x*.01&&(s&63||Date.now()-t<200)||await new Promise(h,t=+new Date)
}
this.sn=s;this.D=grow&&Word;
return x
},
addCode:function(s,i,up,L,R,fs2s,met,n){
if(n=fs2s.get(s)){
if(!n.met)met.push(n),n.met=1;
n.add(L,R,i)
}else fs2s.add(n=new CNode(s,i)),met.push(n);
up&&n.vary++
},
cutCode:function(n,i,L,R,fs2s,met){
if(n=fs2s.get(n)){
if(!n.met)met.push(n),n.met=1;
n.vary--;n.cut(L,R,i)
}
},
update:function(L,R,fs2s,met,Bt){
var c,n,l=this.low,K=new Uint8Array(12);
for(n of met){
write32(K,n.hit,0);write53(K,n.w,4);
Bt.kill(K);
c=n.hit+=n.vary;n.vary=n.met=0;
if(c<l)n.clear(L,R),fs2s.kill(n);
else write32(K,c,0),Bt.insert(K)
}met.length=0
}}}
圧縮伸長部
/*compress
@In: input(Array / Uint8Array)
@done: call back of last process
done(A,a,z)
@A: compressed array of input
@a: input size
@z: compressed size
@rate: call back of progress
rate(a,z)
@a: current position
@z: last position
@return:
call with await: compressed array of @In
without await: Promise object
*/
async function compress(In,done,rate){
function puts(n,l){// bit writer
if(bits>l)return data|=n<<(bits-=l);
bits=32-(l-=bits);
Out[o++]=(data|=n>>l)>>>24;Out[o++]=data>>16&255;
Out[o++]=data>>8&255;Out[o++]=data&255;
if(bits<32)return data=n<<bits;data=0
}
function encodeCFG(Rule,Use,c){
for(var b,d=0,p=0,r,s,n,u,Stack=[];;c=Stack[--p])
if(Rule[u=Use[c]]>=0){// known rule/symbol
for(;codes>>log2;)log2++;
if(c>l)c=Rule[u];
// CBT code
if(c<(b=(1<<log2)-codes))s=log2-1;
else s=log2,c+=b;
b=0<d++;puts(b<<s|c,b+s);
if(!p)return
}else if(c>=0)// add rule/symbol to stack
for(s=P[c],Stack[p++]=n=s.len,Stack[p++]=~u,n=p+=n;Stack[--n]=s.c,s=s.n;);
else{// unknown rule
n=Stack[--p]-1;
if(d<3)puts(0,1);
else if(d<4)puts(n>1,2);
else b=s=l2b(n),(r=b>=l2b(d-1))||b++,puts(n^r<<b,++s*2-r);
d-=n;Rule[~c]=codes++;
if(!p)return
}
}
var c,x,l,n,o=0,size=In.length,codes=0,log2=0,bits=32,data,
Out=[],P=new Pairing(size,0,1),S,Rule=[],Use=[];
if(!size)return done(Out,size,o);
x=await P.symbolize(S=P.map(In,0,function(A,m,n,l,o,p,r,s){
// write symbols by RLE+gamma code
for(puts(A[l=n=o=0],1);n=l<256;puts(n,r)){
for(m=A[l],p=l2b(256-l);m&&o++,m==A[++l]&&l<256;n++);
r=s=l2b(m=+n);r<p||(n^=1<<r--);r-=m<256?~s:-1
}
for(codes=o;Rule[--o]=Use[o]=o;);
}),1,rate);
P=P.D;l=P.length;
// mapping
for(c=n=codes;n<l;n++)P[n]&&(Use[n]=c++);
puts(l=l2b(x)+1,5);puts(x^1<<--l,l);
// encoding loop
for(l=codes-1,n=0;;){
c=S[n++];
if(~c){
encodeCFG(Rule,Use,c);puts(0,1);
if(!--x)break
}
}
// incompressible
if(o+4>size){
Out.length=c=n=o=data=0;
for(bits=32;size>>++c;);
puts(c,14);puts(size^1<<--c,c);
while(n<size)puts(In[n++],8)
}
if(bits<32)for(;data;data<<=8)Out[o++]=data>>>24;
done&&done(Out,size,o);
return Out
}
/*decompress
@In: input(Array / Uint8Array)
@done: call back of last process
done(A,a,z)
@A: decompressed array of @In
@a: last position
@z: decompressed size
@rate: call back of progress
rate(a,z)
@a: current position
@z: last position
@return:
call with await: decompressed array of @In
without await: Promise object
*/
async function decompress(In,done,rate){
function gets(l,n){
n=data>>>32-l;
if(l<bits)return l&&(data<<=l,bits-=l,n);
l-=bits;bits=32-l;
data=In[a++]<<24|In[a++]<<16|In[a++]<<8|In[a++];
if(l)n|=data>>>bits,data<<=l;
return n
}
var a=0, o=0, u=0, bits=0, data, rule, codes=0, log2=0, loop;
const size=In.length, x=rate?32767:-1, fn=b=>setTimeout(c=>b(rate(a,size)),0),
Out=[], Rule=[], Stack=[], Word=[], T=[];
if(!size)return done(Out,size,o);
// read symbols
for(let e=0,f=gets(1);e<256;f^=1){
let c=0,d=l2b(256-e);
for(;c<d&&!gets(1);)++c;
for(c=c<8?1<<c|gets(c):256;c--;e++)if(f)T[codes++]=e
}
loop=gets(5);loop=loop&&gets(--loop)|1<<loop;
// decoding loop
if(rule=codes)for(;loop;loop--&x||await new Promise(fn))
for(let s=0;;)
if(!s||gets(1)){// known rule/symbol
for(;codes>>log2;)log2++;
// CBT code
let c=gets(log2-1),d=(1<<log2)-codes;
if(c>=d)c+=c+gets(1)-d;
// expand rule
for(Stack[s++]=c;;c=Word[--u])
if(c<rule){Out[o++]=T[c];if(!u)break}
else for(d=Rule[c],c=d.length;c;)Word[u++]=d[--c]
}else{// unknown rule
if(s<2)break;
let c=1,d;
if(s===3)c+=gets(1);
else if(s>3){
for(c=0,d=l2b(s-1);c<d&&!gets(1);)c++;
c=1<<c|gets(c)
}
for(d=Rule[codes]=[];d[c]=Stack[--s],c--;);
Stack[s++]=codes++
}else for(;o<loop;)Out[o++]=gets(8);
done&&done(Out,size,o);
return Out
}
文脈自由文法の作り方
//RePair風
(async()=>{
var A=[1,2,3,11,22,33,11,22,33,1,2,3],
p=new Pairing(A.length,0,false),
output=p.map(A),
size=await p.symbolize(output,false)
})()
//MR-RePair風
(async()=>{
var A=[1,2,3,11,22,33,11,22,33,1,2,3],
p=new Pairing(A.length,0,true),
output=p.map(A),
size=await p.symbolize(output,true)
})()
圧縮と伸長の例
(async()=>{
let A=Array.from("That that is is that that is not is not is that it it is",a=>a.charCodeAt());
let rate=(a,b)=>{console.log((a/b*1e4|0)/100,"%")}, done=(a,b,c)=>{console.log(b,"->",c,a)};
let e=await compress(A,done,rate), d=await decompress(e,done,rate);
console.log(e.length, String.fromCharCode(...d))
})()
codepenでfile処理。BTReePairなどと名乗ってやがります
See the Pen BTReePair by xezz (@xezz) on CodePen.
参考資料
TReePair:昔書いたごみ。C版はJavaScript版より可読性良いかも。MR-RePairを知る前にRePairを強引に書き換えて、規則拡張機能搭載した代物