0
0

RePairを拡張し、2語以上の共通部分(規則)を1語に置換しまくって圧縮するという企画。やっている事はMR-RePairと同じですが、少し原理が違います。
文脈自由文法の構築にはB+木を利用。本家MR-RePairほど高速ではありませんが、規則の選択順の優位性から圧縮率は良くなる傾向があります。

犯行の手口

abcdabcdという文字列をRePairで圧縮すると、以下のように規則3個とCCという集合(仮)に変換できますが…

S = abcdabcd
A = ab
B = cd
C = AB
S' = CC

MR-RePairの場合、上記の規則AとBを併合して規則の数を減らします。

S = abcdabcd
C = abcd
S' = CC

その処理をするために以下の関数を使うわけです。

function growRule(c,rule,word){
	for(word.len+=rule.len-1;word;word=word.n)
		if(word.c===c){
			let n=word.n;
			word.c=rule.c;
			for(word.n=rule.n;word=rule.n;)rule=word;
			rule.n=n;break
		}
	delete Word[c]
}

growRule("A","ab","AB")によりCabBになり、growRule("B","cd","abB")によりCabcdとなる感じです。実際には引数は数値と線形listで実現しています。
ちなみに本家MR-RePairはもっと賢く、省memoryかつ高速に規則を減らしています。

実装

B+木と文脈自由文法構築部

{const arraycopy=(A,a,B,b,n)=>{
	if(n)if(A!==B||a+n<=b)for(;n--;)B[b++]=A[a++];
	else for(A=A.slice(a,a+n),a=0;n--;)B[b++]=A[a++]
},
read32=(A,a)=>A[a]<<24|A[a+1]<<16|A[a+2]<<8|A[a+3],
read53=(A,a)=>A[a++]*281474976710656+A[a++]*1099511627776+A[a++]*4294967296+A[a++]*16777216+A[a++]*65536+A[a++]*256+A[a],
write32=(A,v,a)=>{A[a++]=v>>>24,A[a++]=255&v>>16,A[a++]=255&v>>8,A[a++]=255&v},
write53=(A,v,a)=>{
	A[a++]=255&v/281474976710656,A[a++]=255&v/1099511627776,A[a++]=255&v/4294967296,
	A[a++]=v>>>24,A[a++]=255&v>>16,A[a++]=255&v>>8,A[a++]=255&v
},
cmpKey=(a,o,b,l)=>{
	for(var i=0,c;i<l;)if(c=a[o+i]-b[i++])return c;
	return 0
},
binSearch=(A,n,seek)=>{
	if(!n)return-1;
	var z=n-1,l=seek.length,a=0,b=cmpKey(A,a,seek,l),c;
	if(b>=0)return b&&-1;
	c=cmpKey(A,z*l,seek,l);
	if(c<1)return c?~n:z;
	for(;a<=z;c>0?z=b-1:a=b+1){
		b=a+(z-a>>>1),c=cmpKey(A,b*l,seek,l);
		if(!c)return b
	}return~a
},
hash32=a=>{
	a=~(a/16777216)*~(a&16777215)%16777259;
	return((a>>>16)*52503+(a&=65535)*1883<<16)+a*52503
};
class BplusTree{
constructor(fs){
	this.root=new Leaf(this.fs=fs);
	this.count=0
}
insert(k){
	if(this.root.isOverflow()){
		var bros=this.root.split(this),n=new Joint(this.fs);
		n.joinSon(0,this.root,this.fs);
		n.joinSon(1,bros,this.fs);
		this.root=n
	}this.root.insert(k,this)&&this.count++
}
kill(k){this.root.kill(k,this)&&--this.count}
pull(){return this.root.pull(this,--this.count)}
pop(){return this.root.pop(this,--this.count)}
}
class Joint{
constructor(fs){
	this.son=Array(this.limit=512);this.kc=0
	this.keys=new Uint8Array(this.limit*fs);
}
kill(k,t){
	var i=this.getSon(k),c=this.son[i],r=c.kill(k,t);
	arraycopy(c.keys,0,this.keys,i*t.fs,t.fs);
	if(c.isUnderflow()){
		this.killSon(i,t.fs);
		if(t.root.kc==1&&t.root instanceof Joint)
			t.root=t.root.son[0]
	}return r
}
insert(k,t){
	var i=this.getSon(k),c=this.son[i];
	if(c.isOverflow()){
		this.joinSon(i+1,c.split(t),t.fs);
		if(cmpKey(this.keys,(i+1)*t.fs,k,k.length)<=0)
			c=this.son[++i]
	}
	k=c.insert(k,t);
	arraycopy(c.keys,0,this.keys,i*t.fs,t.fs);
	return k
}
merge(n,fs){
	arraycopy(n.keys,0,this.keys,this.kc*fs,n.kc*fs);
	arraycopy(n.son,0,this.son,this.kc,n.kc);
	this.kc+=n.kc
}
split(t){
	var bros=new Joint(t.fs),l=this.kc>>1;
	bros.kc=this.kc-l;
	arraycopy(this.keys,l*t.fs,bros.keys,0,bros.kc*t.fs);
	arraycopy(this.son,l,bros.son,0,bros.kc);
	this.kc=l;
	return bros
}
isOverflow(){return this.kc===this.limit}
isUnderflow(){return this.kc<this.limit>>1}
pull(t){
	var c=this.son[0],r=c.pull(t);
	arraycopy(c.keys,0,this.keys,0,t.fs);
	if(c.isUnderflow()){
		this.killSon(0,t.fs);
		if(t.root.kc==1&&t.root instanceof Joint)
			t.root=t.root.son[0]
	}return r
}
pop(t){
	var c=this.son[this.kc-1],r=c.pop(t);
	if(!c.kc){
		this.son[--this.kc]=null;
		if(t.root.kc==1&&t.root instanceof Joint)
			t.root=t.root.son[0]
	}return r
}
getSon(k){return(k=binSearch(this.keys,this.kc,k))<0?~k?-k-2:~k:k}
killSon(p,fs){
	var c=this.son[p];
	if(p>0){
		var b=this.son[p-1],hasMerged=(b.kc+c.kc)*fs<=b.keys.length;
		hasMerged&&b.merge(c,fs)
	}
	if(!hasMerged&&p+1<this.kc){
		b=this.son[p+1];
		if((b.kc+c.kc)*fs<=c.keys.length)
			c.merge(b,fs),p++,hasMerged=1
	}
	if(hasMerged){
		arraycopy(this.keys,(p+1)*fs,this.keys,p*fs,(--this.kc-p)*fs);
		arraycopy(this.son,p+1,this.son,p,this.kc-p);
		this.son[this.kc]=null
	}
}
joinSon(i,c,fs){
	if(i<this.kc)
		arraycopy(this.keys,i*fs,this.keys,(i+1)*fs,(this.kc-i)*fs),
		arraycopy(this.son,i,this.son,i+1,this.kc-i);
	arraycopy(c.keys,0,this.keys,i*fs,fs);
	this.son[i]=c;
	this.kc++
}}
class Leaf{
constructor(fs){
	this.keys=new Uint8Array((this.limit=128)*fs);
	this.kc=0
}
kill(k,t){
	k=binSearch(this.keys,this.kc,k);
	if(k>=0){
		arraycopy(this.keys,~k*-t.fs,this.keys,k*t.fs,(--this.kc-k)*t.fs);
		return 1
	}
}
insert(k,t){
	var p=binSearch(this.keys,this.kc,k);
	if(p<0){
		p=~p;p<this.kc++&&arraycopy(this.keys,p*t.fs,this.keys,~p*-t.fs,(this.kc+~p)*t.fs);
		arraycopy(k,0,this.keys,p*t.fs,t.fs);
		return 1
	}
}
merge(bros,fs){
	arraycopy(bros.keys,0,this.keys,this.kc*fs,bros.kc*fs);
	this.kc+=bros.kc;
	this.next=bros.next
}
split(t){
	var bros=new Leaf(t.fs),l=this.kc>>1;
	bros.kc=this.kc-l;
	arraycopy(this.keys,l*t.fs,bros.keys,0,bros.kc*t.fs);
	this.kc=l;
	bros.next=this.next;
	return this.next=bros
}
isOverflow(){return this.kc===this.limit}
isUnderflow(){return this.kc<this.limit>>1}
pull(t,r){
	r=this.keys.slice(0,t.fs);
	arraycopy(this.keys,t.fs,this.keys,0,--this.kc*t.fs);
	return r
}
pop(t,r){return this.keys.subarray(r=--this.kc*t.fs,r+t.fs)}
}
///// hash
class HashTable{
constructor(bs){
	this.dummy={w:-1};
	this.values=Array(bs=1<<bs);
	this.mask=bs-1
}
add(o){
	for(var h=o.w,i=hash32(h),m=this.mask,p=this.values[i&m],d=this.dummy;p!==d&&p&&p.w-h;)p=this.values[++i&m];
	if(p&&p!=d)return;
	this.values[i&m]=o
}
get(h){
	for(var i=hash32(h),m=this.mask,p=this.values[i&m],d=this.dummy;p===d||p&&p.w-h;)p=this.values[++i&m];
	return p
}
kill(o){
	for(var h=o.w,i=hash32(h),m=this.mask,p=this.values[i&m],d=this.dummy;p===d||p&&p.w-h;)p=this.values[++i&m];
	if(p)this.values[i&m]=d
}}
class CNode{
constructor(s,i){
	this.w=s;this.met=1;
	this.hit=this.vary=0;
	this.top=this.end=i
}
add(L,R,i){this.end=~this.end?R[L[i]=this.end]=i:this.top=i}

clear(L,R){
	for(var i=this.top,n;i>-1;i=n)n=R[i],L[i]=R[i]=-1;
	this.top=this.end=-1
}
cut(L,R,i){
	var l=L[i],r=R[i];
	if(l>-1)R[l]=r;
	if(r>-1)L[r]=l;
	L[i]=R[i]=-1;
	if(i==this.top)this.top=r;
	if(i==this.end)this.end=l
}}
function l2b(a,b,c){
	c=(65535<a)<<4,c|=b=(255<(a>>>=c))<<3,c|=b=(15<(a>>=b))<<2;return(c|=b=(3<(a>>=b))<<1)|a>>b>>1
}
/*Context Free Grammar constructor
@size: input size
@low: minimum occurrence of common symbol pair = 2+@low
@grow: if !grow==false, grow rule
*/
function Pairing(size,low,grow){
	this.sn=1;this.low=(low&3)+2;
	var A=this.Hit=new Int32Array(size=this.limit=Math.min(256+size/this.low>>>0,1<<24));
	this.Rc=grow||A.slice(),this.Lc=grow||A.slice(),this.wide=A.slice();
	this.hasEndSymbol=new Int8Array(size)
}
Pairing.prototype={
map:function(A,eos,putList){
	for(var a=A.length,b=0,c=a,n=257-!eos,B=new Int16Array(n),S=new Int32Array(eos?c+1:c);a--;)B[A[a]]=1;
	putList&&putList(B);
	for(B[256]=1;++a<n;)B[a]&&(this.wide[this.Lc[b]=B[a]=b++]=1);
	for(this.cs=b,this.sn=S[eos?c:-1]=b-1;c;)S[--c]=B[A[c]];
	return S
},
symbolize:async function(S,grow,rate=a=>a){
/*
1.Find most common symbol pair
2.Make it a new symbol
3.Repeat

@S: input(Array/Uint8Array)
@grow: if !grow==false, grow rule
@rate: callback of progress
@return: sequence size
*/
	function growRule(c,rule,word){
		for(word.len+=rule.len-1;word;word=word.n)
			if(word.c===c){
				let n=word.n;word.c=rule.c;
				for(word.n=rule.n;word=rule.n;)rule=word;
				rule.n=n;break
			}
		delete Word[c]
	}
	var t=+new Date,a,b=-1,c,e,f,i=0,k,l=S.length,m,n=l2b(l)-3,o,p,r,s=this.sn,d=s,u,v,w,x=l,mn=this.low-1,mx=this.limit,
		fs2s=new HashTable(n<20?20:n),Bt=new BplusTree(12),
		Lcode=this.Lc,Rcode=this.Rc,L=new Int32Array(l--).fill(b),R=L.slice(),
		Word=[],met=[],E=this.hasEndSymbol,Hit=this.Hit,W=this.wide,h=r=>setTimeout(a=>r(rate(l-x,l)),0);
	for(this.symbols=S;i<l;b=c-b?c:-1)
		Hit[v=S[i++]]++,
		this.addCode(c=v*16777216+S[i],i-1,c-b,L,R,fs2s,met);
	for(Hit[S[l++]]++;this.update(L,R,fs2s,met,Bt),++s<mx&&Bt.count;){
		w=fs2s.get(k=read53(Bt.pop(),4));
		E[s]=E[v=w.w];
		o=W[u=0|v/16777216];W[s]=o+W[v&=16777215];
		fs2s.kill(w);
		for(i=w.top,c=-1,f=0;i>-1;i=r)
			if(r=R[i],~S[p=i]){
				for(m=n=i+o;p--&&S[p]<0;);
				for(~p&&this.cutCode(b=S[p]*16777216+S[i],p,L,R,fs2s,met);++n<l&&S[n]<0;);
				if(n<l)
					b=v*16777216+S[n],
					b-k&&this.cutCode(b,m,L,R,fs2s,met);
				w.cut(L,R,i);f++;
				Hit[S[i]=s]++;S[m]=-1;
				if(~p)
					a=S[p],m=a*16777216+s,Hit[a]>mn&&!E[a]&&this.addCode(m,p,m-c,L,R,fs2s,met),
					c=m-c&&a==s?m:-1;
				n<l&&Hit[a=S[n]]>mn&&!E[s]&&this.addCode(s*16777216+a,i,1,L,R,fs2s,met)
			}
		Hit[u]-=f,Hit[v]-=f,x-=f;
		if(grow){
			e={c:u,n:{c:v},hit:f--,len:2};
			if(u>d&&(Word[u].hit-=f)<2)growRule(u,Word[u],e);
			if(v>d&&(Word[v].hit-=f)<2)growRule(v,Word[v],e);
			Word[s]=e
		}else Lcode[s]=u,Rcode[s]=v;
		f<x*.01&&(s&63||Date.now()-t<200)||await new Promise(h,t=+new Date)
	}
	this.sn=s;this.D=grow&&Word;
	return x
},
addCode:function(s,i,up,L,R,fs2s,met,n){
	if(n=fs2s.get(s)){
		if(!n.met)met.push(n),n.met=1;
		n.add(L,R,i)
	}else fs2s.add(n=new CNode(s,i)),met.push(n);
	up&&n.vary++
},
cutCode:function(n,i,L,R,fs2s,met){
	if(n=fs2s.get(n)){
		if(!n.met)met.push(n),n.met=1;
		n.vary--;n.cut(L,R,i)
	}
},
update:function(L,R,fs2s,met,Bt){
	var c,n,l=this.low,K=new Uint8Array(12);
	for(n of met){
		write32(K,n.hit,0);write53(K,n.w,4);
		Bt.kill(K);
		c=n.hit+=n.vary;n.vary=n.met=0;
		if(c<l)n.clear(L,R),fs2s.kill(n);
		else write32(K,c,0),Bt.insert(K)
	}met.length=0
}}}

圧縮伸長部

/*compress
@In: input(Array / Uint8Array)
@done: call back of last process
	done(A,a,z)
	@A: compressed array of input
	@a: input size
	@z: compressed size
@rate: call back of progress
	rate(a,z)
	@a: current position
	@z: last position
@return:
	call with await: compressed array of @In
	without await: Promise object
*/
async function compress(In,done,rate){
	function puts(n,l){// bit writer
		if(bits>l)return data|=n<<(bits-=l);
		bits=32-(l-=bits);
		Out[o++]=(data|=n>>l)>>>24;Out[o++]=data>>16&255;
		Out[o++]=data>>8&255;Out[o++]=data&255;
		if(bits<32)return data=n<<bits;data=0
	}
	function encodeCFG(Rule,Use,c){
		for(var b,d=0,p=0,r,s,n,u,Stack=[];;c=Stack[--p])
			if(Rule[u=Use[c]]>=0){// known rule/symbol
				for(;codes>>log2;)log2++;
				if(c>l)c=Rule[u];
				// CBT code
				if(c<(b=(1<<log2)-codes))s=log2-1;
				else s=log2,c+=b;
				b=0<d++;puts(b<<s|c,b+s);
				if(!p)return
			}else if(c>=0)// add rule/symbol to stack
				for(s=P[c],Stack[p++]=n=s.len,Stack[p++]=~u,n=p+=n;Stack[--n]=s.c,s=s.n;);
			else{// unknown rule
				n=Stack[--p]-1;
				if(d<3)puts(0,1);
				else if(d<4)puts(n>1,2);
				else b=s=l2b(n),(r=b>=l2b(d-1))||b++,puts(n^r<<b,++s*2-r);
				d-=n;Rule[~c]=codes++;
				if(!p)return
			}
	}
	var c,x,l,n,o=0,size=In.length,codes=0,log2=0,bits=32,data,
		Out=[],P=new Pairing(size,0,1),S,Rule=[],Use=[];
	if(!size)return done(Out,size,o);
	x=await P.symbolize(S=P.map(In,0,function(A,m,n,l,o,p,r,s){
		// write symbols by RLE+gamma code
		for(puts(A[l=n=o=0],1);n=l<256;puts(n,r)){
			for(m=A[l],p=l2b(256-l);m&&o++,m==A[++l]&&l<256;n++);
			r=s=l2b(m=+n);r<p||(n^=1<<r--);r-=m<256?~s:-1
		}
		for(codes=o;Rule[--o]=Use[o]=o;);
	}),1,rate);
	P=P.D;l=P.length;
	// mapping
	for(c=n=codes;n<l;n++)P[n]&&(Use[n]=c++);
	puts(l=l2b(x)+1,5);puts(x^1<<--l,l);
	// encoding loop
	for(l=codes-1,n=0;;){
		c=S[n++];
		if(~c){
			encodeCFG(Rule,Use,c);puts(0,1);
			if(!--x)break
		}
	}
	// incompressible
	if(o+4>size){
		Out.length=c=n=o=data=0;
		for(bits=32;size>>++c;);
		puts(c,14);puts(size^1<<--c,c);
		while(n<size)puts(In[n++],8)
	}
	if(bits<32)for(;data;data<<=8)Out[o++]=data>>>24;
	done&&done(Out,size,o);
	return Out
}
/*decompress
@In: input(Array / Uint8Array)
@done: call back of last process
	done(A,a,z)
	@A: decompressed array of @In
	@a: last position
	@z: decompressed size
@rate: call back of progress
	rate(a,z)
	@a: current position
	@z: last position
@return:
	call with await: decompressed array of @In
	without await: Promise object
*/
async function decompress(In,done,rate){
	function gets(l,n){
		n=data>>>32-l;
		if(l<bits)return l&&(data<<=l,bits-=l,n);
		l-=bits;bits=32-l;
		data=In[a++]<<24|In[a++]<<16|In[a++]<<8|In[a++];
		if(l)n|=data>>>bits,data<<=l;
		return n
	}
	var a=0, o=0, u=0, bits=0, data, rule, codes=0, log2=0, loop;
	const size=In.length, x=rate?32767:-1, fn=b=>setTimeout(c=>b(rate(a,size)),0),
		Out=[], Rule=[], Stack=[], Word=[], T=[];
	if(!size)return done(Out,size,o);
	// read symbols
	for(let e=0,f=gets(1);e<256;f^=1){
		let c=0,d=l2b(256-e);
		for(;c<d&&!gets(1);)++c;
		for(c=c<8?1<<c|gets(c):256;c--;e++)if(f)T[codes++]=e
	}
	loop=gets(5);loop=loop&&gets(--loop)|1<<loop;
	// decoding loop
	if(rule=codes)for(;loop;loop--&x||await new Promise(fn))
	for(let s=0;;)
		if(!s||gets(1)){// known rule/symbol
			for(;codes>>log2;)log2++;
			// CBT code
			let c=gets(log2-1),d=(1<<log2)-codes;
			if(c>=d)c+=c+gets(1)-d;
			// expand rule
			for(Stack[s++]=c;;c=Word[--u])
				if(c<rule){Out[o++]=T[c];if(!u)break}
				else for(d=Rule[c],c=d.length;c;)Word[u++]=d[--c]
		}else{// unknown rule
			if(s<2)break;
			let c=1,d;
			if(s===3)c+=gets(1);
			else if(s>3){
				for(c=0,d=l2b(s-1);c<d&&!gets(1);)c++;
				c=1<<c|gets(c)
			}
			for(d=Rule[codes]=[];d[c]=Stack[--s],c--;);
			Stack[s++]=codes++
	}else for(;o<loop;)Out[o++]=gets(8);
	done&&done(Out,size,o);
	return Out
}

文脈自由文法の作り方

//RePair風
(async()=>{
var A=[1,2,3,11,22,33,11,22,33,1,2,3],
p=new Pairing(A.length,0,false),
output=p.map(A),
size=await p.symbolize(output,false)
})()

//MR-RePair風
(async()=>{
var A=[1,2,3,11,22,33,11,22,33,1,2,3],
p=new Pairing(A.length,0,true),
output=p.map(A),
size=await p.symbolize(output,true)
})()

圧縮と伸長の例

(async()=>{
let A=Array.from("That that is is that that is not is not is that it it is",a=>a.charCodeAt());
let rate=(a,b)=>{console.log((a/b*1e4|0)/100,"%")}, done=(a,b,c)=>{console.log(b,"->",c,a)};
let e=await compress(A,done,rate), d=await decompress(e,done,rate);
console.log(e.length, String.fromCharCode(...d))
})()

codepenでfile処理。BTReePairなどと名乗ってやがります

See the Pen BTReePair by xezz (@xezz) on CodePen.

参考資料

TReePair:昔書いたごみ。C版はJavaScript版より可読性良いかも。MR-RePairを知る前にRePairを強引に書き換えて、規則拡張機能搭載した代物

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0