0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

緩いByte Pair Encoding

Last updated at Posted at 2025-12-21

出現頻度の統計を1回しかとらないByte Pair Encodingを紹介します。原理上高速ですが圧縮率はいまいち。圧縮区間は可変長です(良い圧縮率になりそうな幅を自動計算)。固定長の方が遥かに高速なので、そうしたければ勝手に改造すべし

/*
	BPEncode(A,w)
	@A	:数値配列(0..255)
	@w	:最小区間幅(0-4095)+256
	返値	:数値配列(0..255)

	BPDecode(A)
	@A	:数値配列(0..255)
	返値	:数値配列(0..255)
*/
//encode a block
function bpe(A,i,n,l,B){
	if(n>l)n=l;
	//Generate U and F hash
	for(var a=i,b=0,c,d=256,e,h=255,p,H={},I=[],U=new Uint32Array(d),F=new Uint32Array(d*d);i<n;d=c){
		++U[c=A[i++]];
		if(++F[p=d*256+c]===3)I[b++]=p
	}
	// select BPE marker
	for(d=256;d;i>p&&(i=p,e=d))p=U[--d],p&&h--;
	h<0&&h++;
	//Generate H
	if(b>h)I.sort(function(a,b){return F[b]-F[a]||a-b}),I.length=h;
	for(i=b=0;i<h;H[p]=[b++])
		for(p=I[i++];U[b];)b++;
	//Encode the string
	i=a;d=-1;
	if(B){B[b=B.length]=n-a>>8;
		for(B[++b]=n-a&255,B[++b]=e;i<n;d=c)if(c=A[i++],~d)
			if(h=H[p=d*256+c]){
				//Encode this byte pair
				if(h[1])B[++b]=h[0];
				else B[++b]=h[1]=e,B[++b]=h[0],B[++b]=d,B[++b]=c;
				//Skip ahead by one sibnce the current byte is already encoded
				c=A[i++];if(i===n)B[++b]=c,c^e||(B[++b]=c)
			}else{
				B[++b]=d,d^e||(B[++b]=d);
				if(i===n)B[++b]=c,c^e||(B[++b]=c)
			}
	}else for(b=2;i<n;d=c)if(c=A[i++],~d)
		if(h=H[p=d*256+c])h[1]?++b:h[1]=b+=4,c=A[i++],i<n||(b+=c^e?1:2);
		else++b,d^e||++b,i<n||(b+=c^e?1:2);
	return B?i:b}

// encode all
function BPEncode(A,w){
	w&=4095;w+=256;
	for(var i=0,b,c,l=A.length,n,O=[];i<l;i=bpe(A,i,i+n,l,O))
		for(n=i+w>l?l-i:w;;n*=2){// extend block size
			b=bpe(A,i,i+n,l),c=b+1;
			if(i+n<l)
				b+=bpe(A,i+n,i+n+n,l),
				c=bpe(A,i,i+n+n,l);
			if(b<c||n>65535)break
		}
	return O
}
// decode
function BPDecode(A){
	for(var c,e,i=0,l,o=0,O=[],H=[];l=A[i++]<<8|A[i++];H.length=0)for(e=A[i++],l+=o;o<l;)
		if(H[c=A[i++]])c=H[c],O[o++]=c[0],O[o++]=c[1];
		else if(c^e)O[o++]=c;
		else(c=A[i++])^e?H[c]=[O[o++]=A[i++],O[o++]=A[i++]]:O[o++]=c;
	return O
}
test
let data=Array.from("Peter piper picked a peck of pickled peppers. A peck of pickled peppers Peter Piper picked. If Peter Piper picked a peck of pickled peppers, Where's the peck of pickled peppers Peter Piper picked",a=>a.charCodeAt()),
	e=BPEncode(data), d=BPDecode(e);
console.log(data.length,"->",e.length,"decode",d)

codepen的試作品

See the Pen lite BPE compressor by xezz (@xezz) on CodePen.

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?