1
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

BWT for Move to Front + Frequency Count

1
Last updated at Posted at 2026-03-08

圧縮の前処理であるBWTの後処理で更に圧縮しやすくする手法を紹介します。
有名なのはMove to Frontです。それだけでも記号頻度が小さい値に偏りまくりますが、頻度表(Frequency Count)を加えて更に偏らせようという作戦です。
ついでに直前の1文字に応じて順位表と頻度表を切り替えていきます。order1 MTF+FCといった所でしょうか。
欠点は短い文字列には効果薄めで、しかも計算量が莫大になる事です。memory消費もチョッピリオーメ(^-^)

実装編

JavaScriptなのです。

//初期化配列もどき
function All1(){}
function All0(){}
function N0_255(){}
//既定値設定
(function(o,p,q,r){for(;o;p[o]=1)q[r[--o]=o]=0})(256,All1.prototype,All0.prototype,N0_255.prototype);

/*変換/逆変換関数
@A: 変換元(array of byte)
@d: falsyなら変換、さもなければ逆変換(復号)
@return: 配列(array of byte)
*/
function MTFC(A,d){
	var a=0,c,C,i=0,j,o=0,rank,R,s=0,t,z=A.length,max,max1,O=[],
		S=0,R1=0,R2=0,R3=0,R4=0,
		nA=0,Mc=0,mc=256,sum=0,sum1=mc,
		Ms2r=new N0_255,Mr2s=new N0_255,Fs2r=new N0_255,Fr2s=new N0_255,
		SRC=[],SSC=[],SC=new All1,RC=new All1,SST=new All0,Freq=[];
	//順位表と頻度表設定
	for(;i<mc;sum+=Freq[i]=SRC[i][i]+SC[i]*RC[i++])
		SRC[i]=new All0,SSC[i]=new All0;
	//main loop
	for(;a<z;){
		c=Fs2r[s=A[a++]];
		if(d)s=c=Fr2s[s];
		O[o++]=c,rank=Ms2r[s];
		if(nA<=rank){
			if(++nA<5)max=1024,max1=832;
			else if(nA<33)max=2048,max1=416;
			else if(nA<129)max=4096,max1=832;
			else max=4096,max1=1024;
			if(s<mc)mc=s;
			if(Mc<=s)Mc=s+1
		}
		//頻度更新
		SC[s]+=3,sum1+=3,RC[i=rank]++,SRC[s][rank]+=3;
		SSC[S][s]+=t=rank?6:4;
		for(SST[S]+=t;0<i;Ms2r[Mr2s[i]=t]=i--)
			sum-=Freq[t=Mr2s[i-1]]-(Freq[t]=SRC[t][i]+SC[t]*RC[i]);
		sum-=Freq[s]-(Freq[s]=SRC[s][0]+SC[s]*RC[Ms2r[Mr2s[0]=s]=0]);

        C=SSC[S=s],R=rank;
		if(R<R1)R=R1;if(R<R2)R=R2;
		if(R<R3)R=R3;if(R<R4)R=R4;
		// 最近の最も大きい順位Rまで整列
		for(i=-1;i<R;Fs2r[Fr2s[j]=t]=j){
			c=Freq[t=Mr2s[++i]]+C[t];
			for(j=Fs2r[t];0<j&&Freq[Fr2s[j-1]]+C[Fr2s[j-1]]<c;)
				Fs2r[Fr2s[j]=Fr2s[j-1]]=j--}
		R4=R3,R3=R2,R2=R1,R1=rank;
		if(1<rank)c=max<=sum||514<sum1||40<RC[0]+RC[1],t=max1;
		else c=max<<4<=sum,t=max1<<4;
		//記号と順位の頻度削減
		if(c)for(i=mc;i<Mc;sum-=Freq[i]-(Freq[i]=R[j]+(RC[j]-=RC[j]>>1)*SC[i++]))
				R=SRC[i],c=R[j=Ms2r[i]]+1,
				R[j]=(c>>1)+(c>>2)+(c>>3)-(c>>5<<1)-(c>>6<<1)-(c>>7),
				sum1-=SC[i]-(SC[i]-=SC[i]>>1);
		//order1の頻度削減
		if(t<=SST[s])
			for(C=SSC[s],i=nA;SST[s]-=(c=C[t=Mr2s[--i]])-(C[t]=(c+1>>1)+(c>>2)+(c>>3)-(c>>4)-(c>>5)),i;);
	}return O
}
test
function BWT(T,U,n){
	for(var a=n=n||T.length,i,S=U||new Uint8Array(n),I=new Uint32Array(n);a;)I[--a]=a;
	I.sort(function(a,b,c,d){for(c=n;c--;)if(d=T[a++%n]-T[b++%n])return d});
	for(;a<n;S[a++]=T[--i])(i=I[a])||(S.i=a,i=n);
	return S
}
let txt=Array.from("That that is is that that is not is not is that it it is",a=>a.charCodeAt()),
	bwt=BWT(txt),
	enc=MTFC(bwt), dec=MTFC(enc,1);
document.write("enc ",enc,"<hr>dec ",dec,"<hr>bwt ",bwt)

benchmark

それではBWT+MTF/MTFCによる圧縮限界値(各記号の出現確率から求まる平均符号長の下限値 x 記号総数)はどうなるのか? Canterbury Corpusのfileで試してみます

file MTF MTFC
alice29.txt 48679 45797
asyoulik.txt 44647 42303
fields.c 3091 3084
lcet10.txt 125002 117746
plrabn12.txt 167184 156058

巨大fileになるほど差が顕著です

参考文献

本家本元C言語版

1
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?