0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

圧縮とかDMC+LZP

0
Posted at

Dynamic Markov Coding(DMC)とLZPを組み合わせたfile圧縮programを紹介がてら思い知らせて差し上げる気満々の企画です。
DMCの部分は遷移確率が偏るようにMarkov連鎖の構造を変化させていくわけですが、その肝となるCloningの条件を工夫しています。
記号用とLZPにおける一致flagや一致長は異なるMarkov Modelを使って符号化。ちなみに初期化だけで重労働です。

実装

freehook.js
class FreeHookCompressor{
	constructor(){
		this.FCT = 1;
		this.maxsize = 1 << 26;// 64MB buffer
		// 状態変数
		this.filesize = 0n;
		this.memsize = this.LZ = this.avan = this.profile = this.exeat = 0;
		this.flimit1 = this.flimit2 = 0;
		//range coder
		this.x1 = this.x = this.ubi = 0
		this.x2 = 0xffffff;
		// profile選択用
		this.ri1=this.ri2=this.ri3=this.gl2=this.gl3=
		this.gsis1=this.gsis2=this.gsis3=
		this.di1=this.di2=this.di3=this.el2=this.el3=
		this.esis1=this.esis2=this.mixu=0;
		// block用
		this.block = null;// Uint8Array(file data block)
		this.lzp = null;   // Uint32Array(LZP hash table)
		// ADMC tree構造のflat配列
		this.branchmem = 
		this.ahead =  // [index * 2+0/1]
		this.cxt = null;  // [index * 2+0/1]
		this.wizard = this.addrb = this.maxbranch = this.limbra = this.addbra = 0;
		// LZP ADMC tree
		this.branchmem2 = this.ahead2 = this.cxt2 = null;
		this.wizard2 = this.addrb2 = this.maxbranch2 = this.limbra2 = this.addbra2 = 0
	}
	// Intel X86 EXE/DLL 用のfilter変換
	e8e9transform(n,t){
		const A=this.block,view = new DataView(A.buffer),l=n-4;
		for(let i = 0,a;i < l;){
			a = A[i++];
			if(a === 0xe8 || a === 0xe9){
				a = view.getInt32(i, true);
				if(t){
					if(a >= -i && a < n-i)a += i;
					else if(a > 0 && a < n)a -= n;
				}else if(a < 0){
					if(a+i >= 0)a += n
				}else if(a < n)a -= i;
				view.setInt32(i, a, true);
				i += 4
			}
		}
	}
	// ADMC 構造の初期化
	flush(A,B,cb){
		for(let a=cb+1,f=this.FCT,i,j=1<<cb,k,b=j-1,c=b>>1;j--;){
			for(i=0;i<c;A[k]=i<<a|j)A[k=i<<a|j<<1]=i+++i<<cb|j,B[k++]=B[k]=f;
			for(;i<b;A[k]=i+~c)A[k=i<<a|j<<1]=++i,B[k++]=B[k]=f
		}
		cb<9?(this.addrb=65279,this.wizard=0):(this.addrb2=1<<20,this.wizard2=0)
	}
	createbrain(msize){
		// C++の構造体幅は2つのpointerと2つのfloat = 16byte(32bit環境)or 24byte(64bit)
		// ここでは1要素あたりindex管理としてnode数を算出
		const n = Math.floor(msize / 16);
		this.ahead = new Int32Array(n * 2);
		this.cxt = new Float32Array(n * 2);
		this.maxbranch = n-20;
		this.limbra = (n / 32)* 24>>>0;
		this.addbra = (n-21)/ 32>>>0;
		this.flush(this.ahead,this.cxt,8)
	}
	createbrain2(msize){
		const n = Math.floor(msize / 16);
		this.ahead2 = new Int32Array(n * 2);
		this.cxt2 = new Float32Array(n * 2);
		this.maxbranch2 = n-20;
		this.limbra2 = (n / 32)* 24>>>0;
		this.addbra2 = (n-21)/ 32>>>0;
		this.flush(this.ahead2,this.cxt2,10)
	}
	bp(){
		const a=this.wizard2*2,c1=this.cxt2[a+1];
		return c1/(c1+this.cxt2[a]);
	}
	p(){
		const a=this.wizard*2,c1=this.cxt[a+1];
		return c1/(c1+this.cxt[a]);
	}
	update(y){
		let{cxt,ahead,wizard}=this,next_node = ahead[(wizard*=2)+y]<<1,
			n_c0 = cxt[next_node],
			n_c1 = cxt[next_node+1],
			w_cy = cxt[wizard+y],
			w_c0 = cxt[wizard],
			w_c1 = cxt[wizard+1],
			factor = w_cy/(n_c0+n_c1);
		if(this.addrb < this.maxbranch &&n_c0+n_c1>=(w_cy*factor)*(w_c0+w_c1)&&w_cy >=this.flimit1+factor*factor){
			const sum = ++this.addrb<<1;
			cxt[sum] = cxt[next_node] * factor;
			cxt[sum+1] = cxt[next_node+1] * factor;
			ahead[sum] = ahead[next_node];
			ahead[sum+1] = ahead[next_node+1];
			ahead[wizard+y] = sum>>1;
		}
		if(this.addrb === this.limbra && this.limbra+this.addbra <= this.maxbranch){
			this.flimit1 += 0.5;
			this.limbra += this.addbra;
		}
		cxt[wizard+=y]++;
		this.wizard = ahead[wizard];
	}
	pupdate(y){this.wizard = this.ahead[this.wizard * 2+y]}
	bupdate(y){
		let{cxt2,ahead2,wizard2}=this,next_node = ahead2[(wizard2*=2)+y]<<1;
		const n_c0 = cxt2[next_node],
			n_c1 = cxt2[next_node+1],
			w_cy = cxt2[wizard2+y],
			w_c0 = cxt2[wizard2],
			w_c1 = cxt2[wizard2+1],
			factor = w_cy /(n_c0+n_c1);
		if(this.addrb2 < this.maxbranch2 &&(n_c0+n_c1)>=(w_cy * factor)*(w_c0+w_c1)&&w_cy >= this.flimit2+factor * factor){
			const sum2 =++this.addrb2<<1;
			cxt2[sum2] = cxt2[next_node] * factor;
			cxt2[sum2+1] = cxt2[next_node+1] * factor;
			ahead2[sum2] = ahead2[next_node];
			ahead2[sum2+1] = ahead2[next_node+1];
			ahead2[wizard2+y] = sum2>>1
		}
		if(this.addrb2 === this.limbra2 && this.limbra2+this.addbra2 <= this.maxbranch2){
			this.flimit2 += 0.5;
			this.limbra2 += this.addbra2;
		}
		cxt2[wizard2+=y]++;
		this.wizard2 = ahead2[wizard2];
	}
	// file形式の識別と設定
	detectf(filename){
		const ext = filename.split('.').pop().toLowerCase();
		if(this.typec === 1){
			let[a,b,c,d]=this.block;
			this.profile = this.exeat = 0;
			if(a === 66 && b === 77)this.profile = 2;// bmp
			else if(b === 73 && c === 70 && d === 70)this.profile = 2;// ttf
			else if(a === 77 && b === 90 && c === 144)this.profile = 4;// exe
			else if(a === 37 && b === 80 && c === 68 && d === 71)this.profile = 7;// pdf
			else if(a === 63 && b === 95 && c === 3)this.profile = 5;// hlp
			else if(a === 208 && b === 207 && c === 17)this.profile = 6;// doc
			else if(a === 255 && b === 216 && c === 255)this.profile = 8;// jpg
			else if(a === 82 && b === 73 && c === 70 && d === 70)this.profile = 11;// wav
			if("log"==ext)this.profile = 1;
			else if({bmp:1,png:1}[ext])this.profile=2;
			else if("dic"==ext)this.profile=3;
			else if({exe:1,dll:1}[ext])this.profile=4;
			else if("hlp"==ext)this.profile=5;
			else if({doc:1,xls:1}[ext])this.profile=6;
			else if("pdf"==ext)this.profile=7;
			else if({jpg:1,jpeg:1}[ext])this.profile=8;
			else if("txt"==ext)this.profile=9;
			else if("iso"==ext)this.profile=10;
			else if("wav"==ext)this.profile=11;
			else if("save"==ext)this.profile=12;
		}
		// 各profileごとの内部数値を設定
		const profiles =[
			[1,1,4,7,14,1,1,1,1,1,2,7,14,1,1,2,0], // Default
			[2,2,5,7,14,1,1,1,1,1,2,7,14,1,1,4,0], // LOG
			[0,0,0,8,16,1,1,1,4,4,4,4,8,1,1,3,0],  // BMP
			[6,6,6,2,4, 1,1,0,4,4,4,4,8,1,1,2,0],  // DIC
			[0,0,0,8,16,1,1,1,4,4,4,4,8,1,1,2,1],  // EXE
			[0,0,0,8,16,1,1,1,0,0,8,4,16,1,1,4,0], // HLP
			[0,0,0,8,16,1,1,1,2,2,6,2,12,1,1,4,0], // DOC
			[0,0,0,8,16,1,1,1,4,4,4,4,8,1,1,4,0],  // PDF
			[0,0,0,8,16,1,1,1,4,4,4,4,8,1,1,1,0],  // JPG
			[4,4,4,4,8, 1,1,0,0,0,8,4,16,1,1,2,0], // TXT
			[1,1,4,7,14,1,1,1,0,0,8,4,16,1,1,2,0], // ISO
			[6,6,2,6,4, 1,1,1,2,2,6,2,12,1,1,4,0], // WAV
			[0,0,0,8,16,1,1,1,1,1,7,2,14,1,1,3,0]  // SAVE
		];
		const p = profiles[this.profile] || profiles[0];
		[this.ri1, this.ri2, this.ri3, this.gl2, this.gl3, this.gsis1, this.gsis2, this.gsis3,
		 this.di1, this.di2, this.el2, this.di3, this.el3, this.esis1, this.esis2, this.mixu, this.exeat] = p;
	}
	// 圧縮処理
	compress(fileBuffer, fileName, memsize = 256, limit = 2, avan = 5){
		this.typec = 1;
		this.memsize = memsize;
		this.LZ = avan>0;
		this.avan = avan;
		this.flimit1 = this.flimit2 = limit;
		const inputData = new Uint8Array(fileBuffer);
		this.filesize = BigInt(inputData.length);
		// 出力用動的配列
		let outChunks = [];
		const putc=byte=>{outChunks.push(byte)};
		const fwrite=bytes=>{for(let b of bytes)outChunks.push(b)};
		// header書き込み
		const header = new ArrayBuffer(12);
		const view = new DataView(header);
		view.setBigUint64(0, this.filesize, true);
		view.setUint16(8, this.memsize, true);
		view.setUint8(10, limit);
		view.setUint8(11, this.avan);
		fwrite(new Uint8Array(header));
		// file名書き込み
		const enc = new TextEncoder;
		const nameBytes = enc.encode(fileName);
		putc(nameBytes.length);
		fwrite(nameBytes);
		this.createbrain(this.memsize * 1000000);
		this.lzp = this.LZ&&new Uint32Array(1 << 24);
		this.x1 = 0;
		this.x2 = 0xffffff;
		let scanned = 0;
		// 算術符号化
		const encodeBit =y=>{
			let{x1,x2,ubi}=this,m;
			if(ubi)m=x1+(x2-x1)*this.bp()>>>0,this.bupdate(y);
			else m=x1+(x2-x1)*this.p()>>>0,this.update(y);
			if(m<x1)m=x1;
			if(m>=x2)m=x2-1;
			for(y?x2=m:x1=m+1;!((x1^x2)>>16);x2=x2<<8&0xffffff|255)putc(x2 >> 16),x1=x1<<8&0xffffff;
			this.x1=x1;this.x2=x2
		};
		const cwrite =c=>{
			this.ubi = 0;
			for(let i = 8;i;)encodeBit(c>>--i&1)
		};
		let detec = 0;
		while(scanned < inputData.length){
			let chunkLen = Math.min(this.maxsize, inputData.length-scanned);
			this.block = inputData.subarray(scanned, scanned+chunkLen);
			if(!detec){
				detec = 1;
				this.detectf(fileName);
				putc(this.profile);
			}
			if(this.exeat)this.e8e9transform(chunkLen,1);
			let pos = 0,umax = 0,ramax = 0;
			if(this.LZ){
				if(!this.memcre){
					this.createbrain2(1 << 25);
					this.memcre = 1;
				}
				this.lzp.fill(0);
				while(pos < chunkLen){
					let c = this.block[pos];
					let rc1=0xffffff&this.gsis1*(this.block[pos-1]>>this.ri1)+(this.gsis2*(this.block[pos-2]>>this.ri2)<<this.gl2)+(this.gsis3*(this.block[pos-3]>>this.ri3)<<this.gl3);
					let match;
					if(this.lzp[rc1]>this.mixu+2){
						let rpos = this.lzp[rc1]-1;
						match = true;
						for(let m = this.mixu;m;){
							if(this.block[rpos-m] !== this.block[pos-m--]){
								match = false;break;
							}
						}
					}
					if(!match)cwrite(c);
					else{
						let rposo = this.lzp[rc1]-1;
						this.wizard2 = 0xffffff&(this.block[pos-1]>>this.di1)+(this.esis1*(this.block[pos-2]>>this.di2)<<this.el2)+(this.esis2*(this.block[pos-3]>>this.di3)<<this.el3);
						m:for(ramax = umax = 0;pos+umax+this.avan <= chunkLen;ramax++,umax += this.avan)
							for(let s = 0;s < this.avan;s++)
								if(this.block[rposo+umax+s] !== this.block[pos+umax+s])break m;
						if(umax){
							for(let f = 0;f < umax;)
								for(let i = 8,c=this.block[rposo+f++];i;)
									this.pupdate(c>>--i&1);
							for(let i = this.ubi = 1,rumax = ramax;i < 32;rumax-=1<<i++){
								encodeBit(1);
								if(rumax <=1<<i){
									for(encodeBit(0);i;)encodeBit(rumax-1>>--i&1);
									break
								}
							}umax--
						}else this.ubi=1,encodeBit(0),cwrite(c)
					}
					this.lzp[rc1] = ++pos;
					pos += umax;
					umax = ramax = 0;
				}
			}else while(pos < chunkLen)cwrite(this.block[pos++]);
			scanned += chunkLen
		}
		for(let i = 3;i;)putc(this.x2>>--i*8&255);
		return new Uint8Array(outChunks).buffer;
	}
	// 展開処理用(圧縮済み ArrayBuffer を入力とし、元fileをobjectで返す)
	decompress(compressedBuffer){
		this.typec = 0;
		const inBytes = new Uint8Array(compressedBuffer);
		let inPtr = 0;
		const getc =()=> inBytes[inPtr++];
		const fread =len=>inBytes.subarray(inPtr,inPtr+=len);
		// header読み込み
		const headerView = new DataView(fread(12).buffer);
		this.filesize = headerView.getBigUint64(0, true);
		this.memsize = headerView.getUint16(8, true);
		this.flimit1 = this.flimit2 = headerView.getUint8(10);
		this.avan = headerView.getUint8(11);
		this.LZ=this.avan>0;
		// file名復元
		const nameLen = getc();
		const nameBytes = fread(nameLen);
		const dec = new TextDecoder;
		const outFileName = dec.decode(nameBytes);
		this.profile = getc();
		this.detectf(outFileName);
		this.createbrain(this.memsize * 1000000);
		this.x1 = 0;
		this.x2 = 0xffffff;
		this.x = 0;
		for(let i=3;i--;)this.x=this.x<<8|getc();
		const decodeBit=()=>{
			let{x1,x2,x,ubi}=this,m;
			m=ubi?x1+(x2-x1)*this.bp()>>>0:x1+(x2-x1)*this.p()>>>0;
			if(m<x1)m=x1;
			if(m>=x2)m=x2-1;
			let y=0|x<=m;
			ubi?this.bupdate(y):this.update(y);
			for(y?x2=m:x1=m+1;!((x1^x2)>>16);x2=x2<<8&0xffffff|255){
				x1=x1<<8&0xffffff;
				x=x<<8&0xffffff|getc()
			}this.x1=x1;this.x2=x2;this.x=x;
			return y
		};
		let outBuffer = new Uint8Array(Number(this.filesize)),outPtr = 0;
		const cread =(chunkBuf, pos)=>{
			let c = this.ubi = 0,i = 8;
			for(;i--;)c+=c+decodeBit();
			chunkBuf[pos] = c;
		};
		let scanned = 0;
		this.lzp = this.LZ&&new Uint32Array(1 << 24);
		while(scanned < outBuffer.length){
			let chunkLen = Math.min(this.maxsize, outBuffer.length-scanned);
			let chunkBuf = outBuffer.subarray(scanned);
			if(this.LZ){
				if(!this.memcre){
					this.createbrain2(1 << 25);
					this.memcre = 1;
				}
				this.lzp.fill(0);
				let pos = 0,umax = 0;
				while(pos < chunkLen){
					let rc1 =(this.gsis1 *(chunkBuf[pos-1] >> this.ri1))+
							 ((this.gsis2 *(chunkBuf[pos-2] >> this.ri2))<< this.gl2)+
							 ((this.gsis3 *(chunkBuf[pos-3] >> this.ri3))<< this.gl3);
					rc1 &= 0xffffff;
					let match;
					if(this.lzp[rc1]>this.mixu+2){
						let rpos = this.lzp[rc1]-1;
						match=1;
						for(let m = this.mixu;m;){
							if(chunkBuf[rpos-m] !== chunkBuf[pos-m--]){
								match = 0;break
							}
						}
					}
					if(!match)cread(chunkBuf, pos);
					else{
						this.wizard2=0xffffff&(chunkBuf[pos-1]>>this.di1)+(this.esis1*(chunkBuf[pos-2]>>this.di2)<<this.el2)+(this.esis2*(chunkBuf[pos-3]>>this.di3)<<this.el3);
						let ramax = 0,i = 0;
						for(this.ubi = 1;decodeBit();)ramax +=1 << i++;
						if(i){
							let c = 0;
							for(;i--;)c+=c+decodeBit();
							ramax += c
						}
						if(ramax){
							umax = ramax * this.avan-1;
							for(let l = 0,rposo = this.lzp[rc1]-1;l <= umax;l++)
								for(let i=8,c=chunkBuf[pos+l] = chunkBuf[rposo+l];i;)this.pupdate(c>>--i&1);
						}else cread(chunkBuf, pos)
					}
					this.lzp[rc1] = ++pos;
					pos += umax;umax = 0
				}
			}else for(let pos = 0;pos < chunkLen;)cread(chunkBuf, pos++);
			this.block = chunkBuf;
			if(this.exeat)this.e8e9transform(chunkLen);
			scanned += chunkLen
		}
		return{fileName:outFileName,fileData:outBuffer.buffer}
	}
}
GUI部分
<!DOCTYPE html><html>
<head>
	<meta charset=utf8>
	<title>Advanced DMC+LZP</title><script src="freehook.js"></script>
</head>
<body>
<h2>Advanced DMC + LZP</h2>
<fieldset>
	<legend>圧縮</legend>
	<input type=file id=f0>
	<button id=pack>file圧縮</button><br>
	memory<input id=mem value=32 size=3 title="0~255">
	limit<input id=lim value=2 size=2 title="0~255">
	LZstep<input id=lz value=4 size=3 title="0~255">
	
</fieldset>
<fieldset>
	<legend>展開</legend>
	<input type=file id=f1>
	<button id=undo>file展開</button>
</fieldset>
<input type=text id=info style="width:100%;border:0">
<p>変動費としてmemory * 100万bytes程度の空間消費、limitはDMCにおけるcloning条件を満たす閾値。高いほどcloningしにくい。LZstepはLZPにおける一致単位。0だとLZP無効、大きいとLZPが効きにくくなる
<script>
// 圧縮実行
pack.onclick=async()=>{
	if(!f0.files.length) return info.value='fileを選択しやがれ';
	let file = f0.files[0], codec = new FreeHookCompressor, b=await file.arrayBuffer();
	info.value="圧縮中...";
	// 引数: buffer, file名, memsize(MB), limit, avan
	const compressed = codec.compress(b, file.name, mem.value&255, lim.value&255,lz.value&255);
	info.value=b.byteLength+" -> "+compressed.byteLength;
	f0.value="";
	// download処理
	const a = document.createElement('a');
	a.download = file.name + '.fh';
	URL.revokeObjectURL(a.href=URL.createObjectURL(new Blob([compressed])),a.click())
};
// 展開実行
undo.onclick=async()=>{
	if (!f1.files.length) return info.value='fileを選択せんかい';
	const file = f1.files[0], b = await file.arrayBuffer(), codec = new FreeHookCompressor;
	info.value="展開中...";
	const result = codec.decompress(b);
	info.value=b.byteLength+" -> "+result.fileData.byteLength;
	f1.value="";
	// download処理
	const a = document.createElement('a');
	a.download = result.fileName;
	URL.revokeObjectURL(a.href=URL.createObjectURL(new Blob([result.fileData])),a.click())
}
</script>

動作検証

See the Pen freehook codec by xezz (@xezz) on CodePen.

蛇足

元ねたはMichael Eugene Ortmann氏のC言語版freehook v0.3です。作者のweb pageはとおっくの昔に消滅(残骸)、原作は入手困難。

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?