Dynamic Markov Coding(DMC)とLZPを組み合わせたfile圧縮programを紹介がてら思い知らせて差し上げる気満々の企画です。
DMCの部分は遷移確率が偏るようにMarkov連鎖の構造を変化させていくわけですが、その肝となるCloningの条件を工夫しています。
記号用とLZPにおける一致flagや一致長は異なるMarkov Modelを使って符号化。ちなみに初期化だけで重労働です。
実装
freehook.js
class FreeHookCompressor{
constructor(){
this.FCT = 1;
this.maxsize = 1 << 26;// 64MB buffer
// 状態変数
this.filesize = 0n;
this.memsize = this.LZ = this.avan = this.profile = this.exeat = 0;
this.flimit1 = this.flimit2 = 0;
//range coder
this.x1 = this.x = this.ubi = 0
this.x2 = 0xffffff;
// profile選択用
this.ri1=this.ri2=this.ri3=this.gl2=this.gl3=
this.gsis1=this.gsis2=this.gsis3=
this.di1=this.di2=this.di3=this.el2=this.el3=
this.esis1=this.esis2=this.mixu=0;
// block用
this.block = null;// Uint8Array(file data block)
this.lzp = null; // Uint32Array(LZP hash table)
// ADMC tree構造のflat配列
this.branchmem =
this.ahead = // [index * 2+0/1]
this.cxt = null; // [index * 2+0/1]
this.wizard = this.addrb = this.maxbranch = this.limbra = this.addbra = 0;
// LZP ADMC tree
this.branchmem2 = this.ahead2 = this.cxt2 = null;
this.wizard2 = this.addrb2 = this.maxbranch2 = this.limbra2 = this.addbra2 = 0
}
// Intel X86 EXE/DLL 用のfilter変換
e8e9transform(n,t){
const A=this.block,view = new DataView(A.buffer),l=n-4;
for(let i = 0,a;i < l;){
a = A[i++];
if(a === 0xe8 || a === 0xe9){
a = view.getInt32(i, true);
if(t){
if(a >= -i && a < n-i)a += i;
else if(a > 0 && a < n)a -= n;
}else if(a < 0){
if(a+i >= 0)a += n
}else if(a < n)a -= i;
view.setInt32(i, a, true);
i += 4
}
}
}
// ADMC 構造の初期化
flush(A,B,cb){
for(let a=cb+1,f=this.FCT,i,j=1<<cb,k,b=j-1,c=b>>1;j--;){
for(i=0;i<c;A[k]=i<<a|j)A[k=i<<a|j<<1]=i+++i<<cb|j,B[k++]=B[k]=f;
for(;i<b;A[k]=i+~c)A[k=i<<a|j<<1]=++i,B[k++]=B[k]=f
}
cb<9?(this.addrb=65279,this.wizard=0):(this.addrb2=1<<20,this.wizard2=0)
}
createbrain(msize){
// C++の構造体幅は2つのpointerと2つのfloat = 16byte(32bit環境)or 24byte(64bit)
// ここでは1要素あたりindex管理としてnode数を算出
const n = Math.floor(msize / 16);
this.ahead = new Int32Array(n * 2);
this.cxt = new Float32Array(n * 2);
this.maxbranch = n-20;
this.limbra = (n / 32)* 24>>>0;
this.addbra = (n-21)/ 32>>>0;
this.flush(this.ahead,this.cxt,8)
}
createbrain2(msize){
const n = Math.floor(msize / 16);
this.ahead2 = new Int32Array(n * 2);
this.cxt2 = new Float32Array(n * 2);
this.maxbranch2 = n-20;
this.limbra2 = (n / 32)* 24>>>0;
this.addbra2 = (n-21)/ 32>>>0;
this.flush(this.ahead2,this.cxt2,10)
}
bp(){
const a=this.wizard2*2,c1=this.cxt2[a+1];
return c1/(c1+this.cxt2[a]);
}
p(){
const a=this.wizard*2,c1=this.cxt[a+1];
return c1/(c1+this.cxt[a]);
}
update(y){
let{cxt,ahead,wizard}=this,next_node = ahead[(wizard*=2)+y]<<1,
n_c0 = cxt[next_node],
n_c1 = cxt[next_node+1],
w_cy = cxt[wizard+y],
w_c0 = cxt[wizard],
w_c1 = cxt[wizard+1],
factor = w_cy/(n_c0+n_c1);
if(this.addrb < this.maxbranch &&n_c0+n_c1>=(w_cy*factor)*(w_c0+w_c1)&&w_cy >=this.flimit1+factor*factor){
const sum = ++this.addrb<<1;
cxt[sum] = cxt[next_node] * factor;
cxt[sum+1] = cxt[next_node+1] * factor;
ahead[sum] = ahead[next_node];
ahead[sum+1] = ahead[next_node+1];
ahead[wizard+y] = sum>>1;
}
if(this.addrb === this.limbra && this.limbra+this.addbra <= this.maxbranch){
this.flimit1 += 0.5;
this.limbra += this.addbra;
}
cxt[wizard+=y]++;
this.wizard = ahead[wizard];
}
pupdate(y){this.wizard = this.ahead[this.wizard * 2+y]}
bupdate(y){
let{cxt2,ahead2,wizard2}=this,next_node = ahead2[(wizard2*=2)+y]<<1;
const n_c0 = cxt2[next_node],
n_c1 = cxt2[next_node+1],
w_cy = cxt2[wizard2+y],
w_c0 = cxt2[wizard2],
w_c1 = cxt2[wizard2+1],
factor = w_cy /(n_c0+n_c1);
if(this.addrb2 < this.maxbranch2 &&(n_c0+n_c1)>=(w_cy * factor)*(w_c0+w_c1)&&w_cy >= this.flimit2+factor * factor){
const sum2 =++this.addrb2<<1;
cxt2[sum2] = cxt2[next_node] * factor;
cxt2[sum2+1] = cxt2[next_node+1] * factor;
ahead2[sum2] = ahead2[next_node];
ahead2[sum2+1] = ahead2[next_node+1];
ahead2[wizard2+y] = sum2>>1
}
if(this.addrb2 === this.limbra2 && this.limbra2+this.addbra2 <= this.maxbranch2){
this.flimit2 += 0.5;
this.limbra2 += this.addbra2;
}
cxt2[wizard2+=y]++;
this.wizard2 = ahead2[wizard2];
}
// file形式の識別と設定
detectf(filename){
const ext = filename.split('.').pop().toLowerCase();
if(this.typec === 1){
let[a,b,c,d]=this.block;
this.profile = this.exeat = 0;
if(a === 66 && b === 77)this.profile = 2;// bmp
else if(b === 73 && c === 70 && d === 70)this.profile = 2;// ttf
else if(a === 77 && b === 90 && c === 144)this.profile = 4;// exe
else if(a === 37 && b === 80 && c === 68 && d === 71)this.profile = 7;// pdf
else if(a === 63 && b === 95 && c === 3)this.profile = 5;// hlp
else if(a === 208 && b === 207 && c === 17)this.profile = 6;// doc
else if(a === 255 && b === 216 && c === 255)this.profile = 8;// jpg
else if(a === 82 && b === 73 && c === 70 && d === 70)this.profile = 11;// wav
if("log"==ext)this.profile = 1;
else if({bmp:1,png:1}[ext])this.profile=2;
else if("dic"==ext)this.profile=3;
else if({exe:1,dll:1}[ext])this.profile=4;
else if("hlp"==ext)this.profile=5;
else if({doc:1,xls:1}[ext])this.profile=6;
else if("pdf"==ext)this.profile=7;
else if({jpg:1,jpeg:1}[ext])this.profile=8;
else if("txt"==ext)this.profile=9;
else if("iso"==ext)this.profile=10;
else if("wav"==ext)this.profile=11;
else if("save"==ext)this.profile=12;
}
// 各profileごとの内部数値を設定
const profiles =[
[1,1,4,7,14,1,1,1,1,1,2,7,14,1,1,2,0], // Default
[2,2,5,7,14,1,1,1,1,1,2,7,14,1,1,4,0], // LOG
[0,0,0,8,16,1,1,1,4,4,4,4,8,1,1,3,0], // BMP
[6,6,6,2,4, 1,1,0,4,4,4,4,8,1,1,2,0], // DIC
[0,0,0,8,16,1,1,1,4,4,4,4,8,1,1,2,1], // EXE
[0,0,0,8,16,1,1,1,0,0,8,4,16,1,1,4,0], // HLP
[0,0,0,8,16,1,1,1,2,2,6,2,12,1,1,4,0], // DOC
[0,0,0,8,16,1,1,1,4,4,4,4,8,1,1,4,0], // PDF
[0,0,0,8,16,1,1,1,4,4,4,4,8,1,1,1,0], // JPG
[4,4,4,4,8, 1,1,0,0,0,8,4,16,1,1,2,0], // TXT
[1,1,4,7,14,1,1,1,0,0,8,4,16,1,1,2,0], // ISO
[6,6,2,6,4, 1,1,1,2,2,6,2,12,1,1,4,0], // WAV
[0,0,0,8,16,1,1,1,1,1,7,2,14,1,1,3,0] // SAVE
];
const p = profiles[this.profile] || profiles[0];
[this.ri1, this.ri2, this.ri3, this.gl2, this.gl3, this.gsis1, this.gsis2, this.gsis3,
this.di1, this.di2, this.el2, this.di3, this.el3, this.esis1, this.esis2, this.mixu, this.exeat] = p;
}
// 圧縮処理
compress(fileBuffer, fileName, memsize = 256, limit = 2, avan = 5){
this.typec = 1;
this.memsize = memsize;
this.LZ = avan>0;
this.avan = avan;
this.flimit1 = this.flimit2 = limit;
const inputData = new Uint8Array(fileBuffer);
this.filesize = BigInt(inputData.length);
// 出力用動的配列
let outChunks = [];
const putc=byte=>{outChunks.push(byte)};
const fwrite=bytes=>{for(let b of bytes)outChunks.push(b)};
// header書き込み
const header = new ArrayBuffer(12);
const view = new DataView(header);
view.setBigUint64(0, this.filesize, true);
view.setUint16(8, this.memsize, true);
view.setUint8(10, limit);
view.setUint8(11, this.avan);
fwrite(new Uint8Array(header));
// file名書き込み
const enc = new TextEncoder;
const nameBytes = enc.encode(fileName);
putc(nameBytes.length);
fwrite(nameBytes);
this.createbrain(this.memsize * 1000000);
this.lzp = this.LZ&&new Uint32Array(1 << 24);
this.x1 = 0;
this.x2 = 0xffffff;
let scanned = 0;
// 算術符号化
const encodeBit =y=>{
let{x1,x2,ubi}=this,m;
if(ubi)m=x1+(x2-x1)*this.bp()>>>0,this.bupdate(y);
else m=x1+(x2-x1)*this.p()>>>0,this.update(y);
if(m<x1)m=x1;
if(m>=x2)m=x2-1;
for(y?x2=m:x1=m+1;!((x1^x2)>>16);x2=x2<<8&0xffffff|255)putc(x2 >> 16),x1=x1<<8&0xffffff;
this.x1=x1;this.x2=x2
};
const cwrite =c=>{
this.ubi = 0;
for(let i = 8;i;)encodeBit(c>>--i&1)
};
let detec = 0;
while(scanned < inputData.length){
let chunkLen = Math.min(this.maxsize, inputData.length-scanned);
this.block = inputData.subarray(scanned, scanned+chunkLen);
if(!detec){
detec = 1;
this.detectf(fileName);
putc(this.profile);
}
if(this.exeat)this.e8e9transform(chunkLen,1);
let pos = 0,umax = 0,ramax = 0;
if(this.LZ){
if(!this.memcre){
this.createbrain2(1 << 25);
this.memcre = 1;
}
this.lzp.fill(0);
while(pos < chunkLen){
let c = this.block[pos];
let rc1=0xffffff&this.gsis1*(this.block[pos-1]>>this.ri1)+(this.gsis2*(this.block[pos-2]>>this.ri2)<<this.gl2)+(this.gsis3*(this.block[pos-3]>>this.ri3)<<this.gl3);
let match;
if(this.lzp[rc1]>this.mixu+2){
let rpos = this.lzp[rc1]-1;
match = true;
for(let m = this.mixu;m;){
if(this.block[rpos-m] !== this.block[pos-m--]){
match = false;break;
}
}
}
if(!match)cwrite(c);
else{
let rposo = this.lzp[rc1]-1;
this.wizard2 = 0xffffff&(this.block[pos-1]>>this.di1)+(this.esis1*(this.block[pos-2]>>this.di2)<<this.el2)+(this.esis2*(this.block[pos-3]>>this.di3)<<this.el3);
m:for(ramax = umax = 0;pos+umax+this.avan <= chunkLen;ramax++,umax += this.avan)
for(let s = 0;s < this.avan;s++)
if(this.block[rposo+umax+s] !== this.block[pos+umax+s])break m;
if(umax){
for(let f = 0;f < umax;)
for(let i = 8,c=this.block[rposo+f++];i;)
this.pupdate(c>>--i&1);
for(let i = this.ubi = 1,rumax = ramax;i < 32;rumax-=1<<i++){
encodeBit(1);
if(rumax <=1<<i){
for(encodeBit(0);i;)encodeBit(rumax-1>>--i&1);
break
}
}umax--
}else this.ubi=1,encodeBit(0),cwrite(c)
}
this.lzp[rc1] = ++pos;
pos += umax;
umax = ramax = 0;
}
}else while(pos < chunkLen)cwrite(this.block[pos++]);
scanned += chunkLen
}
for(let i = 3;i;)putc(this.x2>>--i*8&255);
return new Uint8Array(outChunks).buffer;
}
// 展開処理用(圧縮済み ArrayBuffer を入力とし、元fileをobjectで返す)
decompress(compressedBuffer){
this.typec = 0;
const inBytes = new Uint8Array(compressedBuffer);
let inPtr = 0;
const getc =()=> inBytes[inPtr++];
const fread =len=>inBytes.subarray(inPtr,inPtr+=len);
// header読み込み
const headerView = new DataView(fread(12).buffer);
this.filesize = headerView.getBigUint64(0, true);
this.memsize = headerView.getUint16(8, true);
this.flimit1 = this.flimit2 = headerView.getUint8(10);
this.avan = headerView.getUint8(11);
this.LZ=this.avan>0;
// file名復元
const nameLen = getc();
const nameBytes = fread(nameLen);
const dec = new TextDecoder;
const outFileName = dec.decode(nameBytes);
this.profile = getc();
this.detectf(outFileName);
this.createbrain(this.memsize * 1000000);
this.x1 = 0;
this.x2 = 0xffffff;
this.x = 0;
for(let i=3;i--;)this.x=this.x<<8|getc();
const decodeBit=()=>{
let{x1,x2,x,ubi}=this,m;
m=ubi?x1+(x2-x1)*this.bp()>>>0:x1+(x2-x1)*this.p()>>>0;
if(m<x1)m=x1;
if(m>=x2)m=x2-1;
let y=0|x<=m;
ubi?this.bupdate(y):this.update(y);
for(y?x2=m:x1=m+1;!((x1^x2)>>16);x2=x2<<8&0xffffff|255){
x1=x1<<8&0xffffff;
x=x<<8&0xffffff|getc()
}this.x1=x1;this.x2=x2;this.x=x;
return y
};
let outBuffer = new Uint8Array(Number(this.filesize)),outPtr = 0;
const cread =(chunkBuf, pos)=>{
let c = this.ubi = 0,i = 8;
for(;i--;)c+=c+decodeBit();
chunkBuf[pos] = c;
};
let scanned = 0;
this.lzp = this.LZ&&new Uint32Array(1 << 24);
while(scanned < outBuffer.length){
let chunkLen = Math.min(this.maxsize, outBuffer.length-scanned);
let chunkBuf = outBuffer.subarray(scanned);
if(this.LZ){
if(!this.memcre){
this.createbrain2(1 << 25);
this.memcre = 1;
}
this.lzp.fill(0);
let pos = 0,umax = 0;
while(pos < chunkLen){
let rc1 =(this.gsis1 *(chunkBuf[pos-1] >> this.ri1))+
((this.gsis2 *(chunkBuf[pos-2] >> this.ri2))<< this.gl2)+
((this.gsis3 *(chunkBuf[pos-3] >> this.ri3))<< this.gl3);
rc1 &= 0xffffff;
let match;
if(this.lzp[rc1]>this.mixu+2){
let rpos = this.lzp[rc1]-1;
match=1;
for(let m = this.mixu;m;){
if(chunkBuf[rpos-m] !== chunkBuf[pos-m--]){
match = 0;break
}
}
}
if(!match)cread(chunkBuf, pos);
else{
this.wizard2=0xffffff&(chunkBuf[pos-1]>>this.di1)+(this.esis1*(chunkBuf[pos-2]>>this.di2)<<this.el2)+(this.esis2*(chunkBuf[pos-3]>>this.di3)<<this.el3);
let ramax = 0,i = 0;
for(this.ubi = 1;decodeBit();)ramax +=1 << i++;
if(i){
let c = 0;
for(;i--;)c+=c+decodeBit();
ramax += c
}
if(ramax){
umax = ramax * this.avan-1;
for(let l = 0,rposo = this.lzp[rc1]-1;l <= umax;l++)
for(let i=8,c=chunkBuf[pos+l] = chunkBuf[rposo+l];i;)this.pupdate(c>>--i&1);
}else cread(chunkBuf, pos)
}
this.lzp[rc1] = ++pos;
pos += umax;umax = 0
}
}else for(let pos = 0;pos < chunkLen;)cread(chunkBuf, pos++);
this.block = chunkBuf;
if(this.exeat)this.e8e9transform(chunkLen);
scanned += chunkLen
}
return{fileName:outFileName,fileData:outBuffer.buffer}
}
}
GUI部分
<!DOCTYPE html><html>
<head>
<meta charset=utf8>
<title>Advanced DMC+LZP</title><script src="freehook.js"></script>
</head>
<body>
<h2>Advanced DMC + LZP</h2>
<fieldset>
<legend>圧縮</legend>
<input type=file id=f0>
<button id=pack>file圧縮</button><br>
memory<input id=mem value=32 size=3 title="0~255">
limit<input id=lim value=2 size=2 title="0~255">
LZstep<input id=lz value=4 size=3 title="0~255">
</fieldset>
<fieldset>
<legend>展開</legend>
<input type=file id=f1>
<button id=undo>file展開</button>
</fieldset>
<input type=text id=info style="width:100%;border:0">
<p>変動費としてmemory * 100万bytes程度の空間消費、limitはDMCにおけるcloning条件を満たす閾値。高いほどcloningしにくい。LZstepはLZPにおける一致単位。0だとLZP無効、大きいとLZPが効きにくくなる
<script>
// 圧縮実行
pack.onclick=async()=>{
if(!f0.files.length) return info.value='fileを選択しやがれ';
let file = f0.files[0], codec = new FreeHookCompressor, b=await file.arrayBuffer();
info.value="圧縮中...";
// 引数: buffer, file名, memsize(MB), limit, avan
const compressed = codec.compress(b, file.name, mem.value&255, lim.value&255,lz.value&255);
info.value=b.byteLength+" -> "+compressed.byteLength;
f0.value="";
// download処理
const a = document.createElement('a');
a.download = file.name + '.fh';
URL.revokeObjectURL(a.href=URL.createObjectURL(new Blob([compressed])),a.click())
};
// 展開実行
undo.onclick=async()=>{
if (!f1.files.length) return info.value='fileを選択せんかい';
const file = f1.files[0], b = await file.arrayBuffer(), codec = new FreeHookCompressor;
info.value="展開中...";
const result = codec.decompress(b);
info.value=b.byteLength+" -> "+result.fileData.byteLength;
f1.value="";
// download処理
const a = document.createElement('a');
a.download = result.fileName;
URL.revokeObjectURL(a.href=URL.createObjectURL(new Blob([result.fileData])),a.click())
}
</script>
動作検証
See the Pen freehook codec by xezz (@xezz) on CodePen.
蛇足
元ねたはMichael Eugene Ortmann氏のC言語版freehook v0.3です。作者のweb pageはとおっくの昔に消滅(残骸)、原作は入手困難。