This article introduces a longest-first technique for constructing a context-free grammar on top of an enhanced suffix array. Longest-first means that the length of the shared substrings takes priority: the longest repeated substrings are turned into rules first. Something called cfg-esa is lying around on GitHub, so use it as a reference.
Using JavaScript, I will show what kind of context-free grammar this approach builds.
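As a rough illustration of the longest-first idea, here is a naive sketch (a hypothetical naiveLongestFirst helper of my own, not the suffix-array based algorithm below): repeatedly take the longest substring that still occurs a second time without overlapping, and replace every occurrence with a fresh nonterminal.
function naiveLongestFirst(s){
 var rules=[];
 for(;;){
  var name="[R"+(rules.length+1)+"]",best="";
  //find the longest substring that has a later, non-overlapping occurrence
  //(and is longer than the nonterminal name, so the text keeps shrinking)
  for(var len=s.length>>1;len>name.length&&!best;len--)
   for(var i=0;i+len<=s.length;i++){
    var sub=s.slice(i,i+len);
    if(s.indexOf(sub,i+len)>=0){best=sub;break}
   }
  if(!best)break;
  rules.push(name+":"+best);
  s=s.split(best).join(name)
 }
 return [s].concat(rules)
}
console.log(naiveLongestFirst("That that is is that that is not is not is that it it is").join("\n"))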
Implementation
First we summon the mighty SAIS to build the enhanced suffix array.
////////////////////////
// Induced Sorting(SAIS)
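// sais(SA,A,n,k) fills SA (an Int32Array of length n) with the suffix array of A,
// whose symbols must lie in 0..k-1; getBuckets and induceLS are its helpers.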
function getBuckets(B,C,k){
for(var i=0,s=0;i<k;)B[i]=s+=C[i++];
}
function induceLS(SA,A,T,C,B,n,k){
for(var i=0,j=0;i<k;j+=C[i++])B[i]=j;
for(i=0,SA[B[A[n-1]]++]=n-1;i<n;j<1||T[--j]||(SA[B[A[j]]++]=j))j=SA[i++];
for(getBuckets(B,C,k);i;j<1||T[--j]&&(SA[--B[A[j]]]=j))j=SA[--i]
}
function sais(SA,A,n,k){
for(var i=n,j=n-1,l=j,m=n-2,o,p,n1=0,d,B=new Uint32Array(k),C=new Uint32Array(k),T=new Uint8Array(n);i;SA[i]=-1)++C[p=A[--i]],T[i]=p<A[i+1]||p===A[i+1]&&T[i+1];
for(getBuckets(B,C,k);j;)if(T[--j]&&!T[j-1])SA[--B[A[j]]]=j;
induceLS(SA,A,T,C,B,n,k);
for(;j<n;i<1||T[i-1]||T[i]&&(SA[n1++]=i))i=SA[j++];
for(var S=i=n1,name=n1>0|0;i<n;)SA[i++]=-1;
if(S)for(;T[m-1]||!T[m];)--m;
for(o=SA[i=0],SA[S+(o>>1)]=0;++i<n1;SA[S+(p>>1)]=name-1){
d=p=SA[i];
if(p^m&&o^m)
for(j=o;d<n;++j){
if(A[d]^A[j]||T[d]^T[j]){o=p;break}
if(T[d++]&&!T[d-2]&&j>o)break}
else o=p;
o^p||++name
}
for(i=j=n,S=n-n1;j>S;)SA[--i]<0||(SA[--j]=SA[i]);
S=SA.subarray(S,n);
if(name<n1)sais(SA,S,n1,name);
else for(i=n1;i;)SA[S[--i]]=i;
for(i=j=0;++i<l;)if(T[i]&&!T[i-1])S[j++]=i;
for(i=0;i<n1;)SA[i]=S[SA[i++]];
for(getBuckets(B,C,k);n1<n;)SA[n1++]=-1;
for(;i;SA[--B[A[j]]]=j)j=SA[--i],SA[i]=-1;
induceLS(SA,A,T,C,B,n,k);return SA
}
/////////
// suffix
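// LcpTreeNode describes one lcp interval: dad = index of the parent interval,
// lcp = length of the shared prefix, left/right = the interval's bounds in the suffix array.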
function LcpTreeNode(p,c,l,r){this.dad=p;this.lcp=c;this.left=l;this.right=r}
const LcpTreeCreator={
createLcpTree:function(lcp,N){
for(var i=0,c,l,n,o,s=0,lcpTree=[new LcpTreeNode(0,0,0,N-1)],ts=0,S=[{left:0,lcp:0,i:0}];i<N;){
for(l=i++,c=s;lcp[i]<S[s].lcp;)
o=S[s--],lcpTree[n=o.i]=new LcpTreeNode(S[s].i,o.lcp,l=o.left,i-1);
if(lcp[i]>S[s].lcp){
S[++s]={left:l,lcp:lcp[i],i:++ts};
if(~c+s)lcpTree[n].dad=ts
}
}return lcpTree
},
getStats:function(tree){
for(var i=0,p=0,n,l=tree.length;i<l;){
n=tree[i++];
if(n.lcp>1)p+=n.right-n.left+1;
}return{p:p,size:l}
},
printIntervalTree:function(tree){
for(var i=0,l=tree.length,n;i<l;console.log(n.left,n.right,n.lcp,n.dad))
n=tree[i++]
}};
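// Kasai-style LCP construction: LCP[i] = length of the common prefix of the
// suffixes starting at SA[i-1] and SA[i]; LCP[0] and LCP[N] are 0.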
function createLCPArray(S,SA,LCP,N){
if(LCP)return LCP;
LCP=new Int32Array(N+1);
for(var i=N,l=0,k,s,ISA=SA.slice();i--;)ISA[SA[i]]=i;
for(;++i<N;LCP[s]=l,l=l>1?l-1:0)
if(s=ISA[i])for(k=SA[s-1];i+l<N&&k+l<N&&S[i+l]===S[k+l];)l++;
else l=0;
LCP[N]=0;
return LCP
}
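// Hypothetical usage sketch: suffix array and LCP array of "banana" via the
// functions above (alphabet size 256, i.e. raw character codes).
var demoStr=Array.from("banana",function(c){return c.charCodeAt(0)});
var demoSA=sais(new Int32Array(demoStr.length),demoStr,demoStr.length,256);
var demoLCP=createLCPArray(demoStr,demoSA,0,demoStr.length);
console.log(Array.from(demoSA)); //the suffix array of "banana" is [5,3,1,0,4,2]
console.log(Array.from(demoLCP));//LCP[i]=lcp(SA[i-1],SA[i]); for "banana" that is [0,1,3,0,0,2,0]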
function createLPFArray(SA,LCP,LPF,LPFp,N){
if(LPF)return LPF;
LPF=new Int32Array(N),LPFp=LPF.slice();
for(var i=0,NO_PREV=-1>>>1,s=LCP[N]=0,S=[{pos:0,lpfPos:NO_PREV}];i<N;){
var newpos={pos:++i,lpfPos:NO_PREV};
do{
var end=i===N,top=S[s].pos;
if(end||SA[i]<SA[top]||SA[i]>SA[top]&&LCP[i]<=LCP[top]){
if(end||SA[i]<SA[top]){
if(LCP[top]>LCP[i])
LPF[SA[top]]=LCP[top],LPFp[SA[top]]=S[s].lpfPos;
else LPF[SA[top]]=LCP[i],LPFp[SA[top]]=SA[i];
LCP[i]=Math.min(LCP[top],LCP[i])
}else LPF[SA[top]]=LCP[top],LPFp[SA[top]]=S[s].lpfPos
}else{newpos.lpfPos=SA[top];break}
}while(s--);
if(i<N)S[++s]=newpos
}return LPF
}
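// Continuing the "banana" sketch: a hypothetical call of createLPFArray, assuming it
// implements the usual longest-previous-factor definition (LPF[i] = length of the
// longest prefix of the suffix at i that also starts at an earlier position).
// Note that it modifies the LCP array in place, so pass a copy if the LCP values are still needed.
var demoLPF=createLPFArray(demoSA,demoLCP.slice(),0,0,demoStr.length);
console.log(Array.from(demoLPF)); //the classical LPF of "banana" is [0,0,0,3,2,1]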
//////////
// grammar
//create string representation of the grammar
function gram2str(g){
for(var i=0,s="",l=g.length;i<l;)s+=ruleID(i)+":"+ruleString(g,i++)+"\n";
return s
}
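//expand rule r back to plain text; expand(g,0) reproduces the whole input,
//since g[0] is the start rule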
function expand(g,r){
for(var i=0,f,s="",rule=g[r],n=rule.length,S=String.fromCharCode;i<n;s+=f.length?S.apply(0,f):expand(g,f))
f=rule[i++];
return s
}
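//count how many times each rule id is referenced on the right-hand sides
//(terminal runs are skipped)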
function ruleHits(g){
for(var i=0,h,n,r,s,l=g.length,C=new Uint32Array(l);i<l;)
for(h=0,r=g[i++],n=r.length;h<n;s.length||C[s]++)s=r[h++];
return C
}
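//replace the reference counts with frequency ranks offset by cs (the most
//referenced rule gets cs); presumably meant for assigning symbol numbers when encoding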
function ruleRank(C,cs){
for(var i=C.length,l=i,I=new Uint32Array(i);I[--i]=i;);
I.sort(function(a,b){return C[b]-C[a]});
for(cs>>>=0;l;)C[I[--l]]=l+cs
}
function ruleID(i){return"[R"+i+"]"}
function ruleString(g,i){
var f,s="",rule=g[i],n=rule.length,S=String.fromCharCode;
for(i=0;i<n;s+=f.length?S.apply(0,f):ruleID(f))f=rule[i++];
return s
}
//count nonterminal references and terminal symbols over the whole grammar
function calcCfgSize(g){
for(var i=0,f,r=0,s=0,n=g.length;i<n;)
for(var rule=g[i++],j=0,l=rule.length;j<l;f.length?s+=f.length:r++)
f=rule[j++];
g.numNonterms=r,g.numTerms=s
}
///////
// sort
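// isort/qsort: golfed insertion sort and quicksort of A[a..b] (inclusive);
// used in processInterval to sort the suffix positions of an lcp interval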
function isort(A,i,l){
for(var a=i,c,j,k;i<l;A[j]=c)
if((c=A[j=k=++i])<A[a])for(;j>a;)A[j]=A[--j];
else for(;c<A[--k];)A[j]=A[j=k]}
function qsort(A,a,b){
if(b-a<9)return isort(A,a,b);
var c=A[a],d,i=A[b],j=a,k=b,p=A[a+b>>1];
c>i?c>p?p>i||(p=i):p=c:i>p?c>p&&(p=c):p=i;
for(i=a;j<=k;)
if((c=A[j])>p)A[j]=A[k],A[k--]=c;
else{if(c<p)d=A[i],A[i++]=c,A[j]=d;j++}
a<--i&&qsort(A,a,i);++k<b&&qsort(A,k,b)}
///////////
// compress
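// Rule(b,e,p): begin/end delimit the rule's text inside the data array,
// p is the index of a prefix rule (or NO_PREFIX_RULE).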
function Rule(b,e,p){this.begin=b;this.end=e;this.p=p}
Rule.prototype.length=function(){return this.end-this.begin+1};
function LongestFirstSaCompressor(s,l,c){
this.str=s;this.cs=c&-1||255;this.N=l;this.rules=[]
}
LongestFirstSaCompressor.prototype={
NULL_SUBSTRING:-1,
NO_RULE:1<<31,NO_POS:1<<31,
NO_LOCAL_SHORT:-1,
UNREPLACED:-1>>>1,NO_PREFIX_RULE:1<<31,
//create new rule of length l starting at position p
writeRule:function(rule,p,l){
var R=this.rules,T=this.data,i=this.NULL_SUBSTRING,pp=p.pos,ss={start:i,end:i};
if(p.rule===this.NO_RULE){
for(l+=i=pp;i<l;)if(T[i++]!==this.UNREPLACED)return ss;
for(T[i=pp]=rule;++i<l;)T[i]=-pp;
ss.start=pp;ss.end=l-1;
return ss
}
i=R[p.rule].p;
if(i!==this.NO_PREFIX_RULE&&pp<R[i].length())return ss;//overlap found
for(var begin=R[p.rule].begin,spos=i=begin+pp;i<spos+l;++i)
if(i>begin&&T[i]>0||T[i]<1&&-T[i]^begin)return ss;
ss.start=spos;ss.end=spos+l-1;
if(pp)for(T[i=spos]=rule;--l;)T[++i]=-spos;
else R[p.rule].p=rule;
return ss
},
createNewRule:function(p,l){
var rule=this.numRules++,ss=this.writeRule(rule,p,l);
this.rules[rule]=new Rule(ss.start,ss.end,this.NO_PREFIX_RULE);
return rule
},
makeReplacements:function(R,lcp){
for(var i=0,p,rule=this.createNewRule(R[0],lcp);p=R[++i];)this.writeRule(rule,p,lcp)
},
//return true if substrings starting at p1 and p2 with length l do not overlap
noOverlap:function(p1,p2,l){
l+=p1<p2?p1:p2,p1>p2&&(p2=p1);
return l<=p2
},
//return true if the substrings of length lcp starting at p1 and p2 can be replaced by a new rule, i.e. they do not overlap and are not the same position inside the same rule; assumes the substrings of length lcp at both positions lie within the same rule as the position
rulePossible:function(p1,p2,lcp){
return p1.rule!==this.NO_RULE&&p2.rule!==this.NO_RULE?p1.rule!==p2.rule||this.noOverlap(p1.pos,p2.pos,lcp):p1.rule!==this.NO_RULE||p2.rule!==this.NO_RULE||this.noOverlap(p1.pos,p2.pos,lcp)
},
isInRule:function(pos){
return{rule:this.data[pos]^this.UNREPLACED?(pos=0,1):this.NO_RULE,pos:pos}
},
//add a shortened position to the local (one lcp interval) list
addLocalShortened:function(pos,l){
this.localShortPos[l].push(pos);
if(this.localMin===this.NO_LOCAL_SHORT||l<this.localMin)this.localMin=l;
if(this.localMax===this.NO_LOCAL_SHORT||l>this.localMax)this.localMax=l
},
//for a position within the string, find the rule containing it and its relative position within that rule; the rule must be at the lowest level, i.e. no other sub-rule contains the position
findInRulePosition:function(pos){
if(this.data[pos]===this.UNREPLACED)
return{rule:this.NO_RULE,pos:pos}
var R=this.rules,T=this.data,s=T[pos],rule=T[s=s<1?-s:pos];
for(pos-=s;;){
if(!pos){
for(;R[rule].p!==this.NO_PREFIX_RULE;)rule=R[rule].p;
return{rule:rule,pos:0}
}
var r=R[rule],apos=r.begin+pos;
s=T[apos];
if(s<1){
if(-s^r.begin){rule=T[s=-s];pos=apos-s;continue}
}else if(s^rule){rule=s;pos=0;continue}
if(r.p===this.NO_PREFIX_RULE)return{rule:rule,pos:pos};
s=R[r.p];
if(pos>s.end-s.begin)return{rule:rule,pos:pos}
rule=r.p
}
},
//after processing an lcp interval, add the shortened positions to the global bookkeeping if there is more than one of them
processLocalShortened:function(){
if(this.localMin===this.NO_LOCAL_SHORT)return;
for(var i=this.localMax,l,n=0,p,L=[],Lp=this.localShortPos;i>=this.localMin;--i)
if(l=Lp[i].length){
for(p=0;p<l;)L[n++]={l:i,pos:Lp[i][p++]};
Lp[i].length=0
}
n>1&&this.shortPos[L[1].l].push(L)
},
processInterval:function(i){
var N=this.lcpTree,node=N[i],first={pos:this.NO_POS},up,R=[],len=node.right-node.left+1,lcp=node.lcp,S=this.SA.slice(i=node.left,i+len);
this.localMin=this.localMax=this.NO_LOCAL_SHORT;
for(qsort(S,i=0,len-1);i<len;){
var bpos=S[i++],b=this.isInRule(bpos),e=this.isInRule(bpos+lcp-1);
if(b.rule===this.NO_RULE&&e.rule===this.NO_RULE){
if(up)R.push(b);
else if(first.pos===this.NO_POS)
first=b,this.addLocalShortened(bpos,lcp);
else this.rulePossible(first,b,lcp)&&R.push(first,up=b)
}
else if(b.rule===this.NO_RULE&&e.rule!==this.NO_RULE){
for(b=bpos-1+lcp;(e=this.data[b])!==this.UNREPLACED;)e<1?b=-e:b--;
b-=bpos-1,b>N[node.dad].lcp&&b>1&&this.addLocalShortened(bpos,b)
}
else if(b.rule!==this.NO_RULE&&e.rule!==this.NO_RULE){
b=this.findInRulePosition(bpos);
e=this.findInRulePosition(bpos+lcp-1);
if(b.rule===e.rule&&e.pos===b.pos+lcp-1)
if(up)R.push(b);
else if(first.pos===this.NO_POS)
first=b,this.addLocalShortened(bpos,lcp);
else this.rulePossible(first,b,lcp)&&R.push(first,up=b)
}
}
up&&this.makeReplacements(R,lcp);
this.processLocalShortened()
},
//attempt to construct new rules from a list of shortened segments coming from the same lcp interval
processShortened:function(spos,len){
for(var first={pos:this.NO_POS},firstShort,up,R=[],i=0,n=spos.length,it;i<n&&(it=spos[i]).l>=len;i++){
var bpos=it.pos,epos=bpos+len-1,b=this.isInRule(bpos),e=this.isInRule(epos);
if(b.rule===this.NO_RULE&&e.rule===this.NO_RULE){
if(up)R.push(b);
else if(first.pos===this.NO_POS)
first=b,firstShort=it;
else this.rulePossible(first,b,len)&&R.push(first,up=b)
}else if(b.rule!==this.NO_RULE&&e.rule!==this.NO_RULE){
b=this.findInRulePosition(bpos);
e=this.findInRulePosition(epos);
if(b.rule===e.rule&&e.pos===b.pos+len-1)
if(up)R.push(b);
else if(first.pos==this.NO_POS)
first=b,firstShort=it;
else this.rulePossible(first,b,len)&&R.push(first,up=b)
}
}
up&&this.makeReplacements(R,len);
if(i==n)return;
for(R.length=b=0;i<n;)R[b++]=spos[i++];
if(first.pos!==this.NO_POS)
R.unshift(firstShort),this.shortPos[R[1].l].push(R);
else b>1&&this.shortPos[R[1].l].push(R)
},
//convert the whole string (top-level rule) to rule fragments
stringToCfgRule:function(){
for(var R=this.rules,S,T=this.data,cfgRule=[],c=0,i=0,l,n=0,e=this.N;i<e;i+=l,cfgRule[c++]=S)
if(T[i]!==this.UNREPLACED)l=R[S=T[i]].length(n++);
else{
for(l=0;i+l++<e&&T[i+l]===this.UNREPLACED;);
S=this.str.slice(i,i+l),n+=l
}this.n=n;
return cfgRule
},
//convert one rule to fragments
ruleToCfgRule:function(rule){
var R=this.rules,S,T=this.data,cfgRule=[],b=R[rule].begin,c=0,i=b,l=R[rule].p,e=R[rule].end;
if(l!==this.NO_PREFIX_RULE)i+=R[cfgRule[c++]=l].length();
for(;i<=e;i+=l,cfgRule[c++]=S)
if(i===b||T[i]<1&&-T[i]===b){
for(l=0;i+l++<e&&T[i+l]<1&&-T[i+l]===b;);
S=this.str.slice(i,i+l)
}else l=R[S=T[i]].length();
return cfgRule
},
//create suffix array and lcp interval tree
createSuffixStructures:function(cs,gs){
cs&=-1;if(!cs)cs=255;
var LCP=createLCPArray(this.str,sais(this.SA=new Int32Array(this.N),this.str,this.N,++cs),0,this.N);
this.lcpTree=LcpTreeCreator.createLcpTree(LCP,this.N);
if(gs)this.treeStats=LcpTreeCreator.getStats(this.lcpTree)
},
//allocate and initialize the rule-related structures
initRuleStructures:function(){
this.data=new Int32Array(this.N).fill(this.UNREPLACED);
this.numRules=1;
this.rules[this.rules.length=0]=new Rule(0,0,0)
},
//traverse lcp interval tree and store intervals in descending order
initDescendingLcp:function(){
for(var i=-1,l,m=i,n=this.lcpTree.length,N=this.lcpTree,D=this.descIntervals=[];++i<n;l>m&&(m=l))l=N[i].lcp;
this.maxLcp=m;
var P=new Int32Array(m+1),I=this.numIntervals=P.slice();
for(;i--;)I[N[i].lcp]++;
for(;i<m;)if(++i>1&&(l=I[i]))D[i]=new Int32Array(l);
for(i=-1;++i<n;l>1&&(D[l][P[l]++]=i))l=N[i].lcp;
this.shortPos=I=[];this.localShortPos=N=[];
for(;I[m]=[],N[m]=[],m--;);
},
formRules:function(){
for(var i,d,l=this.maxLcp,p,I=this.numIntervals,D=this.descIntervals,P=this.shortPos;l>1;--l){
if(I[l]>0)
for(i=0,d=D[l];i<I[l];)
this.processInterval(d[i++]);
for(p=P[l];p.length;)
this.processShortened(p.shift(),l)
}
},
deleteSuffixStructures:function(){this.lcpTree.length=0;delete this.SA},
//create an (explicit) context-free grammar from the internal representation
createGrammar:function(){
for(var i=0,l=this.numRules,g=[this.stringToCfgRule()];++i<l;)g[i]=this.ruleToCfgRule(i);
return g
},
//free memory of rule related structures
freeRuleStructures:function(){
delete this.data;this.rules.length=0
},
//free descIntervals data
freeDescendingLcp:function(){
for(var i=this.maxLcp+1;i;)delete this.descIntervals[--i];
delete this.descIntervals;
delete this.numIntervals;
delete this.shortPos;
delete this.localShortPos
},
// return CFG
compress:function(gs,live){
this.createSuffixStructures(this.cs,gs);
this.initRuleStructures();
this.initDescendingLcp();
this.formRules();
this.deleteSuffixStructures();
var g=this.createGrammar();
live||this.freeRuleStructures(this.freeDescendingLcp());
return g
},
getSubstring:function(pos,i){
for(var s=[];i;)s[--i]=this.str[pos+i];
return String.fromCharCode.apply(0,s)
},
printStructure:function(){
console.log(this.str.join(" "));
for(var i=0,s,t=[];i<this.N;t[i++]=s===this.UNREPLACED?"U":s)s=this.data[i];
console.log(t.join(" "))
},
printRules:function(){
for(var i=0,j,p,l=this.numRules,n,t=[],R=this.rules,S=this.str;t.length=++i<l;console.log("rule",i,":",String.fromCharCode.apply(0,t)))
for(j=R[i].begin,n=R[i].end,p=0;j<=n;)t[p++]=S[j++];
},
printStats:function(){
console.log("string_size:",this.N,"interval_tree_size:",this.treeStats.size,"sum_of_tree_depths:",this.treeStats.p,"alpha:",this.treeStats.p/this.N);
}};
function comp(A){
var z=A.length,C=new LongestFirstSaCompressor(A,z),cfg=C.compress();
console.log(gram2str(cfg))
}
Main functions

- LongestFirstSaCompressor(s, l, c)
  - s: the string the grammar is built from... or rather, an array of numbers (character codes)
  - l: the length of s
  - c: the largest value occurring in s (defaults to 255 when omitted)
- LongestFirstSaCompressor.prototype.compress(gs, live)
  - gs: boolean, whether to keep the LcpTree statistics; for debugging (printStats())
  - live: boolean, whether to keep the member structures alive; normally false so the memory is released, for debugging (printRules(), printStructure())
  - return value: the constructed context-free grammar, as an array
    - [0]: the final form of the grammar (the start rule)
    - [1 ... length-1]: the production rules
      - an element of Number type refers to the production rule with that index
      - an element that is an array is a run of one or more terminal symbols
- LongestFirstSaCompressor.prototype.printRules(): prints the production rules to the console
- LongestFirstSaCompressor.prototype.printStats(): prints the LcpTree statistics to the console
- LongestFirstSaCompressor.prototype.printStructure(): prints what the array underlying the grammar has turned into
var A=Array.from("That that is is that that is not is not is that it it is",a=>a.charCodeAt());
var C=new LongestFirstSaCompressor(A,A.length), cfg=C.compress();
console.log(gram2str(cfg))
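As a quick sanity check (a small sketch using the expand helper defined above), expanding the start rule should give back the original text:
console.log(expand(cfg,0)===String.fromCharCode.apply(0,A)) //expected: true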
Trying it out
See the Pen Context Free Grammar by ESA by xezz (@xezz) on CodePen.
Performance
My impression is that it is not suited to huge strings, and it does not seem particularly fast either. When applied to compression, it probably loses to RePair.
Compression
Here I also put up implementations that encode the grammar with a simple bit coder and a range coder (the design is garbage).
See the Pen Grammar Compression by ESA by xezz (@xezz) on CodePen.