/**
 * Text word segmentation with a hidden Markov model.
 *
 * Conceptually there are four kinds of states: S (single-character word),
 * B (first character of a word), M (middle character) and E (last character).
 * In this implementation the middle state is indexed by position inside the
 * word (M1, M2, ...), so every trained word forms its own chain
 * B -> M1 -> M2 -> ... -> E.
 *
 * AMap holds state-transition counts keyed as "from-to"; AMapGl holds the
 * corresponding probabilities.
 * BMap holds, per character, how often it was seen in each state; BMapGl holds
 * the corresponding probabilities.
 */

// Finite state set ("Mn" stands for the indexed middle states M1, M2, ...).
const S = ['S', 'B', 'Mn', 'E'];

const mekflink = {
  // Prototype of every emission row: any character can be a single-character
  // word (state S) with probability 1/16 unless training data overrides it.
  empty: { S: 1 / 16 },

  // Transition counts, pre-seeded for the word-boundary transitions.
  AMap: {
    'S-S': 1000,
    'S-B': 1000,
    'E-B': 1000,
    'E-S': 1000,
  },
  // Emission counts: BMap[char][state] = occurrences.
  BMap: {},
  // Transition probabilities, filled by makeGl().
  AMapGl: {},
  // Emission probabilities, filled by makeGl().
  BMapGl: {},

  /**
   * Adds one training word to the counts.
   *
   * A one-character word is counted as state S. Longer words are counted as
   * B, M1, M2, ..., E together with the transitions between those states.
   * Call makeGl() afterwards to refresh the probabilities.
   *
   * @param {string} text - The word to learn. An empty string is ignored.
   */
  add(text) {
    if (text.length === 0) {
      // Nothing to learn; avoids registering '' as a "character".
      return;
    }
    const last = text.length - 1;
    if (last === 0) {
      this.push(text, 'S');
      return;
    }

    this.push(text[0], 'B');
    let prev = 'B';
    for (let i = 1; i < last; i++) {
      const state = 'M' + i;
      this.push(text[i], state);
      this.pushState(prev, state);
      prev = state;
    }
    this.push(text[last], 'E');
    this.pushState(prev, 'E');
  },

  /**
   * Increments the transition count from state t0N to state t1N.
   *
   * @param {string} t0N - Source state name.
   * @param {string} t1N - Target state name.
   */
  pushState(t0N, t1N) {
    const key = t0N + '-' + t1N;
    this.AMap[key] = (this.AMap[key] || 0) + 1;
  },

  /**
   * Increments the count of character `key` being observed in `state`.
   *
   * @param {string} key - The character.
   * @param {string} state - State name (S, B, M1.., E).
   */
  push(key, state) {
    const BMap = this.BMap;
    if (!BMap[key]) {
      BMap[key] = {};
    }
    BMap[key][state] = (BMap[key][state] || 0) + 1;
  },

  /**
   * Builds the probability model from the current counts.
   *
   * Transition probabilities are normalised per source state; emission
   * probabilities are normalised per character. AMapGl and BMapGl are updated
   * in place.
   *
   * @returns {{AMapGl: Object, BMapGl: Object}} The refreshed probability maps.
   */
  makeGl() {
    const AMap = this.AMap;
    const BMap = this.BMap;
    const AMapGl = this.AMapGl;
    const BMapGl = this.BMapGl;

    // Total outgoing count per source state.
    const outgoing = {};
    for (const key of Object.keys(AMap)) {
      const from = key.split('-')[0];
      outgoing[from] = (outgoing[from] || 0) + AMap[key];
    }
    for (const key of Object.keys(AMap)) {
      const from = key.split('-')[0];
      AMapGl[key] = this.chu(AMap[key], outgoing[from]);
    }

    // Per-character state distribution.
    for (const ch of Object.keys(BMap)) {
      const counts = BMap[ch];
      const states = Object.keys(counts);
      let total = 0;
      for (const state of states) {
        total = total + counts[state];
      }
      // Inherit the default S probability from `empty` via the prototype.
      const row = Object.create(this.empty);
      for (const state of states) {
        row[state] = this.chu(counts[state], total);
      }
      BMapGl[ch] = row;
    }

    return { AMapGl, BMapGl };
  },

  /** Division hook: returns p1 / p2. */
  chu(p1, p2) {
    return p1 / p2;
  },

  /** Multiplication hook: returns p1 * p2. */
  exex(p1, p2) {
    return p1 * p2;
  },

  /** Comparison hook: returns true when p1 is greater than p2. */
  isBig(p1, p2) {
    return p1 > p2;
  },

  /**
   * Advances every candidate path by one character.
   *
   * For each reachable target state only the best-scoring incoming path is
   * kept. Scores are then rescaled to whole percentages of the step total and
   * states that round down to 0 are dropped.
   *
   * @param {Object} t0Obj - Current paths keyed by their last state; each value
   *   is `{gl: number, data: string}` where `data` is the '-'-joined state path.
   * @param {Object} BObj - Emission probabilities of the next character,
   *   keyed by state.
   * @returns {Object} The surviving paths keyed by their new last state.
   */
  getT1Arr(t0Obj, BObj) {
    const AMapGl = this.AMapGl;
    const t1Obj = {};
    let glAll = 0;

    for (const t0 in t0Obj) {
      const link = t0Obj[t0];
      for (const k in AMapGl) {
        const [from, to] = k.split('-');
        if (from !== t0 || !BObj[to]) {
          continue;
        }
        const gl = this.exex(link.gl, this.exex(AMapGl[k], BObj[to]));
        if (!(gl > 0)) {
          continue;
        }
        const best = t1Obj[to];
        if (!best) {
          glAll = glAll + gl;
          t1Obj[to] = { gl: gl, data: link.data + '-' + to };
        } else if (this.isBig(gl, best.gl)) {
          // Swap the previous best contribution for the new one.
          glAll = glAll + gl - best.gl;
          t1Obj[to] = { gl: gl, data: link.data + '-' + to };
        }
      }
    }

    for (const k of Object.keys(t1Obj)) {
      // Math.floor instead of parseInt: parseInt stringifies its argument, so
      // a tiny value such as 9.9e-8 would be read as 9 rather than 0.
      const gl = Math.floor(t1Obj[k].gl / glAll * 100);
      t1Obj[k].gl = gl;
      if (gl === 0) {
        delete t1Obj[k];
      }
    }
    return t1Obj;
  },

  /**
   * Segments `text` with the trained model.
   *
   * Paths start in S or B with equal weight (the first character's emission
   * probabilities are not consulted) and are extended one character at a time
   * with getT1Arr(). Each surviving path is converted to a flat list of
   * inclusive [start, end] index pairs, one pair per B..E span; single
   * character (S) positions are not listed. Paths producing the same spans
   * have their scores summed.
   *
   * Logs both probability maps to the console.
   *
   * @param {string} text - The text to segment.
   * @returns {Array<{gl: number, data: number[]}>} Candidate segmentations,
   *   best score first.
   */
  solve(text) {
    const AMapGl = this.AMapGl;
    const BMapGl = this.BMapGl;
    console.log('状态转移概率', AMapGl);
    console.log('特征统计概率', BMapGl);

    // Markov chain: candidate paths keyed by their current state.
    let t0Obj = {
      S: { gl: 1, data: 'S' },
      B: { gl: 1, data: 'B' },
    };
    for (let i = 1; i < text.length; i++) {
      t0Obj = this.getT1Arr(t0Obj, BMapGl[text[i]] || Object.create(this.empty));
    }

    const cache = {};
    for (const k of Object.keys(t0Obj)) {
      const path = t0Obj[k];
      // One state per character. Only the leading letter matters here, so the
      // numeric suffix of M1, M2, ... (which may have several digits) is
      // ignored and positions stay aligned with `text`.
      const tags = path.data.split('-');
      const data = [];
      let start;
      for (let i = 0; i < tags.length; i++) {
        const tag = tags[i][0];
        if (tag === 'B') {
          start = i;
        } else if (tag === 'E') {
          data.push(start, i);
        }
      }

      const key = data.join(',');
      if (typeof cache[key] !== 'undefined') {
        cache[key].gl = cache[key].gl + path.gl;
      } else {
        cache[key] = { gl: path.gl, data: data };
      }
    }

    const lArr = Object.keys(cache).map((key) => cache[key]);
    lArr.sort((p1, p2) => p2.gl - p1.gl);
    return lArr;
  },
};

// Guarded so the file can also be loaded where `module` is not defined.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = mekflink;
}

/**
 * Demo: trains on one word and prints, for every candidate segmentation of a
 * sample text, the index of each recognised span in the training list
 * (-1 when the span is not a trained word).
 */
function runDemo() {
  const arrH = ['1ec66668876666666'];
  arrH.forEach((word) => {
    mekflink.add(word);
  });
  mekflink.makeGl();

  const text = '1ec666688766666661ec66668876666666211ec6666887666666621';
  const arr = mekflink.solve(text);
  const aArr = [];
  arr.forEach((item) => {
    const tArr = [];
    for (let i = 0; i < item.data.length; i = i + 2) {
      const t = text.substring(item.data[i], item.data[i + 1] + 1);
      tArr.push(arrH.indexOf(t));
    }
    aArr.push(tArr);
  });
  console.log(aArr);
}

// Run the demo only when this file is executed directly, so that requiring the
// module neither trains the shared model with demo data nor prints output.
if (
  typeof require !== 'undefined' &&
  typeof module !== 'undefined' &&
  require.main === module
) {
  runDemo();
}