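/**
 * Text segmentation with a hidden Markov model.
 * There are four kinds of state: S, B, M and E.
 * AMap is the state-transition probability matrix (4*4): the probability of moving from {S B M E} to {S B M E}.
 * BMap is the probability that the current character belongs to each state {S B M E}.
 */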
// Finite set of state labels.
const S = [
  'S',
  'B',
  'Mn',
  'E',
];
/**
 * Word segmenter built on transition/emission counts collected from sample
 * words.
 *
 * State labels used by the code: 'S' for a one-character sample, 'B' for the
 * first character of a longer sample, 'E' for its last character, and
 * 'M<i>' (position-indexed: M1, M2, ...) for the characters in between.
 * Presumably these stand for Single / Begin / Middle / End.
 *
 * Usage: call `add(word)` for every sample, then `makeGl()` once to turn the
 * counts into probabilities, then `solve(text)`.
 *
 * Note: the object is a singleton holding mutable model state.
 */
const mekflink = {
  // Prototype for every per-character emission table: a character falls back
  // to a 1/16 probability of state 'S' unless training recorded its own 'S'
  // value.
  empty: { S: 1 / 16 },

  // Transition counts keyed "from-to". Seeded so that S and E can be followed
  // by a new S or B even before any sample is added.
  AMap: {
    'S-S': 1000,
    'S-B': 1000,
    'E-B': 1000,
    'E-S': 1000,
  },

  // Emission counts: BMap[char][state] = number of times seen.
  BMap: {},

  // Transition probabilities derived from AMap by makeGl().
  AMapGl: {},

  // Emission probabilities derived from BMap by makeGl().
  BMapGl: {},

  /**
   * Record one sample word in the count tables.
   *
   * @param {string} text - Sample word. A value whose length is not greater
   *   than 1 is recorded whole as an 'S' emission.
   */
  add(text) {
    if (!(text.length > 1)) {
      this.push(text, 'S');
      return;
    }
    const last = text.length - 1;
    for (let i = 0; i <= last; i++) {
      if (i === 0) {
        this.push(text[i], 'B');
      } else if (i === last) {
        this.push(text[i], 'E');
        // A two-character word goes straight from B to E.
        this.pushState(last > 1 ? `M${i - 1}` : 'B', 'E');
      } else {
        this.push(text[i], `M${i}`);
        this.pushState(i === 1 ? 'B' : `M${i - 1}`, `M${i}`);
      }
    }
  },

  /**
   * Increment the transition count from state `t0N` to state `t1N`.
   *
   * @param {string} t0N - Source state label.
   * @param {string} t1N - Target state label.
   */
  pushState(t0N, t1N) {
    const key = `${t0N}-${t1N}`;
    this.AMap[key] = (this.AMap[key] || 0) + 1;
  },

  /**
   * Increment the emission count of `key` under `state`.
   *
   * @param {string} key - Character (or one-character sample).
   * @param {string} state - State label.
   */
  push(key, state) {
    const BMap = this.BMap;
    if (!BMap[key]) {
      BMap[key] = {};
    }
    BMap[key][state] = (BMap[key][state] || 0) + 1;
  },

  /**
   * Build the probability tables from the current counts.
   *
   * Transition probabilities are normalised per source state; emission
   * probabilities are normalised per character. Existing entries in
   * AMapGl / BMapGl are overwritten in place.
   *
   * @returns {{AMapGl: Object, BMapGl: Object}} The two probability tables
   *   (the same objects stored on `this`).
   */
  makeGl() {
    const AMap = this.AMap;
    const BMap = this.BMap;
    const AMapGl = this.AMapGl;
    const BMapGl = this.BMapGl;

    // Total outgoing count per source state.
    const totals = {};
    for (const key in AMap) {
      const [from] = key.split('-');
      totals[from] = (totals[from] || 0) + AMap[key];
    }
    for (const key in AMap) {
      const [from] = key.split('-');
      AMapGl[key] = this.chu(AMap[key], totals[from]);
    }

    for (const ch in BMap) {
      const counts = BMap[ch];
      let total = 0;
      for (const state in counts) {
        total = total + counts[state];
      }
      // Inherit the default 'S' probability from `empty`.
      const probs = Object.create(this.empty);
      for (const state in counts) {
        probs[state] = this.chu(counts[state], total);
      }
      BMapGl[ch] = probs;
    }

    return { AMapGl, BMapGl };
  },

  /** Divide `p1` by `p2`. */
  chu(p1, p2) {
    return p1 / p2;
  },

  /** Multiply `p1` by `p2`. */
  exex(p1, p2) {
    return p1 * p2;
  },

  /** Return true when `p1` is strictly greater than `p2`. */
  isBig(p1, p2) {
    return p1 > p2;
  },

  /**
   * Advance every candidate path by one character.
   *
   * For each reachable target state only the highest-scoring incoming path
   * is kept. Scores are then rescaled to whole percentages of the total and
   * paths that round down to 0 are dropped.
   *
   * @param {Object<string, {gl: number, data: string}>} t0Obj - Best path per
   *   current state; `gl` is its score, `data` the '-'-joined state labels.
   * @param {Object<string, number>} BObj - Emission probabilities of the next
   *   character, keyed by state.
   * @returns {Object<string, {gl: number, data: string}>} Best path per next
   *   state, with `gl` as an integer percentage.
   */
  getT1Arr(t0Obj, BObj) {
    const AMapGl = this.AMapGl;
    const t1Obj = {};
    let glAll = 0;
    for (const t0 in t0Obj) {
      const link = t0Obj[t0];
      for (const k in AMapGl) {
        const [from, to] = k.split('-');
        if (from !== t0 || !BObj[to]) {
          continue;
        }
        const gl = this.exex(link.gl, this.exex(AMapGl[k], BObj[to]));
        if (!(gl > 0)) {
          continue;
        }
        const best = t1Obj[to];
        if (!best) {
          glAll = glAll + gl;
        } else if (this.isBig(gl, best.gl)) {
          // Swap the previous best's contribution for the new one.
          glAll = glAll + gl - best.gl;
        } else {
          continue;
        }
        t1Obj[to] = { gl: gl, data: `${link.data}-${to}` };
      }
    }
    for (const k in t1Obj) {
      // Math.trunc, not parseInt: parseInt stringifies its argument, so a
      // tiny value such as 3e-9 would be read as "3e-9" and yield 3.
      const gl = Math.trunc((t1Obj[k].gl / glAll) * 100);
      t1Obj[k].gl = gl;
      if (gl === 0) {
        delete t1Obj[k];
      }
    }
    return t1Obj;
  },

  /**
   * Segment `text` using the tables built by makeGl().
   *
   * The first character is assumed to be in state S or B with equal score.
   * Logs both probability tables to the console on every call.
   *
   * @param {string} text - Text to segment.
   * @returns {Array<{gl: number, data: number[]}>} Candidate segmentations,
   *   highest score first. `data` is a flat list of inclusive
   *   [start, end, start, end, ...] indices of the B..E spans found.
   */
  solve(text) {
    const AMapGl = this.AMapGl;
    const BMapGl = this.BMapGl;
    console.log('状态转移概率', AMapGl);
    console.log('特征统计概率', BMapGl);

    let t0Obj = {
      S: { gl: 1, data: 'S' },
      B: { gl: 1, data: 'B' },
    };
    for (let i = 1; i < text.length; i++) {
      t0Obj = this.getT1Arr(t0Obj, BMapGl[text[i]] || Object.create(this.empty));
    }

    // Paths that yield the same spans are merged and their scores summed.
    const cache = {};
    for (const k in t0Obj) {
      // Strip separators AND the position digits of 'M<i>' so that exactly
      // one label letter remains per input character.
      const dstr = t0Obj[k].data.replace(/[\d-]/g, '');
      const data = [];
      let start;
      for (let i = 0; i < text.length; i++) {
        if (dstr[i] === 'B') {
          start = i;
        } else if (dstr[i] === 'E') {
          data.push(start, i);
        }
      }
      const key = data.join(',');
      if (typeof cache[key] !== 'undefined') {
        cache[key].gl = cache[key].gl + t0Obj[k].gl;
      } else {
        cache[key] = { gl: t0Obj[k].gl, data: data };
      }
    }

    const lArr = [];
    for (const k in cache) {
      lArr.push(cache[k]);
    }
    lArr.sort(function (p1, p2) {
      return p2.gl - p1.gl;
    });
    return lArr;
  },
};
module.exports = mekflink;

// Demo: train on the sample words in arrH, segment a longer string, and
// print, for every candidate segmentation, the arrH index of each B..E span
// (-1 when the span is not one of the sample words).
const arrH = ['1ec66668876666666'];
for (const sample of arrH) {
  mekflink.add(sample);
}
mekflink.makeGl();

const text = '1ec666688766666661ec66668876666666211ec6666887666666621';
const arr = mekflink.solve(text);

const aArr = arr.map(({ data }) => {
  const hits = [];
  // `data` is a flat list of inclusive [start, end] index pairs.
  for (let pos = 0; pos < data.length; pos += 2) {
    const piece = text.substring(data[pos], data[pos + 1] + 1);
    hits.push(arrH.indexOf(piece));
  }
  return hits;
});
console.log(aArr);