zoukankan      html  css  js  c++  java
  • pyscws4 是一个python的分词程序

    pyscws4 是一个python的分词程序 | mei year-美叶 专注思想。

         pyscws4 是一个python的分词程序
        Posted on 2012 年 11 月 15 日 by dingyangfan   

        注意:pyscws4 是一个用 Python 编写的分词程序,移植自马明练开发的 PHP 版 pscws4,原项目地址:http://www.ftphp.com/scws/ 。


        1. pscws4.php





        view source
        001    #coding=gbk
        002    from __future__ import division
        003    from collections import OrderedDict
        004    from xdb_r import XDB_R
        005    import math ,struct,copy
        006    import sys,time
        007    reload(sys)
        008    sys.setdefaultencoding('gbk')
        009    ''' defines for ruleset '''
        010    PSCWS4_RULE_MAX     = 31    # just 31, PHP do not support unsigined Int
        011    PSCWS4_RULE_SPECIAL=    0x80000000
        012    PSCWS4_RULE_NOSTATS=    0x40000000
        013    PSCWS4_ZRULE_NONE=  0x00
        014    PSCWS4_ZRULE_PREFIX=    0x01
        015    PSCWS4_ZRULE_SUFFIX=    0x02
        016    PSCWS4_ZRULE_INCLUDE=   0x04    # with include
        017    PSCWS4_ZRULE_EXCLUDE=   0x08    # with exclude
        018    PSCWS4_ZRULE_RANGE =    0x10    # with znum range
        020    ''' defines for mode of scws <= 0x800 '''
        021    PSCWS4_IGN_SYMBOL=  0x01
        022    PSCWS4_DEBUG=           0x02
        023    PSCWS4_DUALITY=     0x04
        025    ''' multi segment policy >= 0x1000 '''
        026    PSCWS4_MULTI_NONE=    0x0000        # nothing
        027    PSCWS4_MULTI_SHORT= 0x1000      # split long words to short words from left to right
        028    PSCWS4_MULTI_DUALITY=   0x2000      # split every long words(3 chars?) to two chars
        029    PSCWS4_MULTI_ZMAIN=   0x4000        # split to main single chinese char atr = j¦a¦n?¦v?
        030    PSCWS4_MULTI_ZALL=  0x8000      # attr = ** , all split to single chars
        031    PSCWS4_MULTI_MASK=  0xf000      # mask check for multi set
        032    PSCWS4_ZIS_USED=        0x8000000
        034    ''' single bytes segment flag (纯单字节字符) '''
        035    PSCWS4_PFLAG_WITH_MB=   0x01
        036    PSCWS4_PFLAG_ALNUM= 0x02
        037    PSCWS4_PFLAG_VALID= 0x04
        038    PSCWS4_PFLAG_DIGIT= 0x08
        039    PSCWS4_PFLAG_ADDSYM=    0x10
        041    ''' constant var define '''
        042    PSCWS4_WORD_FULL=       0x01    # 多字: 整词
        043    PSCWS4_WORD_PART=       0x02    # 多字: 前词段
        044    PSCWS4_WORD_USED=       0x04    # 多字: 已使用
        045    PSCWS4_WORD_RULE=       0x08    # 多字: 自动识别的
        047    PSCWS4_ZFLAG_PUT=       0x02    # 单字: 已使用
        048    PSCWS4_ZFLAG_N2=        0x04    # 单字: 双字名词头
        049    PSCWS4_ZFLAG_NR2=       0x08    # 单字: 词头且为双字人名
        050    PSCWS4_ZFLAG_WHEAD= 0x10    # 单字: 词头
        051    PSCWS4_ZFLAG_WPART= 0x20    # 单字: 词尾或词中
        052    PSCWS4_ZFLAG_ENGLISH=   0x40    # 单字: 夹在中间的英文
        053    PSCWS4_ZFLAG_SYMBOL=    0x80    # 单字: 符号系列
        055    PSCWS4_MAX_EWLEN=       16
        056    PSCWS4_MAX_ZLEN=        128
        058    class PSCWS4(object):
        059        _xd = None  # xdb dict handler
        060        _rs = None      # ruleset resource
        061        _rd = None      # ruleset data
        062        _cs = ''    # charset
        063        _ztab = []      # zi len table
        064        _mode = 0   # scws mode
        065        _txt = None     # text string
        066        _res = None
        067        _zis = None     # z if used?(duality)
        068        _off = 0
        069        _len = 0
        070        _wend = 0
        071        _wmap = []
        072        _zmap = []
        073        i = 0
        075        def __init__(self,charset='gbk'):
        076            self._xd = False
        077            self._rs = self._rd = OrderedDict()
        078            self.set_charset(charset)
        079        def __del__(self):
        080            self.close()
        081        def debug(self):
        082            print "off:{0} len(_res):{1} len(_wmap):{2}\
        083     len(_zmap):{3} _wend:{4} _zis:{5}\
        084     len(_rs):{6} len(_rd):{7}\
        085            ".format(\
        086            self._off,len(self._res),len(self._wmap),len(self._zmap),self._wend,self._zis,\
        087            len(self._rs),len(self._rd)
        088            )
        089        #设置字符集(ztab)
        090        def set_charset(self,charset='gbk'):
        091            charset = charset.strip().lower()
        092            if(charset != self._cs):
        093                self._cs = charset
        094                self._ztab = [1 for i in range(0,0x81)]
        095                if(charset == 'utf-8' or charset == 'utf8'):
        096                    self._ztab.extend([1 for i in range(0x81,0xc0)])
        097                    self._ztab.extend([2 for i in range(0xc0,0xe0)])
        098                    self._ztab.extend([3 for i in range(0xe0,0xf0)])
        099                    self._ztab.extend([4 for i in range(0xf0,0xf8)])
        100                    self._ztab.extend([5 for i in range(0xf8,0xfc)])
        101                    self._ztab.extend([6 for i in range(0xfc,0xfe)])
        102                    self._ztab.extend([1])
        103                else:
        104                    self._ztab.extend([2 for i in range(0x81,0xff)])
        105                self._ztab.extend([1])
        106                #print len(self._ztab)
        107            # 设置词典
        108        def set_dict(self,fpath,mem=False):
        109            xdb = XDB_R(mem)
        110            if(xdb.Open(fpath) is not True): return False
        111            self._xd = xdb
        112        #设置规则集
        113        def set_rule(self,fpath):
        114            self._rule_load(fpath)
        115        #设置忽略符号与无用字符
        116        def set_igonre(self,yes):
        117            if(yes is True):self._mode ¦= PSCWS4_IGN_SYMBOL
        118            else: self._mode &= ~PSCWS4_IGN_SYMBOL
        119        #设置复合分词等级 ($level = 0,15)
        120        def set_multi(self,level):
        121            level = (int(level) << 12)
        122            self._mode &= ~PSCWS4_MULTI_MASK
        123            if(level & PSCWS4_MULTI_MASK): self._mode ¦= level
        124        #设置是否显示分词调试信息
        125        def set_debug(self,yes):
        126            if(yes is True): self._mode ¦= PSCWS4_DEBUG
        127            else:self._mode &= ~PSCWS4_DEBUG
        128        #设置是否自动将散字二元化
        129        def set_duality(self,yes):
        130            if(yes is True): self._mode ¦= PSCWS4_DUALITY
        131            else:self._mode &= ~PSCWS4_DUALITY
        132        # 设置要分词的文本字符串
        133        def send_text(self,text):
        134            self._txt = str(text)
        135            self._len = len(self._txt)
        136            self._off =0
        137        # 取回一批分词结果(需要多次调用, 直到返回 false)
        138        def get_result(self):
        139            off = self._off
        140            tlen = self._len
        141            txt = self._txt
        142            self._res = []
        144            while  ((off < tlen) and (ord(txt[off])<=0x20)):
        145                if(txt[off] == "\r" or txt[off] == "\n"):
        146                    self._off = off +1
        147                    self._put_res(off,0,1,'un')
        148                    return self._res
        149                off +=1
        150            if(off >= tlen): return False
        151            self._off = off
        152            ch = txt[off]
        153            cx = ord(ch)
        154            if(self._char_token(ch)):
        155                self._off +=1
        156                self._put_res(off,0,1,'un')
        157                return self._res
        158            clen = self._ztab[cx]
        160            zlen = 1
        161            pflag = (PSCWS4_PFLAG_WITH_MB if clen >1 else (PSCWS4_PFLAG_ALNUM if self._is_alnum(cx) else 0))
        162            off = (off + clen)
        163            while off < tlen:
        164                ch = txt[off]
        165                cx = ord(ch)
        166                if (cx <= 0x20 or self._char_token(ch)):break
        167                clen = self._ztab[cx]
        168                if(not (pflag & PSCWS4_PFLAG_WITH_MB)):
        169                    if(clen ==1):
        170                        if((pflag & PSCWS4_PFLAG_ALNUM) and not self._is_alnum(cx)):
        171                            pflag ^= PSCWS4_PFLAG_ALNUM
        172                    else:
        173                        if(not ((pflag & PSCWS4_PFLAG_ALNUM) ) or zlen > 2): break
        174                        pflag  ¦= PSCWS4_PFLAG_WITH_MB
        175                elif ( ((pflag & PSCWS4_PFLAG_WITH_MB) ) and clen ==1):
        176                    #mb + single-byte. allowd: alpha+num + 中文
        177                    if(not self._is_alnum(cx)): break
        178                    pflag &= ~PSCWS4_PFLAG_VALID
        179                    i = off+1
        180                    while i<(off+3):
        181                        ch = txt[i]
        182                        cx = ord(ch)
        183                        if( (i >= tlen) or (cx <=0x20) or (self._ztab[cx] > 1)):
        184                            pflag ¦= PSCWS4_PFLAG_VALID
        185                            break
        186                        if(not self._is_alnum(cx)): break
        187                        i+=1
        188                    if( not(pflag & PSCWS4_PFLAG_VALID) ): break
        189                    clen += (i - off -1)
        190                #add max zlen limit
        191                zlen +=1
        192                if(zlen >=PSCWS4_MAX_ZLEN):break
        193                off = (off + clen)
        195            #处理半个字的问题
        196            ch =off
        197            if (ch > tlen):
        198                off -= clen
        199            #do the real segment
        200            if(off <= self._off):
        201                return False
        202            elif ( pflag & PSCWS4_PFLAG_WITH_MB ):
        203                self._msegment(off,zlen)
        204            elif ( not(pflag & PSCWS4_PFLAG_ALNUM)  or ((off - self._off) >=PSCWS4_MAX_EWLEN ) ):
        205                self._ssegment(off)
        206            else:
        207                zlen = off -self._off
        208                self._put_res(self._off,2.5*math.log(zlen),zlen,'en')
        209            self._off = (tlen if ch > tlen else off)
        210            if(len(self._res) == 0): return self.get_result()
        211            return self._res
        212        def get_tops(self,limit = 10,xattr = ''):
        213            ret = {}
        214            if(self._txt is None): return False
        215            xmode = False
        216            attrs = {}
        217            if(xattr != ''):
        218                if(xattr[0:1] == '~'):
        219                    xattr = xattr[1:]
        220                    xmode = 1
        221                for tmp in xattr.split(','):
        222                    tmp = tmp.strip().lower()
        223                    if( tmp != ''): attrs[tmp] = True
        224            off = self._off
        225            self._off = cnt = 0
        226            tlist = {}
        227            while 1:
        228                tmpa = self.get_result()
        229                if (not tmpa): break
        230                for tmp in tmpa:
        231                    #有改
        232                    if(tmp['idf'] < 0.2 or tmp['attr'][0:1] == '#'): continue
        233                    if(len(attrs) >0):
        234                        if(xmode == True and not attrs.has_key(tmp['attr'])): continue
        235                        if(xmode == False and attrs.has_key(tmp['attr'])): continue
        236                    word = tmp['word'].lower()
        237                    if(self._rule_checkbit(word,PSCWS4_RULE_NOSTATS)): continue
        238                    if(tlist.has_key(word)):
        239                        tlist[word]['weight'] += tmp['idf']
        240                        tlist[word]['times'] +=1
        241                    else:
        242                        tlist[word] = {'word':tmp['word'],'times':1,'weight':tmp['idf'],'attr':tmp['attr']}
        243            self._off = off
        244            t= sorted(tlist.values(),key=lambda d:d['weight'],cmp=lambda a,b: 1 if b > a else -1)
        245            return t[0:limit]
        246        def close(self):
        247            if(self._xd):
        248                self._xd.Close()
        249                self._xd = False
        250            self._rd = []
        251            self._rs = []
        252        def version(self):
        253            return 'pySCWS/1.0 - by donghongyi'
        254        def _rule_load(self,fpath):
        255            try:
        256                fd = file(fpath,'r')
        257            except IOError:
        258                return False
        259            i = j = 0
        260            self._rs = OrderedDict()
        261            while 1:
        262                buf = fd.readline()
        263                if not buf:
        264                    break
        265                if (buf[0:1] != '['): continue
        266                pos = buf.find(']')
        267                if(pos == -1 or pos ==1 or pos > 15):continue
        268                key = buf[1:pos].lower()
        269                if(self._rs.has_key(key)): continue
        270                item = {'tf':5.0, 'idf':3.5, 'attr':'un', 'bit':0, 'flag':0, 'zmin':0, 'zmax':0, 'inc':0, 'exc':0}
        271                if(key == 'special'):
        272                    item['bit'] = PSCWS4_RULE_SPECIAL
        273                elif (key == 'nostats'):
        274                    item['bit'] = PSCWS4_RULE_NOSTATS
        275                else:
        276                    item['bit'] = (1 << j)
        277                    j +=1
        278                self._rs[key] = item
        279                #这里可能是错误
        280                i +=1
        281                if(i >=PSCWS4_RULE_MAX): break
        282            #load the ruleset
        283            fd.seek(0)
        284            rbl = False
        285            item= {}
        286            while 1:
        287                buf = fd.readline()
        288                if not buf:
        289                    break
        290                ch = buf[0:1]
        291                if(ch == ';'): continue
        292                if(ch == '['):
        293                    item = {}
        294                    pos = buf.find(']')
        295                    if(pos > 1):
        296                        key = buf[1:pos].lower()
        297                        if(self._rs.has_key(key)):
        298                            rbl = True
        299                            item = self._rs[key]
        300                    continue
        301                if(ch == ':'):
        302                    buf = buf[1:]
        303                    pos = buf.find('=')
        304                    if(pos == -1):
        305                        continue
        306                    pkey,pval = buf.split('=',2)
        307                    pkey = pkey.strip()
        308                    pval = pval.strip()
        309                    if(pkey == 'line'):    rbl = False if pval[0:1].strip() == 'n' else True
        310                    elif (pkey =='tf'):    item['tf'] = float(pval)
        311                    elif (pkey =='idf'):    item['idf'] = float(pval)
        312                    elif (pkey =='attr'):    item['attr'] = pval
        313                    elif (pkey == 'znum'):
        314                        pos = pval.find(',')
        315                        if(pos > -1):
        316                            item['zmax'] = int(pval[pos+1:].strip())
        317                            item['flag'] ¦= PSCWS4_ZRULE_RANGE
        318                            pval = pval[0:pos]
        319                        item['zmin'] = int(pval)
        320                    elif (pkey == 'type'):
        321                        if(pval == 'prefix'):
        322                            item['flag'] ¦= PSCWS4_ZRULE_PREFIX
        323                        if(pval == 'suffix'):
        324                            item['flag'] ¦= PSCWS4_ZRULE_SUFFIX
        325                    elif (pkey == 'include' or pkey =='exclude'):
        326                        clude = 0
        327                        for tmp in pval.split(','):
        328                            tmp = tmp.strip().lower()
        329                            if(not self._rs.has_key(tmp)): continue
        330                            clude ¦= self._rs[tmp]['bit']
        331                        if(pkey == 'include'):
        332                            item['inc'] ¦= clude
        333                            item['flag'] ¦= PSCWS4_ZRULE_INCLUDE
        334                        else:
        335                            item['exc'] ¦= clude
        336                            item['flag'] ¦=PSCWS4_ZRULE_EXCLUDE
        337                    continue
        338                if(item == {}): continue
        339                buf = buf.strip()
        340                if (buf == ''): continue
        341                if(rbl):
        342                    self._rd[buf] = item
        343                else:
        344                    tlen = len(buf)
        345                    off =0
        346                    while off < tlen:
        347                        tord = ord(buf[off:off+1])
        348                        zlen = self._ztab[tord]
        349                        if( off + zlen >= tlen): break
        350                        zch = buf[off:off+zlen]
        351                        self._rd[zch] = item
        352                        off += zlen
        353        #get the ruleset
        354        def _rule_get(self,str):
        355            if(not self._rd.has_key(str)): return False
        356            return self._rd[str]
        357        #check the bit with str
        358        def _rule_checkbit(self,str,bit):
        359            if(not self._rd.has_key(str)): return False
        360            bit2 = self._rd[str]['bit']
        361            return (True if (bit & bit2) else False)
        362        #check the rule include ¦ exclude
        363        def _rule_check(self,rule,str):
        364            if( (rule['flag'] & PSCWS4_ZRULE_INCLUDE) and not self._rule_checkbit(str,rule['bit'])): return  False
        365            if( (rule['flag'] & PSCWS4_ZRULE_EXCLUDE) and self._rule_checkbit(str,rule['bit'])): return False
        366            return True
        367        #bulid res
        368        def _put_res(self,o,i,l,a):
        369            word = self._txt[o:o+l]
        370            item = {'word':word,'off':o,'idf':i,'len':l,'attr':a}
        371            self._res.append(item)
        372        #alpha, numeric check by ORD value
        373        def _is_alnum(self,c):
        374            return ((c>=48 and c<=57) or (c>=65 and c<=90) or (c>=97 and c<=122))
        375        def _is_alpha(self,c):
        376            return ((c>=65 and c<=90) or ( c>=97 and c<=122))
        377        def _is_ualpha(self,c):
        378            return (c>=65 and c<=90)
        379        def _is_digit(self,c):
        380            return (c>=48 and c<=57)
        381        def _no_rule1(self,f):
        382            return ((f & (PSCWS4_ZFLAG_SYMBOL¦PSCWS4_ZFLAG_ENGLISH)) or ((f & (PSCWS4_ZFLAG_WHEAD¦PSCWS4_ZFLAG_NR2)) == PSCWS4_ZFLAG_WHEAD))
        383        def _no_rule2(self,f):
        384            return self._no_rule1(f)
        385        def _char_token(self,c):
        386            return (c=='('or c==')'or c=='['or c==']'or c=='{'or c=='}'or c==':'or c=='"')
        387        # query the dict
        388        def _dict_query(self,word):
        389            if(not self._xd): return False
        390            value = self._xd.Get(word)
        391            if(not value): return False
        392            tmp = struct.unpack('f f B 3s',value)
        393            return {'tf':tmp[0],'idf':tmp[1],'flag':tmp[2],'attr':tmp[3].rstrip(b'\x00')}
        394        #ssegment, 单字节用语切割
        395        def _ssegment(self,end):
        396            start = self._off
        397            wlen = end - start
        398            #check special words (need strtoupper)
        399            if(wlen > 1):
        400                #可能出错
        401                txt = self._txt[start:start+wlen].lower()
        402                if(self._rule_checkbit(txt,PSCWS4_RULE_SPECIAL)):
        403                    self._put_res(start,9.5,wlen,'nz')
        404                    return
        405            txt = self._txt
        406            #check brief words such as S.H.E M.R.
        407            if( self._is_ualpha(ord(txt[start])) and txt[start+1] == '.'):
        408                #修改
        409                ch = start +2
        410                while ch< end:
        411                    if(not self._is_alpha(ord(txt[ch]))): break
        412                    ch +=1
        413                    if(ch == end or txt[ch] != '.'): break
        414                    ch +=1
        415                if(ch == end):
        416                    self._put_res(start,7.5,wlen,'nz')
        417                    return
        418            #取出单词及标点. 数字允许一个点且下一个为数字,不连续的. 字母允许一个不连续的'
        419            #print 1111
        420            while start < end:
        421                #修改过的
        422                ch = txt[start]
        423                start +=1
        424                cx = ord(ch)
        425                if(self._is_alnum(cx)):
        426                    pflag =PSCWS4_PFLAG_DIGIT if self._is_digit(cx) else 0
        427                    wlen = 1
        428                    while start < end:
        429                        ch = txt[start]
        430                        cx = ord(ch)
        431                        if(pflag & PSCWS4_PFLAG_DIGIT):
        432                            if(not self._is_digit(cx)):
        433                                if( (pflag & PSCWS4_PFLAG_ADDSYM) or cx !=0x2e or not self._is_digit(ord(txt[start+1]))):
        434                                    break
        435                                pflag ¦= PSCWS4_PFLAG_ADDSYM
        436                        else:
        437                            if(not self._is_alpha(cx)):
        438                                if( (pflag & PSCWS4_PFLAG_ADDSYM) or cx !=0x27 or not self._is_alpha(ord(txt[start+1]))):
        439                                    break
        440                                pflag ¦= PSCWS4_PFLAG_ADDSYM
        441                        start +=1
        442                        #可能出错
        443                        wlen +=1
        444                        if(wlen >=PSCWS4_MAX_EWLEN): break
        445                    self._put_res(start-wlen,2.5*math.log(wlen),wlen,'en')
        446                elif (not(self._mode & PSCWS4_IGN_SYMBOL)):
        447                    self._put_res(start-1,0,1,'un')
        448        #get one z by ZMAP
        449        def _get_zs(self,i,j = -1):
        450            if(j == -1): j = i
        451            return self._txt[self._zmap[i]['start']:self._zmap[i]['start']+( self._zmap[j]['end'] - self._zmap[i]['start'])]
        452        #mget_word
        453        def _mget_word(self,i,j):
        454            wmap = self._wmap
        455            if(not (wmap[i][i]['flag'] & PSCWS4_ZFLAG_WHEAD)): return i
        456            r = i
        457            #观察
        458            #k=i+1
        459            for k in range(i+1,j+1):
        460                #while k<=j:
        461                if(wmap[i][k] and wmap[i][k]['flag'] & PSCWS4_WORD_FULL): r =k
        462                #k+=1
        463            return r
        464        #mset_word
        465        def _mset_word(self,i,j):
        466                wmap = self._wmap
        467                zmap = self._zmap
        468                item = wmap[i][j]
        469                if( (item is False) or (( self._mode & PSCWS4_IGN_SYMBOL)\
        470                    and not (item['flag'] & PSCWS4_ZFLAG_ENGLISH) and item['attr'] == 'un' )\
        471                    ):
        472                    return
        473                #散字自动二元聚合
        474                if(self._mode & PSCWS4_DUALITY):
        475                    k = self._zis
        476                    if(i == j and not(item['flag'] & PSCWS4_ZFLAG_ENGLISH) and item['attr'] == 'un'):
        477                        self._zis = i
        478                        if(k < 0): return
        479                        i = (k & ~PSCWS4_ZIS_USED)
        480                        if( (i != (j-1)) or (not (k & PSCWS4_ZIS_USED) and self._wend == i)):
        481                            self._put_res(zmap[i]['start'],wmap[i][i]['idf'],zmap[i]['end'] - zmap[i]['start'],wmap[i][i]['attr'])
        482                            if( i != (j -1)): return
        483                        self._zis ¦= PSCWS4_ZIS_USED
        484                    else:
        485                        if( (k >=0) and (not (k & PSCWS4_ZIS_USED) or ( j > i))):
        486                            k &= ~PSCWS4_ZIS_USED
        487                            self._put_res(zmap[k]['start'], wmap[k][k]['idf'], zmap[k]['end'] - zmap[k]['start'], wmap[k][k]['attr'])
        488                        if( j > i): self._wend = j + 1
        489                        self._zis = -1
        490                #save the res
        491                self._put_res(zmap[i]['start'], item['idf'], zmap[j]['end'] - zmap[i]['start'], item['attr'])
        492                if( (j -i) > 1):
        493                    m = i
        494                    if ( self._mode & PSCWS4_MULTI_SHORT):
        495                        while (m < j):
        496                            k = m
        497                            n = m+1
        498                            while n<=j:
        499                                if(n ==j and m ==i): break
        500                                item = wmap[m][n]
        501                                if(item and item['flag'] & PSCWS4_WORD_FULL):
        502                                    k = n
        503                                    self._put_res(zmap[m]['start'], item['idf'], zmap[n]['end'] - zmap[m]['start'], item['attr'])
        504                                    if (not (item['flag'] & PSCWS4_WORD_PART)): break
        505                                n +=1
        506                            if (k == m):
        507                                if (m == i): break
        508                                item = wmap[m][m]
        510                                self._put_res(zmap[m]['start'], item['idf'], zmap[m]['end'] - zmap[m]['start'], item['attr'])
        511                            m = k+1
        512                            if(m == j):
        513                                m -=1
        514                                break
        515                    if( self._mode & PSCWS4_MULTI_DUALITY):
        516                        while m < j:
        517                            self._put_res(zmap[m]['start'], wmap[m][m]['idf'], zmap[m+1]['end'] - zmap[m]['start'], wmap[m][m]['attr'])
        518                            m +=1
        519                if( (j > i) and (self._mode & (PSCWS4_MULTI_ZMAIN¦PSCWS4_MULTI_ZALL))):
        520                    if( (j -i) == 1 and not wmap[i][j]):
        521                        if(wmap[i][i]['flag'] & PSCWS4_ZFLAG_PUT): i +=1
        522                        else: wmap[i][i]['flag'] ¦= PSCWS4_ZFLAG_PUT
        523                        wmap[j][j]['flag'] ¦= PSCWS4_ZFLAG_PUT
        524                    #这里可能错误
        525                    while i <=j:
        526                        if(wmap[i][i]['flag'] & PSCWS4_ZFLAG_PUT): continue
        527                        ssss = wmap[i][i]['attr'][0:1]
        528                        #print ssss
        529                        if( not (self._mode & PSCWS4_MULTI_ZALL) and not ( ssss[ssss.find('jnv'):])): continue
        530                        self._put_res(zmap[i]['start'], wmap[i][i]['idf'], zmap[i]['end'] - zmap[i]['start'], wmap[i][i]['attr'])
        531                        i +=1
        532        #mseg_zone
        533        def _mseg_zone(self,f,t):
        534            weight = nweight = 0.0
        535            wmap = self._wmap
        536            zmap = self._zmap
        537            mpath = npath = []
        538            x = f
        539            for i in range(f,t+1):
        540                j = self._mget_word(i,t)
        541                if ( j == i or j <=x or ( wmap[i][j]['flag'] & PSCWS4_WORD_USED)): continue
        542                #one word only
        543                if (i ==f and j ==t):
        544                    mpath = [(j-i),0xff]
        545                    break
        546                if( i !=f and (wmap[i][j]['flag'] & PSCWS4_WORD_RULE)): continue
        547                #create the new path
        548                wmap[i][j]['flag'] ¦= PSCWS4_WORD_USED
        549                nweight = wmap[i][j]['tf'] * (j-i+1)
        551                if (i ==f): nweight *=1.2
        552                elif (j ==t): nweight *=1.4
        553                if(npath == []):
        554                    npath = [0xff for uuu in range(t-f+2)]
        556                #lookfor backward
        557                x = 0
        558                m = f
        559                while m< i:
        560                    n = self._mget_word(m,i-1)
        561                    nweight *= wmap[m][n]['tf'] * (n-m+1)
        562                    npath[x] = n-m
        563                    x +=1
        564                    if(n>m): wmap[m][n]['flag'] ¦= PSCWS4_WORD_USED
        565                    m = n+1
        566                #my self
        567                npath[x] = j-i
        568                x+=1
        569                #lookfor forward
        570                m = j+1
        571                while m <=t:
        572                    n = self._mget_word(m,t)
        573                    nweight *= wmap[m][n]['tf'] * (n-m+1)
        574                    npath[x] = n-m
        575                    x +=1
        576                    if(n >m): wmap[m][n]['flag'] ¦= PSCWS4_WORD_USED
        577                    m = n+1
        578                npath[x] = 0xff
        579                nweight /= pow(x-1,4)
        580                #draw the path for debug
        581                if(self._mode & PSCWS4_DEBUG):
        582                    print "PATH by keyword = {0} (weight={1}):\n".format(self._get_zs(i,j),nweight)
        583                    m=f
        584                    x=0
        585                    n = npath[x]
        586                    while n !=0xff:
        587                        n +=m
        588                        print self._get_zs(m,n),' '
        589                        m = n+1
        590                        x+=1
        591                        n = npath[x]
        592                    print  "\n--\n"
        593                x = j
        594                #check better path
        595                if(nweight > weight):
        596                    weight = copy.deepcopy(nweight)
        597                    swap = copy.deepcopy(mpath)
        598                    mpath = copy.deepcopy(npath)
        599                    npath = copy.deepcopy(swap)
        600                    del swap
        601            #set the result, mpath != NULL
        602            if(mpath == []): return
        603            m = f
        604            x=0
        605            n = mpath[x]
        606            #print mpath
        607            while n !=0xff:
        608                n +=m
        609                #print m,n
        610                self._mset_word(m,n)
        611                m = n +1
        612                x+=1
        613                n = mpath[x]
        614        #msegment(重点函数)
        615        def _msegment(self,end,zlen):
                       """Build the word map for self._txt[self._off:end] and segment it.

                       end  -- offset into self._txt at which the scan of characters stops.
                       zlen -- size used to allocate self._wmap (zlen x zlen) and self._zmap;
                               it is replaced below by the number of entries actually loaded.

                       Steps, in order: (1) load one wmap[i][i]/zmap[i] entry per character
                       (a run of 1-byte characters becomes a single entry), (2) query the
                       dictionary for multi-character words starting at each i, (3) apply the
                       prefix/suffix rules when self._rd is non-empty, (4) emit results through
                       self._mset_word / self._mseg_zone, and optionally self._put_res.

                       NOTE(review): every '¦=' and '¦' in this listing is presumably a '|'
                       that was mangled when the page was extracted -- confirm against the
                       original file before running it.
                       """
        616            self._wmap = [[False for ooooo in range(zlen)] for i in range(zlen)]
        617            self._zmap = [False for ooooo in range(zlen)]
        618            wmap = self._wmap
        619            zmap = self._zmap
        620            txt = self._txt
        621            start = self._off
        622            self._zis = -1
        623            #load the zmap
        624            i =0
        625            #each pass of this loop fills one wmap[i][i] / zmap[i] entry
        626            while start < end:
        627                ch = txt[start]
        628                cx = ord(ch)
        629                clen = self._ztab[cx]
        630                if(clen == 1):
                               # 1-byte character: extend over the following characters until one
                               # whose _ztab length is > 1 is met.
                               # NOTE(review): start is incremented before txt[start] is read, so the
                               # last pass reads txt[end]; if end == len(txt) that would raise
                               # IndexError -- verify against the caller.
        631                    while start < end:
        632                        start +=1 #modified
        633                        cx = ord(txt[start])
        634                        if(self._ztab[cx] > 1): break
        635                        clen +=1
        636                    wmap[i][i] = {'tf':0.5, 'idf':0, 'flag':PSCWS4_ZFLAG_ENGLISH, 'attr':'un'}
        637                else:
        638                    query = self._dict_query(txt[start:start+clen])
        639                    if(not query):
                                   # not in the dictionary: default entry with attr 'un'
        640                        wmap[i][i] = {'tf':0.5, 'idf':0, 'flag':0, 'attr':'un'}
        641                    else:
        642                        if(query['attr'][0:1] == '#'): query['flag'] ¦= PSCWS4_ZFLAG_SYMBOL
        643                        wmap[i][i] = query
        644                    start += clen
                           # byte range of entry i inside txt
        645                zmap[i] = {'start':start-clen, 'end':start}
        646                i+=1
        648            #fixed real zlength
        649            zlen = i
        650            #create word query table
        651            for i in range(zlen):
        652                k=0
        653                j = i +1
        654                while j<zlen:
        655                    query = self._dict_query(self._get_zs(i,j))
        656                    if (not query):break
        657                    ch = query['flag']
        658                    if(ch & PSCWS4_WORD_FULL):
                                   # i..j is a complete word: store it, flag i as a word head and
                                   # i+1..j as word parts
        659                        wmap[i][j] = query
        660                        wmap[i][i]['flag'] ¦= PSCWS4_ZFLAG_WHEAD
        661                        k = i+1
        662                        while k<=j:
        663                            wmap[k][k]['flag'] ¦= PSCWS4_ZFLAG_WPART
        664                            k+=1
                               # stop extending once i..j is not flagged as the beginning of a longer word
        665                    if (not (ch & PSCWS4_WORD_PART)): break
        666                    j +=1
                           # after the decrement k is the end index of the last full word found, or -1 if none
        667                k-=1
        668                if(k and k>=0):
        669                    #set nr2 to some short name
        670                    if(k == (i+1)):
        671                        if(wmap[i][k]['attr'] == 'nr'):
        672                            wmap[i][i]['flag'] ¦= PSCWS4_ZFLAG_NR2
        673                    #clean the PART flag for the last word
        674                    if(k < j):
        675                        wmap[i][k]['flag'] ^= PSCWS4_WORD_PART
        676            # try to do the ruleset match
        677            # for name & zone & chinese numeric
        678            if(len(self._rd) > 0):
        679                #check for 'one word'
        680                for i in range(zlen):
        681                    if(self._no_rule1(wmap[i][i]['flag'])): continue
        682                    r1 = self._rule_get(self._get_zs(i))
        683                    if(not r1): continue
        684                    clen = r1['zmin'] if r1['zmin'] >0 else 1
        685                    if(( r1['flag'] & PSCWS4_ZRULE_PREFIX) and (i < (zlen-clen))):
        686                        #first check that every char within zmin meets the rule, then take the matching chars within the zmax range
        687                        ch =1
        688                        while ch <=clen:
        689                            j = i + ch
        690                            if(j >= zlen or self._no_rule2(wmap[j][j]['flag'])):break
        691                            if(not self._rule_check(r1,self._get_zs(j))): break
        692                            ch+=1
        693                        if(ch <= clen): continue
        694                        #no limit znum or limit to a range
        695                        j = i +ch
        696                        while 1:
        697                            if( (not r1['zmax'] and r1['zmin']) or (r1['zmax'] and (clen >= r1['zmax']))): break
        698                            if(j >= zlen or self._no_rule2(wmap[j][j]['flag'])): break
        699                            if( not self._rule_check(r1,self._get_zs(j))): break
        700                            clen +=1
        701                            j +=1
        702                        # note the case where an original 2-char person name is still 2 chars after recognition
        703                        if(wmap[i][i]['flag'] & PSCWS4_ZFLAG_NR2):
        704                            if(clen ==1): continue
        705                            wmap[i][i+1]['flag'] ¦= PSCWS4_WORD_PART
        706                        #ok, got: i & clen
        707                        k = i + clen
        708                        wmap[i][k] = {'tf':r1['tf'], 'idf':r1['idf'], 'flag':(PSCWS4_WORD_RULE¦PSCWS4_WORD_FULL), 'attr':r1['attr']}
        709                        wmap[i][i]['flag'] ¦= PSCWS4_ZFLAG_WHEAD
        711                        j = i+1
        712                        while j<=k:
        713                            wmap[j][j]['flag'] ¦= PSCWS4_ZFLAG_WPART
        714                            j+=1
                                   # NOTE(review): rebinding i here does not change the next value produced
                                   # by range(), so no positions are skipped; presumably a skip to k was
                                   # intended -- confirm.
        715                        if(not (wmap[i][i]['flag'] & PSCWS4_ZFLAG_WPART)): i =k
        716                        continue
        717                    if( (r1['flag'] & PSCWS4_ZRULE_SUFFIX) and (i >= clen)):
        718                        #suffix, check before
        719                        ch = 1
        720                        while ch<=clen:
        721                            j = i -ch
        722                            if(j < 0 or self._no_rule1(wmap[j][j]['flag'])): break
        723                            if(not self._rule_check(r1, self._get_zs(j))):break
        724                            ch+=1
        725                        if (ch <= clen): continue
        726                        #no limit znum or limit to a range
        727                        j = i - ch
        728                        while 1:
        729                            if( (not r1['zmax'] and r1['zmin']) or (r1['zmax'] and (clen >= r1['zmax']))): break
        730                            if( j < 0 or self._no_rule2(wmap[j][j]['flag'])): break
        731                            if( not self._rule_check(r1,self._get_zs(j))): break
        732                            clen +=1
        733                            j -=1
        734                        #ok, got: i & clen (maybe clen=1 & [k][i] isset)
        735                        k = i -clen
        736                        if(wmap[k][i] is not False): continue
        737                        wmap[k][i] = {'tf':r1['tf'], 'idf':r1['idf'], 'flag':PSCWS4_WORD_FULL, 'attr':r1['attr']}
        738                        wmap[k][k]['flag']  ¦= PSCWS4_ZFLAG_WHEAD
        739                        j = k+1
        740                        while j <=i:
        741                            wmap[j][j]['flag'] ¦= PSCWS4_ZFLAG_WPART
                                       # NOTE(review): the test below looks at wmap[k][i] (just assigned above,
                                       # so never False) but then indexes wmap[k][j], which may still be
                                       # False -- confirm whether wmap[k][j] was meant.
        742                            if( (j != i) and (wmap[k][i] is not False) ): wmap[k][j]['flag'] ¦= PSCWS4_WORD_PART
        743                            j+=1
        744                        continue
        745                #check for 'two words' (such as: 欧阳** , **西路)
        746                #print wmap[6]
        747                for i in range(zlen-2,-1,-1):
        748                    #with value ==> must be have SCWS_WORD_FULL, so needn't check it ag.
        749                    if( (wmap[i][i+1] is False) or wmap[i][i+1]['flag'] & PSCWS4_WORD_PART): continue
        750                    k = i +1
        751                    #print k
        752                    r1= self._rule_get(self._get_zs(i,k))
        753                    if(not r1): continue
        754                    clen =r1['zmin'] if r1['zmin'] else 1
        755                    if( (r1['flag'] & PSCWS4_ZRULE_PREFIX) and (k < (zlen-clen))):
        756                        ch = 1
        757                        while ch<=clen:
        758                            j = k +ch
        759                            if(j >= zlen or self._no_rule2(wmap[j][j]['flag'])): break
        760                            if(not self._rule_check(r1,self._get_zs(j))): break
        761                            ch +=1
        762                        if(ch <= clen):continue
        763                        #no limit znum or limit to a range
        764                        j = k+ch
        765                        while 1:
        766                            if( (not r1['zmax'] and r1['zmin']) or (r1['zmax'] and (clen >=r1['zmax']))): break
        767                            if(j >= zlen or self._no_rule2(wmap[j][j]['flag'])): break
        768                            if(not self._rule_check(r1,self._get_zs(j))): break
        769                            clen +=1
        770                            j +=1
        771                        #ok, got: i & clen
        772                        k = k + clen
        773                        wmap[i][k] = {'tf':r1['tf'], 'idf':r1['idf'], 'flag':PSCWS4_WORD_FULL, 'attr':r1['attr']}
        774                        wmap[i][i+1]['flag'] ¦= PSCWS4_WORD_PART
        775                        j=i+2
        776                        while j<=k:
        777                            wmap[j][j]['flag'] ¦= PSCWS4_ZFLAG_WPART
        778                            j+=1
                                   # NOTE(review): this decrement is overwritten by the next value from range()
        779                        i -=1
        780                        continue
        781                    if ( (r1['flag'] & PSCWS4_ZRULE_SUFFIX) and (i >= clen)):
        782                        # suffix, check before
        783                        ch = 1
        784                        while ch<=clen:
        785                            j = i -ch
        786                            if(j < 0 or self._no_rule1(wmap[j][j]['flag'])): break
        787                            if ( not self._rule_check(r1,self._get_zs(j))): break
        788                            ch +=1
        789                        if (ch <= clen): continue
        790                        #no limit znum or limit to a range
        791                        j = i - ch
        792                        while 1:
        793                            if( (not r1['zmax'] and r1['zmin']) or (r1['zmax'] and (clen >= r1['zmax'])) ): break
        794                            if(j < 0 or self._no_rule2(wmap[j][j]['flag'])): break
        795                            if( not self._rule_check(r1,self._get_zs(j))): break
        796                            clen +=1
        797                            j -=1
        798                        #ok, got: i & clen (maybe clen=1 & [k][i] isset)
        799                        k = i - clen
                                   # i is moved to the second character of the two-char suffix for the writes below
        800                        i = i +1
        801                        wmap[k][i] = {'tf':r1['tf'], 'idf':r1['idf'], 'flag':PSCWS4_WORD_FULL, 'attr':r1['attr']}
        802                        wmap[k][k]['flag'] ¦= PSCWS4_ZFLAG_WHEAD
        803                        j = k+1
        804                        while j<=i:
        805                            wmap[j][j]['flag'] ¦= PSCWS4_ZFLAG_WPART
        806                            if(wmap[k][j] is not False): wmap[k][j]['flag'] ¦= PSCWS4_WORD_PART
        807                            j+=1
                                   # NOTE(review): as above, this adjustment of i is discarded by the for loop
        808                        i -= (clen +1)
        809                        continue
        810            # do the segment really
        811            # find the easy break point
        813            j=0
        814            i=0
                       # j marks the start of the zone not yet emitted; characters flagged WPART are
                       # left inside the current zone
        815            for i in range(zlen):
        816                if(wmap[i][i]['flag'] & PSCWS4_ZFLAG_WPART): continue
        817                if(i > j):
        818                    self._mseg_zone(j,i-1)
        819                j = i
        820                if (not (wmap[i][i]['flag'] & PSCWS4_ZFLAG_WHEAD)):
        821                    self._mset_word(i,i)
        822                    j+=1
                       # Moves i one past the last index visited so that the check below covers the
                       # trailing zone.
                       # NOTE(review): when zlen is 0 this still sets i to 1, so _mseg_zone(0,0) is
                       # called on an empty map -- confirm whether callers can pass an empty span.
        823            i+=1
        824            #the error is here
        825            #the lastest zone
        826            if(i > j):
        827                self._mseg_zone(j,i-1)
                       # duality mode: emit the character remembered in self._zis if it has not been
                       # marked with PSCWS4_ZIS_USED
        828            if( (self._mode & PSCWS4_DUALITY) and (self._zis >=0) and not (self._zis & PSCWS4_ZIS_USED) ):
        829                i = self._zis
        830                self._put_res(zmap[i]['start'],wmap[i][i]['idf'],zmap[i]['end'] - zmap[i]['start'],wmap[i][i]['attr'])
        832    def test(text):
        833            st = time.time()
        834            text = text
        835            for i in range(100):
        836                cws.send_text(text)
        837                while cws.get_result():
        838                    pass
        840            ret = cws.get_tops(10,'r,v,p')
        841            print "No.\tWord\t\t\tAttr\tTimes\tRank\n------------------------------------------------------\n"
        842            i = 0
        843            for tmp in ret:
        844                i+=1
        845                print "%02d.\t%-8s\t%s\t%d\t%.2f" %( i, tmp['word'].decode('gbk'),tmp['attr'], tmp['times'], tmp['weight'])
        847            print u'所花时间:',time.time()-st
        848    if __name__=='__main__':
        849        cws =  PSCWS4('gbk')
        850        cws.set_dict('dict.xdb',True)
        851        cws.set_rule('rules.ini')
        852        cws.send_text("""中国航天官员应邀到美国与太空总署官员开会 发展中国家 上海大学城书店 表面的东西 今天我买了一辆面的,于是我坐着面的去上班 化妆和服装 这个门把手坏了,请把手拿开 将军任命了一名中将,产量三年中将增长两倍 王军虎去广州了,王军虎头虎脑的 欧阳明练功很厉害可是马明练不厉害 北京华烟云 人中出吕布 马中出赤兔Q1,中我要买Q币充值""")
        853        cws.set_igonre(False) #设置忽略符号与无用字符
        854        #cws.set_debug(True) #设置是否显示分词调试信息
        855        cws.set_multi(3) #设置复合分词等级 ($level = 0,15)
        856        cws.set_duality(True) #设置是否自动将散字二元化
        857        #test("中国航天官员应邀到美国与太空总署官员开会 发展中国家 上海大学城书店 表面的东西 今天我买了一辆面的,于是我坐着面的去上班 化妆和服装 这个门把手坏了,请把手拿开 将军任命了一名中将,产量三年中将增长两倍 王军虎去广州了,王军虎头虎脑的 欧阳明练功很厉害可是马明练不厉害 京华烟云 人中出吕布 马中出赤兔Q1,中我要买Q币充值")
        859    while 1:
        860        tmp = cws.get_result()
        861        if(not tmp):break
        862        line = ''
        863        for w in tmp:
        864            if (w['word'] == "\r"): continue
        865            if (w['word'] == "\n"):
        866                line =  line.rstrip(' ') + "\n"
        867            #else: line .= w['word'] . "/{w['attr']} "
        868            else: line += w['word'] + " "
        869        print line
        870    #t = ','
        871    #print len(t[0:2])
        872    #print ord(t[1])
        874    #    ret = cws.get_tops(10,'r,v,p')
        876    #    print "No.\tWord\t\t\tAttr\tTimes\tRank\n------------------------------------------------------\n"
        877    #    i = 0
        878    #    for tmp in ret:
        879    #        i+=1
        880    #       print "%02d.\t%-8s\t%s\t%d\t%.2f" %( i, tmp['word'].decode('gbk'),tmp['attr'], tmp['times'], tmp['weight'])

        view source
        001    #coding=gbk
        002    import os
        003    import struct
        004    import sys
        005    reload(sys)
        006    sys.setdefaultencoding('gbk')
        007    XDB_VERSION = 34 # 0x01 ~ 0xff
        008    XDB_TAGNAME = 'XDB' # First bytes
        009    XDB_MAXKLEN = 0xf0 # maxklen: < 255
        011    class XDB_R(object):
        012        fd = False
        013        hash_base = 0
        014        hash_prime = 0
        015        memread = None #内存
        016        mem = False #是否启用内存
        017        off = 0 #位置
        018        len = 0 #内存长度
        019        def __init__(self,mem=False):
        020            self.mem = mem
        021            pass
        022        def __del__(self):
        023            self.Close()
        024            pass
        025        def Open(self,fpath):
        026            self.Close()
        027            try:
        028                fd = file(fpath,'rb')
        029            except IOError:
        030                raise Exception('XDB::Open("' + os.path.basename(fpath) + '"),invalid xdb failed.')
        031            else:
        032                if(self.mem):
        033                    self.memread = fd.read()
        034                    self.len = len(self.memread)
        035                self.fd = fd
        036            if( self._check_header(fd) is False):
        037                raise Exception('XDB::Open("' + os.path.basename(fpath) + '"),invalid xdb format.')
        038                fd.close()
        039            return True
        040        def _read(self,size):
        041            if(self.mem):
        042                return self.memread[self.off:self.off+size]
        043            else:
        044                return self.fd.read(size)
        045        def _seek(self,seek,flag=False):
        046            if(self.mem):
        047                if self.off > self.len: raise Exception('Mem offset !')
        048                self.off = seek
        049            else:
        050                self.fd.seek(seek,flag)
        051        def _close(self):
        052            if(self.mem):
        053                self.memread = None
        054            else:
        055                self.fd.close()
        056            self.fd = False
        057        def Get(self,key):
        058            if(self.fd is False):
        059                raise Exception('XDB:Get(), null db handler.')
        060            klen = len(key)
        061            #print klen
        062            if(klen ==0 or klen > XDB_MAXKLEN):
        063                return False
        064            rec = self._get_record(key)
        065            if(not rec.has_key('vlen')  or rec['vlen'] ==0):
        066                return False
        068            return rec['value']
        069        def Close(self):
        070            if(self.fd is False):
        071                return
        072            self._close()
        073        def _get_index(self,key):
        074            l = len(key)
        075            h = self.hash_base
        076            while l:
        077                l-=1
        078                h += (h << 5)
        079                h ^= ord(key[l])
        080                h &= 0x7fffffff
        081            return (h % self.hash_prime)
        082        def _check_header(self,fd):
        083            fd.seek(0,os.SEEK_SET)
        084            buf = fd.read(32)
        085            if(len(buf) != 32): return False
        086            unpack = struct.unpack('3s B I I I f 12s',buf)
        087            if(len(unpack) <=6):
        088                unpack = list(unpack)
        089                unpack.extend(' ')
        090            hdr = {}
        091            hdr['tag'],hdr['ver'],hdr['base'],hdr['prime'],hdr['fsize'],hdr['check'],hdr['reversed'] =unpack[0],unpack[1],unpack[2],unpack[3],unpack[4],unpack[5],unpack[6]
        092            if(hdr['tag'] != XDB_TAGNAME): return False
        093            fstat = os.fstat(fd.fileno())
        094            if(fstat.st_size != hdr['fsize']): return False
        095            self.hash_base = hdr['base']
        096            self.hash_prime = hdr['prime']
        097            self.version = hdr['ver']
        098            self.fsize = hdr['fsize']
        099        def _get_record(self,key):
        100            self._io_times = 1
        101            index = self._get_index(key) if self.hash_prime > 1 else 0
        102            poff = index * 8 + 32
        103            self._seek(poff,os.SEEK_SET)
        104            buf = self._read(8)
        106            if(len(buf) ==8):
        107                tmp = struct.unpack('I I',buf)
        108                tmp = {'off':tmp[0],'len':tmp[1]}
        109            else:tmp = {'off':0,'len':0}
        110            return self._tree_get_record(tmp['off'],tmp['len'],poff,key)
        112        def _tree_get_record(self,off,len,poff =0,key =''):
        113            if(len == 0): return {'poff':poff}
        114            self._io_times+=1
        115            self._seek(off,os.SEEK_SET)
        116            rlen = XDB_MAXKLEN + 17
        118            if(rlen > len): rlen = len
        119            buf = self._read(rlen)
        120            unpack = struct.unpack('I I I I B',buf[0:17])
        121            rec = {}
        122            rec['loff'],rec['llen'],rec['roff'],rec['rlen'],rec['klen'] = unpack[0],unpack[1],unpack[2],unpack[3],unpack[4]
        124            fkey = buf[17:17+rec['klen']]
        125            cmpl = cmp(key,fkey) if(key) else 0
        126            #print key.decode('gbk'),fkey.decode('gbk')
        127            if(cmpl > 0):
        128                buf =''
        129                return self._tree_get_record(rec['roff'],rec['rlen'],off+8,key)
        130            elif (cmpl < 0):
        131                buf=''
        132                return self._tree_get_record(rec['loff'],rec['llen'],off,key)
        133            else:
        134                rec['poff'] = poff
        135                rec['off'] = off
        136                rec['len'] = len
        137                rec['voff'] = off + 17 + rec['klen']
        138                rec['vlen'] = len - 17 - rec['klen']
        139                rec['key'] = fkey
        140                self._seek(rec['voff'],os.SEEK_SET)
        141                rec['value'] = self._read(rec['vlen'])
        142                return rec
        143    #
        145    #aa = XDB_R(True)
        146    #aa.Open('./dict.xdb')
        147    #aab = aa.Get('上海')
        148    #print aab

  • 相关阅读:
    java.lang.ClassNotFoundException: com.mysql.jdbc.Driver
    Struts2的配置文件中, <package>的作用,<action><result>重名?
    在Struts2的Action中获得request response session几种方法
  • 原文地址:https://www.cnblogs.com/lexus/p/2814786.html
Copyright © 2011-2022 走看看