zoukankan      html  css  js  c++  java
  • 结巴分词

    #!coding: utf-8
    
    import sys
    import jieba
    import jieba.posseg as pseg
    import jieba.analyse as anal
    from optparse import OptionParser
    
    # Command-line interface.  All options are optional; each one is only
    # consulted by the part of the script that reads the matching opt.<name>.
    usage = ("usage: python %prog [--tag] [--fast] [--tfidf topK] "
             "[--textr topK] [--stopdict FILE]")
    parser = OptionParser(usage)
    parser.add_option("--tag", dest="tag", action="store_true",
                      help="print each word with its part-of-speech tag")
    parser.add_option("--fast", dest="fast", action="store_true",
                      help="enable jieba parallel mode")
    parser.add_option("--tfidf", dest="tfidf", metavar="topK",
                      help="number of keywords to extract with TF-IDF")
    parser.add_option("--textr", dest="textr", metavar="topK",
                      help="number of keywords to extract with TextRank")
    parser.add_option("--stopdict", dest="stopdict", metavar="FILE",
                      help="stop-word file, one word per line")
    # opt carries the parsed option values; args holds leftover positionals.
    opt, args = parser.parse_args()
    
    def wordFilter(wordlist):
        """Drop empty entries and stop words from a list of segmented words.

        Stop words are read from the file named by ``opt.stopdict`` (the
        ``--stopdict`` command-line option), one word per line.  The file is
        compared byte-for-byte against the UTF-8 encoding of each word, so it
        is expected to be UTF-8 encoded itself.  When the option is not given
        a notice is printed and no word is treated as a stop word, so the
        input is returned unfiltered instead of failing on an undefined stop
        list.

        Args:
            wordlist: iterable of text strings (anything with
                ``.encode("utf-8")``); falsy entries are skipped.

        Returns:
            list: the surviving words, UTF-8 encoded, in their input order.
        """
        if opt.stopdict:
            # Binary mode keeps the stop words as bytes, the same form the
            # candidate words are compared in below.
            with open(opt.stopdict, "rb") as f:
                stop_words = set(f.read().strip().splitlines())
        else:
            print("please special stopword file path")
            stop_words = set()

        encoded = (word.encode("utf-8") for word in wordlist if word)
        return [word for word in encoded if word not in stop_words]
    
    
    
    def wordPosFilter(wordlist):
        """Keep the words that are not stop words and carry a whitelisted tag.

        Stop words are read from the file named by ``opt.stopdict`` (one word
        per line, compared as UTF-8 bytes); without that option no word is
        treated as a stop word and only the tag whitelist applies.

        Args:
            wordlist: iterable of objects exposing a ``word`` attribute (text)
                and a ``flag`` attribute (part-of-speech tag), such as the
                items produced by ``pseg.lcut``.

        Returns:
            list: the UTF-8 encoded words that passed both checks, in their
            input order.
        """
        if opt.stopdict:
            # Binary mode keeps the stop words as bytes, the same form the
            # candidate words are compared in below.
            with open(opt.stopdict, "rb") as f:
                stop_words = set(f.read().strip().splitlines())
        else:
            stop_words = set()

        # Part-of-speech tags that are allowed through the filter.
        save_post = frozenset(
            ["an", "n", "nr", "ns", "nt", "nz", "v", "vd", "eng", "ni"])

        returnlist = []
        for w in wordlist:
            word = w.word.encode("utf-8")
            if word not in stop_words and w.flag in save_post:
                returnlist.append(word)

        return returnlist
    
    # Sample Chinese text used by the segmentation demos in this script.  The
    # line breaks are written as explicit "\n" escapes so the four lines stay
    # inside one valid string literal.
    txt = (
        "支持三种分词模式: 精确模式,试图将句子最精确地切开,适合文本分析; \n"
        "全模式,把句子中所有的可以成词的词语都扫描出来, 速度非常快,但是不能解决歧义; \n"
        "搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。  \n"
        "支持繁体分词 支持自定义词典 MIT 授权协议 在线演示"
    )
    
    
    # Multi-process segmentation, switched on by --fast; 10 is the value
    # handed to jieba.enable_parallel.
    if opt.fast:
        jieba.enable_parallel(10)

    # Register a custom word with jieba's dictionary.
    jieba.add_word("全模式")
    # NOTE(review): the segment given here is a pair of empty strings, which
    # looks like a placeholder -- confirm which word pair was meant to be tuned.
    jieba.suggest_freq(("", ""), True)

    # A user dictionary file could be loaded here:
    # jieba.load_userdict(dictfilepath)

    # jieba.cut(txt) is the generator form; lcut returns a list directly.
    rest = jieba.lcut(txt)

    # Single-argument print(...) behaves like the print statement on Python 2.
    print("/".join(rest))
    print("=========================  filter ===========================")
    rest = wordFilter(rest)
    print("/".join(rest))

    # Segmentation with part-of-speech tags, then the tag-based filter.
    psss = pseg.lcut(txt)

    print("=========================  posFilter  ===========================")
    psss = wordPosFilter(psss)
    print("/".join(psss))

    # The script stops here; the statements after this call never execute.
    sys.exit()
    
    # NOTE: these demos are only reached if the sys.exit() call that precedes
    # them in the script is removed.

    # Segmentation returned as a list.
    print("/".join(jieba.lcut(txt)))

    # Search-engine mode segmentation.
    print("/".join(jieba.cut_for_search(txt)))

    # Word positions: each item unpacks as (word, start, end).
    res = jieba.tokenize(txt.decode("utf-8"))
    # res = jieba.tokenize(txt.decode("utf-8"), mode="search")  # search mode
    print("word\t\tstart\t\tend")
    for tk in res:
        print("%s\t\t %d \t\t %d" % (tk[0], tk[1], tk[2]))

    # Part-of-speech tagging (--tag): one line of space-separated word(tag)
    # items.  Building the line first ends it with a newline, so the keyword
    # output below no longer continues on the same line.
    if opt.tag:
        print(" ".join(w + "(" + k + ")" for w, k in pseg.cut(txt)))

    # Keywords ranked by TF-IDF (--tfidf topK), printed as "word weight".
    if opt.tfidf:
        topK = int(opt.tfidf)
        tags = anal.extract_tags(txt, topK, withWeight=True)

        for word, weight in tags:
            print("%s %s" % (word, weight))

    # Keywords ranked by TextRank (--textr topK), printed as "word weight".
    if opt.textr:
        topk = int(opt.textr)
        tags = anal.textrank(txt, topk, withWeight=True)

        for word, weight in tags:
            print("%s %s" % (word, weight))
  • 相关阅读:
    IP 封包中的 Header 的 Protocol 字段的 值
    二叉树算法题
    PageFile Swap File
    Quick sort C# code(2)
    VS中Sos调试扩展简介 (转帖)
    BUG: "Old format or invalid type library" error when automating Excel on 64 bit server 2008
    Sql server 2005 connection string
    让IE支持自己的协议
    偶然间,我发现了一个秘密能使盗版的windowsXP变成正版
    Excel C# Automation
  • 原文地址:https://www.cnblogs.com/demonxian3/p/9173886.html
Copyright © 2011-2022 走看看