  • 5 Categorizing and Tagging Words

    1. Part-of-speech (POS) taggers — NLTK ships with an English tagger

    import nltk
    text = nltk.word_tokenize("And now for something completely different")  # a list of tokens
    print(nltk.pos_tag(text))
    print(nltk.help.upenn_tagset('RB'))  # documentation for the 'RB' tag
    text1 = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
    print(type(text1))#<class 'nltk.text.Text'>
    text1.similar('bought')
    
     Output:
    
    [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')]
    RB: adverb
        occasionally unabatingly maddeningly adventurously professedly
        stirringly prominently technologically magisterially predominately
        swiftly fiscally pitilessly ...
    None
    <class 'nltk.text.Text'>
    made said done put had seen found given left heard was been brought
    set got that took in told felt
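    
    pos_tag is context-sensitive: the same word form can receive different tags. A quick check with the classic example from the NLTK book (assuming the standard pos_tag model is installed):
    
    text2 = nltk.word_tokenize("They refuse to permit us to obtain the refuse permit")
    print(nltk.pos_tag(text2))
    # the first 'refuse' is tagged as a verb (VBP), the second as a noun (NN)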
    

    2.标注语料库

    Representing a tagged token — str2tuple() converts a 'word/TAG' string into a (word, tag) tuple:

    tagged_token = nltk.tag.str2tuple('fly/NN')
    print(tagged_token)  # ('fly', 'NN')

    sent = """the/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN other/AP topics/NNS."""
    res = [nltk.tag.str2tuple(t) for t in sent.split()]
    print(res)
    Output:
    ('fly', 'NN')
    [('The', 'AT'), ('grand', 'JJ'), ('jury', 'NN'), ('commented', 'VBD'),
    ('on', 'IN'), ('a', 'AT'), ('number', 'NN'), ('of', 'IN'), ('other', 'AP'), ('topics', 'NNS'), ('.', '.')]
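    
    The inverse helper also exists: nltk.tag.tuple2str turns a (word, tag) pair back into the slashed string (a minimal sketch):
    
    print(nltk.tag.tuple2str(('fly', 'NN')))  # 'fly/NN'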
    

    Reading tagged corpora:

    print(nltk.corpus.brown.tagged_words())
    # [('The', 'AT'), ('Fulton', 'NP-TL'), ...]

    print(nltk.corpus.indian)  # tagged corpora exist for other languages too
    from nltk.corpus import brown
    brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
    print(brown_news_tagged)
    tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
    print(tag_fd.keys())
    tag_fd.plot(cumulative=True)  # cumulative frequency plot
    
    Output:
    [('The', 'AT'), ('Fulton', 'NP-TL'), ...]
    <IndianCorpusReader in '.../corpora/indian' (not loaded yet)>
    [('The', 'DET'), ('Fulton', 'NOUN'), ...]
    dict_keys(['DET', 'NOUN', 'ADJ', 'VERB', 'ADP', '.', 'ADV', 'CONJ', 'PRT', 'PRON', 'NUM', 'X'])
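    
    Since tag_fd is an ordinary FreqDist, most_common() ranks the tags by count; in news text the noun tag comes out on top (a small usage sketch; counts vary with the corpus version):
    
    print(tag_fd.most_common(3))  # NOUN is the most frequent tag in the news category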
    

    Nouns, verbs, adjectives, and other word classes

    Nouns
    word_tag_pairs = nltk.bigrams(brown_news_tagged)
    print(word_tag_pairs)
    res1 = list(nltk.FreqDist(a[1] for (a, b) in word_tag_pairs if b[1] == 'N'))
    print(res1)  # [] -- the universal tagset uses 'NOUN', not 'N'; test b[1] == 'NOUN' to get the tags that actually precede nouns
    
    from nltk.corpus import brown
    word_tag = nltk.FreqDist(brown.tagged_words(categories='news'))
    print([word + '/' + tag for (word, tag) in word_tag if tag.startswith('V')])
    wsj = brown.tagged_words(categories='news')
    cfd = nltk.ConditionalFreqDist(wsj)
    print(cfd['money'].keys())
    
    Output:
    <generator object bigrams at 0x00000240463ABE08>
    []
    ['said/VBD', 'produced/VBD', 'took/VBD', 'deserves/VBZ', 'conducted/VBN', 'charged/VBN',...]
    dict_keys(['NN'])
    
    Verbs
    word_tag_fd = nltk.FreqDist(wsj)
    res2 = [word + '/' + tag for (word,tag) in word_tag_fd if tag.startswith('V')]
    print(res2)
    
    wsj = brown.tagged_words()
    cfd1 = nltk.ConditionalFreqDist(wsj)
    print(cfd1['money'].keys())
    print(cfd1.conditions())  # all the words (the conditions)
    
    Output:
    ['said/VBD', 'produced/VBD', 'took/VBD', 'deserves/VBZ', 'conducted/VBN', 'charged/VBN',...]
    dict_keys(['NN', 'NN-HL'])
    ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's",...]
    
    # find words that occur both as past tense VBD and past participle VBN
    # (the 'VD'/'VN' tags of the original belong to the old simplified tagset
    # and never appear among the full Brown tags, so they would match nothing)
    res2 = [w for w in cfd1.conditions() if 'VBD' in cfd1[w] and 'VBN' in cfd1[w]]
    # print(res2)
    idx1 = wsj.index(('kicked', 'VBD'))
    print(idx1)
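    
    With the index in hand, the surrounding context can be inspected, as the NLTK book does (the slice bounds are arbitrary):
    
    print(wsj[idx1-4:idx1+1])  # the four tokens before 'kicked' plus the word itself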
    

    Finding the most frequent nouns of each noun tag type

    def findtags(tag_prefix, tagged_text):
        cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text
                                       if tag.startswith(tag_prefix))
        # cfd.conditions() -> ['NN-TL', 'NN', 'NNS', 'NN-HL', 'NN$-TL', 'NN$', 'NNS-HL', 'NNS-TL', 'NNS$', 'NNS$-TL', 'NN-TL-HL', 'NNS-TL-HL', 'NN$-HL', 'NNS$-HL', 'NN-NC']
        return dict((tag, list(cfd[tag].keys())[:5]) for tag in cfd.conditions())

    tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news'))
    print(tagdict)  # {'NN-TL': ['County', 'Jury', 'City', 'Committee', 'Court'], 'NN': ['investigation', 'primary', 'election', 'evidence', 'place'], ...}
    for tag in sorted(tagdict):
        print(tag, tagdict[tag])
    
    Output:
    {'NN-TL': ['County', 'Jury', 'City', 'Committee', 'Court'],
    'NN': ['investigation', 'primary', 'election', 'evidence', 'place'],
    'NNS': ['irregularities', 'presentments', 'thanks', 'reports', 'voters'], ...}
    NN ['investigation', 'primary', 'election', 'evidence', 'place']
    NN$ ["ordinary's", "court's", "mayor's", "wife's", "governor's"]
    NN$-HL ["Golf's", "Navy's"]
    NN$-TL ["Department's", "Commissioner's", "President's", "Party's", "Mayor's"]
    

    Exploring tagged corpora

    brown_learned_text = brown.words(categories='learned')
    res3 = sorted(set(b for (a, b) in nltk.bigrams(brown_learned_text) if a == 'often'))
    # print(res3)  # [',', '.', 'accomplished', 'analytically', 'appear', 'apt', ...]
    brown_lrnd_tagged = brown.tagged_words(categories='learned')
    tags = [b[1] for (a, b) in nltk.bigrams(brown_lrnd_tagged) if a[0] == 'often']
    print(tags)
    fd = nltk.FreqDist(tags)
    fd.tabulate()
    
    Output:
    ['AP', 'QL', 'VB', 'VBD', 'JJ', ',', 'VB', 'VBN', 'VBN', 'VBD', ',', 'VBN',
    'CS', 'VBN', 'VB', 'VBN', 'VBG', 'IN', 'QL', 'RP', 'VBD', 'VBD', 'RB', 'VB',
    'VBD', 'VB', 'VBD', 'CS', ',', 'CS', 'VBN', 'VB', 'RB', 'VB', 'QL', 'JJ', 'IN',
    'RB', 'VBN', 'JJ', 'VBZ', 'VBN', 'VBN', 'VB', 'VBN', 'QLP', 'BEN', 'VBD', 'JJ',
    'VBD', 'IN', 'IN', 'WRB', 'VB', '.', 'TO', 'VB', 'VBN', 'VBN', 'VBN', 'JJ', 'VBN',
    'VBN', 'HV']
    VBN VB VBD JJ IN QL , CS RB AP VBG RP VBZ QLP BEN WRB . TO HV
    15 10 8 5 4 3 3 3 3 1 1 1 1 1 1 1 1 1 1
    
    # Using POS tags to find verb-to-verb three-word phrases
    
    from nltk.corpus import brown

    def process(sentence):
        for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
            if t1.startswith('V') and t2 == 'TO' and t3.startswith('V'):
                print(w1, w2, w3)

    for tagged_sent in brown.tagged_sents():
        process(tagged_sent)

    # highly ambiguous words: words that carry more than three distinct tags
    brown_news_tagged = brown.tagged_words(categories='news')
    data = nltk.ConditionalFreqDist((word.lower(), tag) for (word, tag) in brown_news_tagged)
    # print(data.conditions())  # ['the', 'fulton', 'county', 'grand', ...]
    for word in data.conditions():
        if len(data[word]) > 3:
            tags = data[word].keys()
            print(word, ' '.join(tags))  # e.g. no   AT RB AT-HL AT-TL
                                         #      that CS WPS DT QL WPO
    
    Output:
    combined to achieve
    continue to place
    serve to protect
    wanted to wait
    allowed to place
    expected to become ...
    
    Using Python dictionaries to map words to their properties
    frequency = nltk.defaultdict(int)
    print(frequency['color'])  # defaults to 0
    alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
    vocab = nltk.FreqDist(alice)
    v1000 = sorted(list(vocab))[:1000]
    mapping = nltk.defaultdict(lambda: 'UNK')
    for v in v1000:
        mapping[v] = v
    alice2 = [mapping[v] for v in alice]
    print(alice2[:100])
    
    Output:
    0
    ['[', 'Alice', "'", 'UNK', 'Adventures', 'UNK', 'Wonderland', 'by', 'Lewis', 'Carroll', '1865', ']', 'CHAPTER', 'I', '.', 'Down', 'UNK', 'Rabbit', '-', 'Hole', 'Alice', 'UNK', 'beginning', 'UNK',
    'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'by', 'UNK', 'UNK', 'UNK', 'UNK', 'bank', ',', 'and', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', ':', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'book', 'UNK',
    'UNK', 'UNK', 'UNK', ',', 'but', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', ',', "'", 'and', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'a', 'book', ",'", 'UNK', 'Alice', "'", 'UNK', 'UNK',
    'UNK', 'conversation', "?'", 'So', 'UNK', 'UNK', 'considering', 'UNK', 'UNK', 'UNK', 'UNK', '(', 'as', 'UNK', 'as', 'UNK', 'UNK', ',']
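    
    A note on defaultdict(int): reading a missing key returns 0 and inserts the key, whereas a plain dict with dict.get returns the default without mutating (a minimal sketch of the difference):
    
    frequency2 = {}
    print(frequency2.get('color', 0))  # 0, and 'color' is NOT inserted
    frequency3 = nltk.defaultdict(int)
    print(frequency3['color'])         # 0, and 'color' IS now a key
    print('color' in frequency3)       # True -- the lookup inserted it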
    
    Incrementally updating a dictionary
    counts = nltk.defaultdict(int)
    from nltk.corpus import brown
    for (word,tag) in brown.tagged_words(categories = 'news'):
     counts[tag] += 1
    print(counts)
    
    from operator import itemgetter
    res5 = sorted(counts.items(), key=itemgetter(1), reverse=True)  # sort by the dictionary's values
    res6 = [t for t,c in sorted(counts.items(),key=itemgetter(1),reverse=True)]
    print(res6)#['NN', 'IN', 'AT', 'NP',...]
    
    Output:
    defaultdict(<class 'int'>, {'AT': 8893, 'NP-TL': 741, 'NN-TL': 2486, 'JJ-TL': 689, 'VBD': 2524,
    'NR': 495, 'NN': 13162, ...})
    
    # Index words by their last two letters
    last_letters = nltk.defaultdict(list)
    words = nltk.corpus.words.words('en')
    for word in words:
     key = word[-2:]
     last_letters[key].append(word)
    print(last_letters['ly'])
    
    anagrams = nltk.defaultdict(list)
    for word in words:
     key = ''.join(sorted(word))
     anagrams[key].append(word)
    print(anagrams['aeilnrt'])  # all words whose letters sort to 'aeilnrt'
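    # equivalently, per the NLTK book, nltk.Index builds the same anagram
    # dictionary in one line (nltk.Index is a defaultdict(list) whose
    # constructor accepts (key, value) pairs)
    anagrams2 = nltk.Index((''.join(sorted(w)), w) for w in words)
    print(anagrams2['aeilnrt'])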
    # Complex keys and values
    pos = nltk.defaultdict(lambda: nltk.defaultdict(int))
    brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')  # 'DET' below is a universal tag, so the universal tagset is needed
    for ((w1, t1), (w2, t2)) in nltk.bigrams(brown_news_tagged):
        pos[(t1, w2)][t2] += 1
    print(pos[('DET', 'right')])  # the tags 'right' takes when preceded by a determiner
    # Inverting a dictionary
    counts = nltk.defaultdict(int)
    for word in nltk.corpus.gutenberg.words('milton-paradise.txt'):
        counts[word] += 1
    res7 = [key for (key, value) in counts.items() if value == 32]
    print(res7)

    # inverting needs a plain word -> tag dict (the nested defaultdict above
    # would not work); the NLTK book uses this small example:
    pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
    pos.update({'cats': 'N'})
    pos2 = nltk.defaultdict(list)
    for key, value in pos.items():
        pos2[value].append(key)
    print(pos2['N'])  # ['ideas', 'cats']
    pos3 = nltk.Index((value, key) for (key, value) in pos.items())
    print(pos3['N'])  # ['ideas', 'cats']
    

    3. Automatic tagging

    The default tagger

    from nltk.corpus import brown
    brown_tagged_sents = brown.tagged_sents(categories = 'news')
    brown_sents = brown.sents(categories = 'news')
    # the default tagger
    tags = [tag for (word,tag) in brown.tagged_words(categories = 'news')]
    print(nltk.FreqDist(tags).max())#NN
    raw = 'I do not like eggs and ham, I do not like them Sam I am!'
    tokens = nltk.word_tokenize(raw)
    default_tagger = nltk.DefaultTagger('NN')  # a tagger that tags every token as NN
    print(default_tagger.tag(tokens))  # tag() applies the tagger
    print(default_tagger.evaluate(brown_tagged_sents))  # 0.13089484257215028 -- evaluate() scores it against the gold standard
    

    The regular-expression tagger

    # Regular-expression tagger. Note that the rules are fixed (chosen by hand); as the rule set becomes more complete, accuracy rises.
    patterns = [
     (r'.*ing$','VBG'),
     (r'.*ed$','VBD'),
     (r'.*es$','VBZ'),
     (r'.*','NN')
    ]
    regexp_tagger = nltk.RegexpTagger(patterns)
    res8 = regexp_tagger.tag(brown_sents[3])
    print(res8)
    res9 = regexp_tagger.evaluate(brown_tagged_sents)
    print(res9)
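    
    The rule list above is deliberately minimal. For reference, the NLTK book's fuller set (a sketch reproduced from the book) adds modals, possessives, plural nouns, and cardinal numbers, lifting accuracy to roughly 0.2:
    
    patterns = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                      # nouns (default)
    ]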
    

    The lookup tagger

    # lookup tagger
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    most_freq_words = fd.most_common(100)  # the 100 most frequent words (fd.keys() is in corpus order, not frequency order)
    likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags)
    res10 = baseline_tagger.evaluate(brown_tagged_sents)
    print(res10)
    sent = brown.sents(categories='news')[3]
    print(baseline_tagger.tag(sent))  # words outside the model get None
    # backoff: fall back to the default tagger when the lookup cannot assign a tag
    baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                         backoff=nltk.DefaultTagger('NN'))
    res11 = baseline_tagger.evaluate(brown_tagged_sents)
    print(res11)
    # lookup tagger performance with models of different sizes
    def performance(cfd, wordlist):
        lt = dict((word, cfd[word].max()) for word in wordlist)
        baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
        return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

    def display():
        import pylab
        words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
        cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
        sizes = 2 ** pylab.arange(15)
        perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
        pylab.plot(sizes, perfs, '-bo')
        pylab.title('Lookup Tagger Performance with Varying Model Size')
        pylab.xlabel('Model Size')
        pylab.ylabel('Performance')
        pylab.show()

    # display()
    

    4. N-gram tagging

    The basic unigram tagger

    # unigram tagging
    from nltk.corpus import brown
    brown_tagged_sents = brown.tagged_sents(categories = 'news')
    brown_sents = brown.sents(categories = 'news')
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    res12 = unigram_tagger.tag(brown_sents[2007])
    print(res12)  # [('Various', 'JJ'), ('of', 'IN'), ('the', 'AT'), ...]
    # split into 90% training data and 10% test data
    size = int(len(brown_tagged_sents) * 0.9)
    train_sents = brown_tagged_sents[:size]
    test_sents = brown_tagged_sents[size:]
    unigram_tagger = nltk.UnigramTagger(train_sents)
    res13 = unigram_tagger.evaluate(test_sents)
    print(res13)
    

    General N-gram taggers

    # general N-gram tagging
    bigram_tagger = nltk.BigramTagger(train_sents)
    res14 = bigram_tagger.tag(brown_sents[2007])
    print(res14)#[('Various', 'JJ'), ('of', 'IN'), ('the', 'AT'),...]
    # A bigram tagger can tag every word of a sentence it saw during training, but not an unseen sentence: as soon as it meets a new word it cannot assign a tag, and every following word gets None as well
    unseen_sent = brown_sents[1006]
    res15 = bigram_tagger.tag(unseen_sent)
    print(bigram_tagger.evaluate(test_sents))#0.10206319146815508
    print(res15)#[('A', 'AT'), ('capsule', 'NN'),
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    res16 = t2.evaluate(test_sents)
    print(res16)  # 0.8452108043456593
    t3 = nltk.TrigramTagger(train_sents, backoff=t2)
    res17 = t3.evaluate(test_sents)
    print(res17)  # 0.843317053722715

    # storing taggers
    from pickle import dump
    output = open('t2.pkl', 'wb')
    dump(t2, output, -1)
    output.close()

    from pickle import load
    input_ = open('t2.pkl', 'rb')
    tagger = load(input_)
    input_.close()
    
    text = """The board's actions shows what free enterprise is up against in our complex maze of regulatory laws."""
    tokens = text.split()
    res18 = tagger.tag(tokens)
    print(res18)  # [('The', 'AT'), ("board's", 'NN$'), ('actions', 'NNS'), ...]

    # performance limitations: what fraction of trigram contexts are ambiguous?
    cfd = nltk.ConditionalFreqDist(
        ((x[1], y[1], z[0]), z[1])
        for sent in brown_tagged_sents
        for x, y, z in nltk.trigrams(sent))
    ambiguous_contexts = [c for c in cfd.conditions() if len(cfd[c]) > 1]
    res19 = sum(cfd[c].N() for c in ambiguous_contexts) / cfd.N()
    print(res19)
    

    Combining taggers

    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents,backoff=t0)
    t2 = nltk.BigramTagger(train_sents,backoff=t1)
    res16 = t2.evaluate(test_sents)
    print(res16)#0.8452108043456593
    t3 = nltk.TrigramTagger(train_sents,backoff=t2)
    res17 = t3.evaluate(test_sents)
    print(res17)#0.843317053722715
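    
    The n-gram taggers also take a cutoff parameter: contexts seen no more than cutoff times during training are discarded, shrinking the model at a small cost in accuracy (a sketch of the option described in the NLTK book):
    
    t2_small = nltk.BigramTagger(train_sents, cutoff=2, backoff=t1)
    print(t2_small.evaluate(test_sents))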
    

    Tagging across sentence boundaries

    # Tagging across sentence boundaries: train with a list of tagged sentences. A sentence-initial word has no preceding n-1 words; the fix is to train the tagger on tagged_sents so that n-gram contexts never cross a sentence boundary
    
    brown_tagged_sents = brown.tagged_sents(categories = 'news')
    brown_sents = brown.sents(categories = 'news')
    size = int(len(brown_tagged_sents) * 0.9)
    train_sents = brown_tagged_sents[:size]
    test_sents = brown_tagged_sents[size:]
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents,backoff=t0)
    t2 = nltk.BigramTagger(train_sents,backoff=t1)
    res16 = t2.evaluate(test_sents)
    print(res16)#0.8452108043456593
    

    5. Transformation-based tagging: the Brill tagger

    # Better than all of the taggers above. The idea: start with broad brush strokes, then fix the details, making successively finer changes.
    # It uses little memory, conditions on context, and corrects its remaining errors incrementally instead of applying a fixed model.
    from nltk.tag import brill
    print(brill.nltkdemo18plus())
    brill.nltkdemo18()
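    
    nltkdemo18() returns a list of 18 rule templates; to actually train a Brill tagger, the templates are combined with an initial tagger via BrillTaggerTrainer (a hedged sketch, reusing train_sents, test_sents, and the backoff tagger t2 from above):
    
    from nltk.tag.brill_trainer import BrillTaggerTrainer
    
    templates = brill.nltkdemo18()
    trainer = BrillTaggerTrainer(initial_tagger=t2, templates=templates, trace=0)
    brill_tagger = trainer.train(train_sents, max_rules=10)  # learn the 10 best rules
    print(brill_tagger.evaluate(test_sents))
    print(brill_tagger.rules()[:3])  # inspect the highest-scoring learned rules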
    
  • Original article: https://www.cnblogs.com/nxf-rabbit75/p/9571373.html