zoukankan      html  css  js  c++  java
  • NLP(一)语料库和WordNet

    原文链接:http://www.one2know.cn/nlp1/

    import nltk
    from nltk.corpus import reuters
    
    # 下载路透社语料库
    nltk.download('reuters')
    
    # 查看语料库的内容
    files = reuters.fileids()
    print(files)
    
    # 访问其中一个文件的内容
    words14826 = reuters.words(['test/14826'])
    print(words14826[:20])
    
    # 输出主题(一共90个)
    reutersGenres = reuters.categories()
    print(reutersGenres)
    
    # 访问一个主题,一句话一行输出
    for w in reuters.words(categories=['tea']):
        print(w + ' ',end='')
        if w is '.':
            print()
    
    from nltk.corpus import CategorizedPlaintextCorpusReader
    
    # 读取语料库
    reader = CategorizedPlaintextCorpusReader(r'D:PyCharm 5.0.3WorkSpace2.NLP语料库1.movie_review_data_1000	xt_sentoken',r'.*.txt',cat_pattern=r'(w+)/*')
    print(reader.categories())
    print(reader.fileids())
    
    # 语料库分成两类
    posFiles = reader.fileids(categories='pos')
    negFiles = reader.fileids(categories='neg')
    
    # 从posFiles或negFiles随机选择一个文件
    from random import randint
    fileP = posFiles[randint(0,len(posFiles)-1)]
    fileN = negFiles[randint(0,len(negFiles)-1)]
    
    # 逐句打印随机的选择文件
    for w in reader.words(fileP):
        print(w + ' ',end='')
        if w is '.':
            print()
    for w in reader.words(fileN):
        print(w + ' ',end='')
        if w is '.':
            print()
    

    CategorizedPlaintextCorpusReader类通过参数的设置,从内部将样本加载到合适的位置

    • 语料库中的词频计算和计数分布分析
      以布朗语料库为例:布朗大学 500个文本 15个类
    import nltk
    from nltk.corpus import brown
    
    nltk.download('brown')
    
    # 查看brown中的类别
    print(brown.categories())
    
    # 挑选出三种类别,并获取其中的疑问词
    genres = ['fiction','humor','romance']
    whwords = ['what','which','how','why','when','where','who']
    
    # 迭代器分别分析3种类
    for i in range(0,len(genres)):
        genre = genres[i]
        print()
        print("Analysing '"+ genre + "' wh words")
        genre_text = brown.words(categories = genre)
        print(genre_text)
    
        # 返回输入单词对象的wh类及对应的频率
        fdist = nltk.FreqDist(genre_text)
        for wh in whwords:
            print(wh + ':',fdist[wh],end='  ')
        print()
    

    输出:

    ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
    
    Analysing 'fiction' wh words
    ['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back', ...]
    what: 128  which: 123  how: 54  why: 18  when: 133  where: 76  who: 103  
    
    Analysing 'humor' wh words
    ['It', 'was', 'among', 'these', 'that', 'Hinkle', ...]
    what: 36  which: 62  how: 18  why: 9  when: 52  where: 15  who: 48  
    
    Analysing 'romance' wh words
    ['They', 'neither', 'liked', 'nor', 'disliked', 'the', ...]
    what: 121  which: 104  how: 60  why: 34  when: 126  where: 54  who: 89  
    
    • 网络文本和聊天文本的词频分布
    import nltk
    from nltk.corpus import webtext
    
    # nltk.download('webtext')
    print(webtext.fileids())
    
    # 选择一个数据文件,并计算频率分布,获得FreqDist的对象fdist
    fileid = 'singles.txt'  # 个人广告
    wbt_words = webtext.words(fileid)
    fdist = nltk.FreqDist(wbt_words)
    
    # 获取高频单词及其计数
    print('最多出现的词 "' , fdist.max() , '" :' , fdist[fdist.max()])
    
    # 获取所有单词的计数
    print(fdist.N())
    
    # 找出最常见的10个词
    print(fdist.most_common(10))
    
    # 将单词和频率制成表格
    print(fdist.tabulate(5))
    
    # 将单词和频率制成分布图
    fdist.plot(cumulative=True) # 计数显示,cumulative=percents为百分比显示
    

    输出:

    ['firefox.txt', 'grail.txt', 'overheard.txt', 'pirates.txt', 'singles.txt', 'wine.txt']
    最多出现的词 " , " : 539
    4867
    [(',', 539), ('.', 353), ('/', 110), ('for', 99), ('and', 74), ('to', 74), ('lady', 68), ('-', 66), ('seeks', 60), ('a', 52)]
      ,   .   / for and 
    539 353 110  99  74 
    None
    

    累计计数分布图:

    • 使用WordNet获取一个词的不同含义
    # import nltk
    # nltk.download('wordnet')
    
    from nltk.corpus import wordnet as wn
    chair = 'chair'
    
    # 输出chair的各种含义
    chair_synsets = wn.synsets(chair)
    print('Chair的意思:',chair_synsets,'
    
    ')
    
    # 迭代输出 含义,含义的定义,同义词条,例句
    for synset in chair_synsets:
        print(synset,': ')
        print('Definition: ',synset.definition())
        print('Lemmas/Synonymous words: ',synset.lemma_names())
        print('Example: ',synset.examples(),'
    ')
    

    输出:

    Chair的意思: [Synset('chair.n.01'), Synset('professorship.n.01'), Synset('president.n.04'), Synset('electric_chair.n.01'), Synset('chair.n.05'), Synset('chair.v.01'), Synset('moderate.v.01')] 
    
    Synset('chair.n.01') : 
    Definition:  a seat for one person, with a support for the back
    Lemmas/Synonymous words:  ['chair']
    Example:  ['he put his coat over the back of the chair and sat down'] 
    
    Synset('professorship.n.01') : 
    Definition:  the position of professor
    Lemmas/Synonymous words:  ['professorship', 'chair']
    Example:  ['he was awarded an endowed chair in economics'] 
    
    Synset('president.n.04') : 
    Definition:  the officer who presides at the meetings of an organization
    Lemmas/Synonymous words:  ['president', 'chairman', 'chairwoman', 'chair', 'chairperson']
    Example:  ['address your remarks to the chairperson'] 
    
    Synset('electric_chair.n.01') : 
    Definition:  an instrument of execution by electrocution; resembles an ordinary seat for one person
    Lemmas/Synonymous words:  ['electric_chair', 'chair', 'death_chair', 'hot_seat']
    Example:  ['the murderer was sentenced to die in the chair'] 
    
    Synset('chair.n.05') : 
    Definition:  a particular seat in an orchestra
    Lemmas/Synonymous words:  ['chair']
    Example:  ['he is second chair violin'] 
    
    Synset('chair.v.01') : 
    Definition:  act or preside as chair, as of an academic department in a university
    Lemmas/Synonymous words:  ['chair', 'chairman']
    Example:  ['She chaired the department for many years'] 
    
    Synset('moderate.v.01') : 
    Definition:  preside over
    Lemmas/Synonymous words:  ['moderate', 'chair', 'lead']
    Example:  ['John moderated the discussion'] 
    
    • 上位词和下位词
      下位词更具体,上位词更一般(泛化)
      bed.n.01woman.n.01为例:
    from nltk.corpus import wordnet as wn
    
    woman = wn.synset('woman.n.01')
    bed = wn.synset('bed.n.01')
    
    # 返回据有直系关系的同义词集,上位词!
    print(woman.hypernyms())
    woman_paths = woman.hypernym_paths()
    
    # 打印从根节点到woman.n.01的所有路径
    for idx,path in enumerate(woman_paths):
        print('
    
    Hypernym Path :',idx+1)
        for synset in path:
            print(synset.name(),',',end='')
    
    # 更具体的术语,下位词!
    types_of_bed = bed.hyponyms()
    print('
    
    Types of beds(Hyponyms): ',types_of_bed)
    
    # 打印出更有意义的lemma(词条)
    print('
    ',sorted(set(lemma.name() for synset in types_of_bed for lemma in synset.lemmas())))
    

    输出:

    [Synset('adult.n.01'), Synset('female.n.02')]
    
    Hypernym Path : 1
    entity.n.01 ,physical_entity.n.01 ,causal_agent.n.01 ,person.n.01 ,adult.n.01 ,woman.n.01 ,
    
    Hypernym Path : 2
    entity.n.01 ,physical_entity.n.01 ,object.n.01 ,whole.n.02 ,living_thing.n.01 ,organism.n.01 ,person.n.01 ,adult.n.01 ,woman.n.01 ,
    
    Hypernym Path : 3
    entity.n.01 ,physical_entity.n.01 ,causal_agent.n.01 ,person.n.01 ,female.n.02 ,woman.n.01 ,
    
    Hypernym Path : 4
    entity.n.01 ,physical_entity.n.01 ,object.n.01 ,whole.n.02 ,living_thing.n.01 ,organism.n.01 ,person.n.01 ,female.n.02 ,woman.n.01 ,
    
    Types of beds(Hyponyms):  [Synset('berth.n.03'), Synset('built-in_bed.n.01'), Synset('bunk.n.03'), Synset('bunk_bed.n.01'), Synset('cot.n.03'), Synset('couch.n.03'), Synset('deathbed.n.02'), Synset('double_bed.n.01'), Synset('four-poster.n.01'), Synset('hammock.n.02'), Synset('marriage_bed.n.01'), Synset('murphy_bed.n.01'), Synset('plank-bed.n.01'), Synset('platform_bed.n.01'), Synset('sickbed.n.01'), Synset('single_bed.n.01'), Synset('sleigh_bed.n.01'), Synset('trundle_bed.n.01'), Synset('twin_bed.n.01'), Synset('water_bed.n.01')]
    
     ['Murphy_bed', 'berth', 'built-in_bed', 'built_in_bed', 'bunk', 'bunk_bed', 'camp_bed', 'cot', 'couch', 'deathbed', 'double_bed', 'four-poster', 'hammock', 'marriage_bed', 'plank-bed', 'platform_bed', 'sack', 'sickbed', 'single_bed', 'sleigh_bed', 'truckle', 'truckle_bed', 'trundle', 'trundle_bed', 'twin_bed', 'water_bed']
    
    • 基于WordNet计算某种词性的多义性
      以名词n为例:
    from nltk.corpus import wordnet as wn
    
    type = 'n' #动词v,副词r,形容词a
    
    # 返回WordNet中所有type类型的同义词集
    sysnets = wn.all_synsets(type)
    
    # 将所有词条合并成一个大list
    lemmas = []
    for sysnet in sysnets:
        for lemma in sysnet.lemmas():
            lemmas.append(lemma.name())
    
    # 删除重复词条,list=>set
    lemmas = set(lemmas)
    
    # 计算每个词条type类型的含义数并加到一起
    count = 0
    for lemma in lemmas:
        count = count + len(wn.synsets(lemma,type)) # lemma在type类型下的所有含义
    
    # 打印所有数值
    print('%s总词条数: '%(type),len(lemmas))
    print('%s总含义数: '%(type),count)
    print('%s平均多义性: '%(type),count/len(lemmas))
    

    输出:

    n总词条数:  119034
    n总含义数:  152763
    n平均多义性:  1.2833560159282222
    
  • 相关阅读:
    Mac OS中使用VScode配置C语言开发环境
    图形数据库(GraphDB)
    将博客搬至CSDN
    Nodejs课堂笔记-第四课 Dynamodb为何物
    Nodejs课堂笔记-第三课 构建一个nodejs的Docker镜像
    Nodejs课堂笔记-第二课 package.json的作用
    MyBatis 3(中文版) 第四章 使用注解配置SQL映射器
    idea hibernate jpa 生成实体类
    Maven常用配置
    spring发送邮件
  • 原文地址:https://www.cnblogs.com/peng8098/p/nlp_1.html
Copyright © 2011-2022 走看看