zoukankan      html  css  js  c++  java
  • 使用NLTK进行基础的NLP处理

    1 import nltk
    2 from nltk.book import *
    *** Introductory Examples for the NLTK Book ***
    Loading text1, ..., text9 and sent1, ..., sent9
    Type the name of the text or sentence to view it.
    Type: 'texts()' or 'sents()' to list the materials.
    text1: Moby Dick by Herman Melville 1851
    text2: Sense and Sensibility by Jane Austen 1811
    text3: The Book of Genesis
    text4: Inaugural Address Corpus
    text5: Chat Corpus
    text6: Monty Python and the Holy Grail
    text7: Wall Street Journal
    text8: Personals Corpus
    text9: The Man Who Was Thursday by G . K . Chesterton 1908


    统计词语的数量
    1 text7
    <Text: Wall Street Journal>

    1 sent7
    ['Pierre',
     'Vinken',
     ',',
     '61',
     'years',
     'old',
     ',',
     'will',
     'join',
     'the',
     'board',
     'as',
     'a',
     'nonexecutive',
     'director',
     'Nov.',
     '29',
     '.']

    1 len(sent7)
    18

    1 len(text7)
    100676

    1 len(set(text7))
    12408

    1 list(set(text7))[:10]
    ['bottom',
     'Richmond',
     'tension',
     'limits',
     'Wedtech',
     'most',
     'boost',
     '143.80',
     'Dale',
     'refunded']

    词频
    1 dist = FreqDist(text7)
    2 len(dist)
    12408

    1 vocab1 = dist.keys()
    2 #vocab1[:10] 
    3 # In Python 3 dict.keys() returns an iterable view instead of a list
    4 list(vocab1)[:10]
    ['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']

    1 dist['four']
    20

    1 freqwords = [w for w in vocab1 if len(w) > 5 and dist[w] > 100]
    2 freqwords
    ['billion',
     'company',
     'president',
     'because',
     'market',
     'million',
     'shares',
     'trading',
     'program']

    标准化和词干
    1 input1 = "List listed lists listing listings"
    #把字母都小写,再进行分词处理
    2 words1 = input1.lower().split(' ')
    3 words1
    ['list', 'listed', 'lists', 'listing', 'listings']

    1 porter = nltk.PorterStemmer()
    2 [porter.stem(t) for t in words1]
    ['list', 'list', 'list', 'list', 'list']

    词形还原
    1 udhr = nltk.corpus.udhr.words('English-Latin1')
    2 udhr[:20]
    ['Universal',
     'Declaration',
     'of',
     'Human',
     'Rights',
     'Preamble',
     'Whereas',
     'recognition',
     'of',
     'the',
     'inherent',
     'dignity',
     'and',
     'of',
     'the',
     'equal',
     'and',
     'inalienable',
     'rights',
     'of']

    1 [porter.stem(t) for t in udhr[:20]] # Porter stemming, not lemmatization: yields non-word stems
    ['univers',
     'declar',
     'of',
     'human',
     'right',
     'preambl',
     'wherea',
     'recognit',
     'of',
     'the',
     'inher',
     'digniti',
     'and',
     'of',
     'the',
     'equal',
     'and',
     'inalien',
     'right',
     'of']

    1 WNlemma = nltk.WordNetLemmatizer()
    2 [WNlemma.lemmatize(t) for t in udhr[:20]]
    ['Universal',
     'Declaration',
     'of',
     'Human',
     'Rights',
     'Preamble',
     'Whereas',
     'recognition',
     'of',
     'the',
     'inherent',
     'dignity',
     'and',
     'of',
     'the',
     'equal',
     'and',
     'inalienable',
     'right',
     'of']

    分词和分句
    1 #根据空格分词
    2 text11 = "Children shouldn't drink a sugary drink before bed."
    3 text11.split(' ')
    ['Children', "shouldn't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']

    1 #nltk分词
    2 nltk.word_tokenize(text11)
    ['Children',
     'should',
     "n't",
     'drink',
     'a',
     'sugary',
     'drink',
     'before',
     'bed',
     '.']

    1 #nltk分句
    2 text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"
    3 sentences = nltk.sent_tokenize(text12)
    4 len(sentences)
    4

    1 sentences
    ['This is the first sentence.',
     'A gallon of milk in the U.S. costs $2.99.',
     'Is this the third sentence?',
     'Yes, it is!']

    使用NLTK进行文本高级处理
    POS标签
    1 nltk.help.upenn_tagset('MD')
    MD: modal auxiliary
        can cannot could couldn't dare may might must need ought shall should
        shouldn't will would

    1 text13 = nltk.word_tokenize(text11)
    2 nltk.pos_tag(text13)
    [('Children', 'NNP'),
     ('should', 'MD'),
     ("n't", 'RB'),
     ('drink', 'VB'),
     ('a', 'DT'),
     ('sugary', 'JJ'),
     ('drink', 'NN'),
     ('before', 'IN'),
     ('bed', 'NN'),
     ('.', '.')]

    1 text14 = nltk.word_tokenize("Visiting aunts can be a nuisance")
    2 nltk.pos_tag(text14)
    [('Visiting', 'VBG'),
     ('aunts', 'NNS'),
     ('can', 'MD'),
     ('be', 'VB'),
     ('a', 'DT'),
     ('nuisance', 'NN')]

     1 # 解析语法结构
     2 text15 = nltk.word_tokenize("Alice loves Bob")
     3 grammar = nltk.CFG.fromstring("""
     4 S -> NP VP
     5 VP -> V NP
     6 NP -> 'Alice' | 'Bob'
     7 V -> 'loves'
     8 """)
     9 
    10 parser = nltk.ChartParser(grammar)
    11 trees = parser.parse_all(text15)
    12 for tree in trees:
    13     print(tree)
    (S (NP Alice) (VP (V loves) (NP Bob)))

    1 #读取数据
    2 text16 = nltk.word_tokenize("I saw the man with a telescope")
    3 grammar1 = nltk.data.load('mygrammar.cfg')
    4 grammar1
    <Grammar with 13 productions>

    1 #生成语法树
    2 parser = nltk.ChartParser(grammar1)
    3 trees = parser.parse_all(text16)
    4 for tree in trees:
    5     print(tree)
    (S
      (NP I)
      (VP
        (VP (V saw) (NP (Det the) (N man)))
        (PP (P with) (NP (Det a) (N telescope)))))
    (S
      (NP I)
      (VP
        (V saw)
        (NP (Det the) (N man) (PP (P with) (NP (Det a) (N telescope))))))

    1 from nltk.corpus import treebank
    2 text17 = treebank.parsed_sents('wsj_0001.mrg')[0]
    3 print(text17)
    (S
      (NP-SBJ
        (NP (NNP Pierre) (NNP Vinken))
        (, ,)
        (ADJP (NP (CD 61) (NNS years)) (JJ old))
        (, ,))
      (VP
        (MD will)
        (VP
          (VB join)
          (NP (DT the) (NN board))
          (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
          (NP-TMP (NNP Nov.) (CD 29))))
      (. .))

    词性标注和歧义解释
    1 text18 = nltk.word_tokenize("The old man the boat")
    2 nltk.pos_tag(text18)
    [('The', 'DT'), ('old', 'JJ'), ('man', 'NN'), ('the', 'DT'), ('boat', 'NN')]

    1 text19 = nltk.word_tokenize("Colorless green ideas sleep furiously")
    2 nltk.pos_tag(text19)
    [('Colorless', 'NNP'),
     ('green', 'JJ'),
     ('ideas', 'NNS'),
     ('sleep', 'VBP'),
     ('furiously', 'RB')]
  • 相关阅读:
    对数据库文件信息进行批量删除
    php多条件查询
    实现条件查询
    在PHP中设置封装类文件
    php中什么是一维数组什么是二维数组
    登录注册的学习
    git系列---【git提交代码时,文件名过长导致报错:libgit2 returned: invalid path for filesystem】
    git系列---【初始工程文件太大或者文件数太多时,向远程仓库push时总是失败,如何解决?】
    git系列---【git的撤销命令】
    git系列---【git新建分支时,已经推送到远程,发现分支名错了,如何修改分支名,并推送到远程?】
  • 原文地址:https://www.cnblogs.com/zhengzhe/p/8573075.html
Copyright © 2011-2022 走看看