    Python for Data Science

    Chapter 6 - Data Sourcing via Web

    Segment 5 - Introduction to NLP

    import nltk
    
    text = "On Wednesday, the Association for Computing Machinery, the world’s largest society of computing professionals, announced that Hinton, LeCun and Bengio had won this year’s Turing Award for their work on neural networks. The Turing Award, which was introduced in 1966, is often called the Nobel Prize of computing, and it includes a $1 million prize, which the three scientists will share."
    
    nltk.set_proxy('http://192.168.2.16:1080')  # author's local proxy; not needed with direct internet access
    nltk.download('punkt')                      # Punkt sentence-tokenizer models
    
    [nltk_data] Downloading package punkt to /home/ericwei/nltk_data...
    [nltk_data]   Package punkt is already up-to-date!
    
    True

    Sentence Tokenizer

    from nltk.tokenize import sent_tokenize
    sent_tk = sent_tokenize(text)
    print("Sentence tokenizing the text: 
    ")
    print(sent_tk)
    
    Sentence tokenizing the text: 
    
    ['On Wednesday, the Association for Computing Machinery, the world’s largest society of computing professionals, announced that Hinton, LeCun and Bengio had won this year’s Turing Award for their work on neural networks.', 'The Turing Award, which was introduced in 1966, is often called the Nobel Prize of computing, and it includes a $1 million prize, which the three scientists will share.']
    

    Word Tokenizer

    from nltk.tokenize import word_tokenize
    word_tk = word_tokenize(text)
    print("Word tokenizing the text: 
    ")
    print(word_tk)
    
    Word tokenizing the text: 
    
    ['On', 'Wednesday', ',', 'the', 'Association', 'for', 'Computing', 'Machinery', ',', 'the', 'world', '’', 's', 'largest', 'society', 'of', 'computing', 'professionals', ',', 'announced', 'that', 'Hinton', ',', 'LeCun', 'and', 'Bengio', 'had', 'won', 'this', 'year', '’', 's', 'Turing', 'Award', 'for', 'their', 'work', 'on', 'neural', 'networks', '.', 'The', 'Turing', 'Award', ',', 'which', 'was', 'introduced', 'in', '1966', ',', 'is', 'often', 'called', 'the', 'Nobel', 'Prize', 'of', 'computing', ',', 'and', 'it', 'includes', 'a', '$', '1', 'million', 'prize', ',', 'which', 'the', 'three', 'scientists', 'will', 'share', '.']
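
    Note that the curly apostrophe (’) in "world’s" is split into separate tokens ('world', '’', 's'), because the Penn Treebank tokenizer only recognizes the straight ASCII apostrophe in clitics. A minimal sketch of normalizing the text first (the variable name norm_text is illustrative):

    # Replace curly apostrophes so clitics stay attached,
    # e.g. "world’s" tokenizes as 'world' and "'s".
    norm_text = text.replace('\u2019', "'")
    print(word_tokenize(norm_text))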
    

    Removing stop words

    nltk.download('stopwords')
    
    [nltk_data] Downloading package stopwords to
    [nltk_data]     /home/ericwei/nltk_data...
    [nltk_data]   Unzipping corpora/stopwords.zip.
    
    True
    
    from nltk.corpus import stopwords
    
    sw = set(stopwords.words("english"))
    print("Stop words in English language are: 
    ")
    print(sw)
    
    Stop words in English language are: 
    
    {'each', "weren't", 'just', 'on', 'o', 'all', "won't", 'how', 'own', 'didn', 'shouldn', 'will', 'out', 'against', 'off', 'very', 'now', 'that', 'weren', 'if', 'ain', 'ma', 'it', 'the', 'i', 'yourself', "hadn't", 'needn', 'have', "she's", 'an', 'he', 'because', 'for', 'few', "mustn't", 'than', 'don', 'and', 'other', 'were', 'should', 're', 'there', 'll', 'down', 'couldn', 'herself', 'then', "needn't", 'my', 'is', 'she', 'with', 'where', 'having', 'from', 'himself', "haven't", "isn't", 'after', 'no', 'has', 'am', 'does', 'between', 'a', 'mustn', 'did', 'being', 'at', 'doesn', "couldn't", 'y', 'yourselves', 's', 'who', 'until', 'what', 'myself', 'hers', 'those', "you've", "you'd", 'mightn', 'above', 'had', 'themselves', 'any', 'more', "hasn't", 'during', "doesn't", 'aren', 'these', 'hadn', 'whom', 'are', 'won', 'through', 'hasn', 'further', "don't", "wouldn't", "mightn't", 'too', 'why', 'itself', 'm', 'most', 'such', "you're", 'to', 'while', 'over', 'nor', 'ourselves', 'doing', 'they', "wasn't", 'been', 'shan', 'do', 'd', 'up', 'was', "didn't", 'some', "shouldn't", 'so', "it's", 'me', 'again', "should've", 'them', 'but', 'same', 'or', "aren't", 'her', 'below', 'wasn', 'be', "that'll", 'him', 'in', 'when', 'about', 'as', 'can', 'our', 'under', 'both', 'once', 'before', 'their', 'wouldn', 'here', 've', 'which', 'his', 'not', 'isn', 'theirs', 'only', 'its', 'we', 'of', 'you', "you'll", 'by', 'haven', "shan't", 'this', 'ours', 'yours', 't', 'your', 'into'}
    
    filtered_words = [w for w in word_tk if not w in sw]
    
    print("The text after removing stop words 
    ")
    print(filtered_words)
    
    The text after removing stop words 
    
    ['On', 'Wednesday', ',', 'Association', 'Computing', 'Machinery', ',', 'world', '’', 'largest', 'society', 'computing', 'professionals', ',', 'announced', 'Hinton', ',', 'LeCun', 'Bengio', 'year', '’', 'Turing', 'Award', 'work', 'neural', 'networks', '.', 'The', 'Turing', 'Award', ',', 'introduced', '1966', ',', 'often', 'called', 'Nobel', 'Prize', 'computing', ',', 'includes', '$', '1', 'million', 'prize', ',', 'three', 'scientists', 'share', '.']
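
    The stop-word list is all lowercase, so capitalized tokens such as 'On' and 'The' slip through the filter above. A minimal case-insensitive variant (it keeps the original casing in the output):

    # Compare lowercased tokens against the stop-word set,
    # so 'On' and 'The' are filtered along with 'on' and 'the'.
    filtered_ci = [w for w in word_tk if w.lower() not in sw]
    print(filtered_ci)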
    

    Stemming

    from nltk.stem import PorterStemmer
    from nltk.tokenize import sent_tokenize, word_tokenize
    
    port_stem = PorterStemmer()
    
    stemmed_words = []
    
    for w in filtered_words:
        stemmed_words.append(port_stem.stem(w))
        
    print("Filtered Sentence: 
    ", filtered_words, "
    ")
    print("Stemmed Sentence: 
    ", stemmed_words)
    
    Filtered Sentence: 
     ['On', 'Wednesday', ',', 'Association', 'Computing', 'Machinery', ',', 'world', '’', 'largest', 'society', 'computing', 'professionals', ',', 'announced', 'Hinton', ',', 'LeCun', 'Bengio', 'year', '’', 'Turing', 'Award', 'work', 'neural', 'networks', '.', 'The', 'Turing', 'Award', ',', 'introduced', '1966', ',', 'often', 'called', 'Nobel', 'Prize', 'computing', ',', 'includes', '$', '1', 'million', 'prize', ',', 'three', 'scientists', 'share', '.'] 
    
    Stemmed Sentence: 
     ['On', 'wednesday', ',', 'associ', 'comput', 'machineri', ',', 'world', '’', 'largest', 'societi', 'comput', 'profession', ',', 'announc', 'hinton', ',', 'lecun', 'bengio', 'year', '’', 'ture', 'award', 'work', 'neural', 'network', '.', 'the', 'ture', 'award', ',', 'introduc', '1966', ',', 'often', 'call', 'nobel', 'prize', 'comput', ',', 'includ', '$', '1', 'million', 'prize', ',', 'three', 'scientist', 'share', '.']
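
    The Porter stemmer lowercases everything and often produces non-words ('associ', 'machineri', even 'ture' for 'Turing'). NLTK also ships the slightly more conservative Snowball stemmer; a minimal sketch for comparison:

    # Snowball ("Porter2") is a refinement of the Porter algorithm;
    # results are similar but differ on some words.
    from nltk.stem import SnowballStemmer
    snow_stem = SnowballStemmer("english")
    print([snow_stem.stem(w) for w in filtered_words])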
    

    Lemmatizing

    nltk.download('wordnet')
    
    [nltk_data] Downloading package wordnet to /home/ericwei/nltk_data...
    [nltk_data]   Package wordnet is already up-to-date!
    
    True
    
    from nltk.stem.wordnet import WordNetLemmatizer
    
    lem = WordNetLemmatizer()
    
    from nltk.stem.porter import PorterStemmer
    stem = PorterStemmer()  # instantiated for comparison with the lemmatizer, but not used below
    
    lemm_words = []
    
    for w in filtered_words:
        lemm_words.append(lem.lemmatize(w))
        
    print(lemm_words)
    
    ['On', 'Wednesday', ',', 'Association', 'Computing', 'Machinery', ',', 'world', '’', 'largest', 'society', 'computing', 'professional', ',', 'announced', 'Hinton', ',', 'LeCun', 'Bengio', 'year', '’', 'Turing', 'Award', 'work', 'neural', 'network', '.', 'The', 'Turing', 'Award', ',', 'introduced', '1966', ',', 'often', 'called', 'Nobel', 'Prize', 'computing', ',', 'includes', '$', '1', 'million', 'prize', ',', 'three', 'scientist', 'share', '.']
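
    Unlike the stemmer, the lemmatizer leaves 'announced', 'called', and 'introduced' untouched, because WordNetLemmatizer treats every word as a noun unless told otherwise. Passing a part-of-speech hint changes the result; a minimal sketch:

    # With the default pos='n', verbs pass through unchanged;
    # pos='v' lemmatizes them as verbs.
    print(lem.lemmatize('announced'))           # announced
    print(lem.lemmatize('announced', pos='v'))  # announce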
    

    Part-of-Speech Tagging

    nltk.download('averaged_perceptron_tagger')
    
    [nltk_data] Downloading package averaged_perceptron_tagger to
    [nltk_data]     /home/ericwei/nltk_data...
    [nltk_data]   Package averaged_perceptron_tagger is already up-to-
    [nltk_data]       date!
    
    True
    
    from nltk import pos_tag
    pos_tagged_words = pos_tag(word_tk)
    
    print(pos_tagged_words)
    
    [('On', 'IN'), ('Wednesday', 'NNP'), (',', ','), ('the', 'DT'), ('Association', 'NNP'), ('for', 'IN'), ('Computing', 'VBG'), ('Machinery', 'NNP'), (',', ','), ('the', 'DT'), ('world', 'NN'), ('’', 'NNP'), ('s', 'RB'), ('largest', 'JJS'), ('society', 'NN'), ('of', 'IN'), ('computing', 'VBG'), ('professionals', 'NNS'), (',', ','), ('announced', 'VBD'), ('that', 'IN'), ('Hinton', 'NNP'), (',', ','), ('LeCun', 'NNP'), ('and', 'CC'), ('Bengio', 'NNP'), ('had', 'VBD'), ('won', 'VBN'), ('this', 'DT'), ('year', 'NN'), ('’', 'VBZ'), ('s', 'JJ'), ('Turing', 'NNP'), ('Award', 'NNP'), ('for', 'IN'), ('their', 'PRP$'), ('work', 'NN'), ('on', 'IN'), ('neural', 'JJ'), ('networks', 'NNS'), ('.', '.'), ('The', 'DT'), ('Turing', 'NNP'), ('Award', 'NNP'), (',', ','), ('which', 'WDT'), ('was', 'VBD'), ('introduced', 'VBN'), ('in', 'IN'), ('1966', 'CD'), (',', ','), ('is', 'VBZ'), ('often', 'RB'), ('called', 'VBN'), ('the', 'DT'), ('Nobel', 'NNP'), ('Prize', 'NNP'), ('of', 'IN'), ('computing', 'NN'), (',', ','), ('and', 'CC'), ('it', 'PRP'), ('includes', 'VBZ'), ('a', 'DT'), ('$', '$'), ('1', 'CD'), ('million', 'CD'), ('prize', 'NN'), (',', ','), ('which', 'WDT'), ('the', 'DT'), ('three', 'CD'), ('scientists', 'NNS'), ('will', 'MD'), ('share', 'NN'), ('.', '.')]
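
    The tags follow the Penn Treebank convention: NNP is a proper noun, VBD a past-tense verb, and so on. (The stray '’' tokens get nonsense tags such as NNP and VBZ, an artifact of the apostrophe tokenization noted earlier.) NLTK can print the definition of any tag once the 'tagsets' resource is downloaded; a minimal sketch:

    nltk.download('tagsets')
    nltk.help.upenn_tagset('NNP')  # proper noun, singular
    nltk.help.upenn_tagset('VBD')  # verb, past tense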
    

    Frequency Distribution Plots

    from nltk.probability import FreqDist
    fd = FreqDist(word_tk)
    print(fd)
    
    <FreqDist with 56 samples and 76 outcomes>
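
    FreqDist is a subclass of Python's collections.Counter, so the most frequent tokens can be listed directly; a minimal sketch:

    # The most common samples; punctuation ranks high because
    # word_tk was never filtered.
    print(fd.most_common(10))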
    
    import matplotlib.pyplot as plt
    fd.plot(30, cumulative=False)
    plt.show()
    


    [Figure: frequency distribution plot of the 30 most common word tokens]

    # Note: iterating over a raw string gives FreqDist individual
    # characters, so this is a character-level distribution
    # (41 distinct characters across the 387-character text),
    # not a word-level one.
    fd_alpha = FreqDist(text)
    print(fd_alpha)
    fd_alpha.plot(30, cumulative=False)
    
    <FreqDist with 41 samples and 387 outcomes>
    

    [Figure: character-level frequency distribution plot, 30 most common characters]

    <AxesSubplot:xlabel='Samples', ylabel='Counts'>
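
    If the goal here was a distribution over alphabetic words rather than characters, one way to get it (an assumed reading of the intent, not the original code) is to filter the word tokens first:

    # Word-level distribution restricted to alphabetic tokens,
    # lowercased so 'The' and 'the' count together.
    fd_words = FreqDist(w.lower() for w in word_tk if w.isalpha())
    fd_words.plot(30, cumulative=False)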
Original article: https://www.cnblogs.com/keepmoving1113/p/14290104.html