  • nltk_28 Twitter sentiment analysis model


    Build a Twitter sentiment analysis model and save the trained objects with pickle. Training can take up to an hour, so pickling the results makes later reuse fast and simple.
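    The whole post is just this pickle round-trip applied to a classifier: train once, dump the expensive objects to disk, and load them back in later sessions. A minimal sketch of the pattern (the file name here is a placeholder):

    import pickle

    model = {"any": "picklable object"}  # stands in for the trained classifier

    # dump once after the expensive step
    with open("model.pickle", "wb") as f:
        pickle.dump(model, f)

    # reload instantly in a later session
    with open("model.pickle", "rb") as f:
        model = pickle.load(f)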

    # -*- coding: utf-8 -*-
    """
    Created on Thu Jan 12 10:44:19 2017
    
    @author: Administrator
    
    Sentiment analysis for short reviews -- Twitter
    
    The "positive.txt" and "negative.txt" corpus files must be
    converted to UTF-8 first. Online conversion tool:
    http://www.esk365.com/tools/GB2312-UTF8.asp
    
    
    features=5000: accuracy above 60%
    features=10000: accuracy above  %
    
    Running the script can take up to an hour.
    """
    
    import os
    import nltk
    import random
    import pickle
    from nltk.tokenize import word_tokenize

    # the pickles below are written into this directory, so make sure it exists
    os.makedirs("pickled_algos", exist_ok=True)

    short_pos = open("positive.txt", "r", encoding="utf-8").read()
    short_neg = open("negative.txt", "r", encoding="utf-8").read()
    
    # documents holds (review, label) pairs; all_words collects candidate feature words
    # (the labeled documents are appended in the POS-tagging loops below, so they
    # are only added once)
    documents = []
    all_words = []
    
    
    # POS tag prefixes: J = adjective, R = adverb, V = verb
    # allowed_word_types = ["J", "R", "V"]  # uncomment to also keep adverbs and verbs
    allowed_word_types = ["J"]
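    # For reference (standard Penn Treebank tags from nltk.pos_tag):
    #   nltk.pos_tag(word_tokenize("great awful movie")) -> [('great', 'JJ'), ('awful', 'JJ'), ('movie', 'NN')]
    # w[1][0] below is the tag's first letter, so 'JJ' matches "J" and 'NN' is skipped.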
    
    
    for p in short_pos.split('\n'):
        documents.append((p, "pos"))
        words = word_tokenize(p)
        pos = nltk.pos_tag(words)
        for w in pos:
            if w[1][0] in allowed_word_types:
                all_words.append(w[0].lower())
    
    
    for p in short_neg.split('\n'):
        documents.append((p, "neg"))
        words = word_tokenize(p)
        pos = nltk.pos_tag(words)
        for w in pos:
            if w[1][0] in allowed_word_types:
                all_words.append(w[0].lower())
    
    # save the labeled documents
    save_documents = open("pickled_algos/documents.pickle", "wb")
    pickle.dump(documents, save_documents)
    save_documents.close()
    
    
    # save the feature words
    all_words = nltk.FreqDist(all_words)
    # keep the 5000 most frequent words; 20000+ should work better
    # (all_words.keys() is not sorted by frequency, so use most_common)
    word_features = [w for (w, count) in all_words.most_common(5000)]
    save_word_features = open("pickled_algos/word_features5k.pickle", "wb")
    pickle.dump(word_features, save_word_features)
    save_word_features.close()
    
    
    def find_features(document):
        words = word_tokenize(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)
    
        return features
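    # Example (assuming "great" and "boring" are among the 5000 feature words):
    #   find_features("a great film") -> {..., 'great': True, 'boring': False, ...}
    # i.e. one boolean per feature word, True if the word occurs in the document.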
    
    featuresets = [(find_features(rev), category) for (rev, category) in documents]
    
    random.shuffle(featuresets)
    print(len(featuresets))
    
    # save the feature sets; sentiment_mod.py below reloads this pickle
    save_featuresets = open("pickled_algos/featuresets.pickle", "wb")
    pickle.dump(featuresets, save_featuresets)
    save_featuresets.close()
    
    testing_set = featuresets[10000:]
    training_set = featuresets[:10000]
    
    
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
    classifier.show_most_informative_features(15)
    
    # save the trained classifier
    save_classifier = open("pickled_algos/originalnaivebayes5k.pickle","wb")
    pickle.dump(classifier, save_classifier)
    save_classifier.close()
    
    sentiment_mod.py: load the pickled objects and expose a sentiment() function
    # -*- coding: utf-8 -*-
    """
    Created on Thu Jan 12 16:47:51 2017
    
    @author: Administrator
    """
    
    #File: sentiment_mod.py
    
    import nltk
    import random
    import pickle
    from nltk.tokenize import word_tokenize
    
    documents_f = open("pickled_algos/documents.pickle", "rb")
    documents = pickle.load(documents_f)
    documents_f.close()
    
    
    word_features5k_f = open("pickled_algos/word_features5k.pickle", "rb")
    word_features = pickle.load(word_features5k_f)
    word_features5k_f.close()
    
    
    def find_features(document):
        words = word_tokenize(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)
    
        return features
    
    
    # reload the saved feature sets (written by the training script above);
    # the shuffle/split below just mirrors the training script and is not
    # needed only to call sentiment()
    featuresets_f = open("pickled_algos/featuresets.pickle", "rb")
    featuresets = pickle.load(featuresets_f)
    featuresets_f.close()
    
    random.shuffle(featuresets)
    print(len(featuresets))
    
    testing_set = featuresets[10000:]
    training_set = featuresets[:10000]
    
    
    # load the trained classifier
    open_file = open("pickled_algos/originalnaivebayes5k.pickle", "rb")
    classifier = pickle.load(open_file)
    open_file.close()
    
    
    
    def sentiment(text):
        feats = find_features(text)
        return classifier.classify(feats)
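    If a confidence value is also useful, NLTK's NaiveBayesClassifier additionally provides prob_classify(), which returns a probability distribution over the labels. A small optional extension (not part of the original script):

    def sentiment_with_confidence(text):
        feats = find_features(text)
        dist = classifier.prob_classify(feats)  # probability distribution over "pos"/"neg"
        label = dist.max()                      # most likely label
        return label, dist.prob(label)          # e.g. ("pos", 0.83)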
    

    Test script:

    # -*- coding: utf-8 -*-
    """
    Created on Thu Jan 12 16:50:12 2017
    
    @author: Administrator
    """
    
    import sentiment_mod as s
    
    print(s.sentiment("This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!"))
    print(s.sentiment("This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10"))
    

