zoukankan      html  css  js  c++  java
  • python机器学习,载入样本集,对数据分类

    import pandas,numpy,os,nltk,langid
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.naive_bayes import MultinomialNB
    
    #preprocess用于将一个文本文档进行切词,并以字符串形式输出切词结果
    def preprocess(path_name):
        """Tokenize one UTF-8 text file and return its tokens as one string.

        Args:
            path_name: Path of the text file to read.

        Returns:
            The tokens produced by ``nltk.word_tokenize``, each followed by a
            single space (a non-empty result therefore ends with a trailing
            space). When there are no tokens the result is ``""``.
        """
        # A context manager closes the file handle deterministically instead
        # of leaving it open until the garbage collector gets to it.
        with open(path_name, "r", encoding="utf-8") as text_file:
            raw_text = text_file.read()
        tokens = nltk.word_tokenize(raw_text)
        # Keep the "token + space" layout callers already receive, but build
        # the string in one pass rather than by repeated concatenation.
        return "".join(token + " " for token in tokens)
    
    
    #loadtrainset用于将某一文件夹下的所有文本文档批量切词后,载入为训练数据集;返回训练集和每一个文本(元组)对应的类标号。
    def loadtrainset(path, classtag):
        """Tokenize every file directly inside a folder and label each one.

        Args:
            path: Directory whose files are the training documents.
            classtag: Class label assigned to every document in the directory.

        Returns:
            A ``(texts, tags)`` pair of equal-length lists: ``texts`` holds the
            ``preprocess`` output of each file and ``tags`` holds ``classtag``
            once per file.
        """
        processed_textset = []
        # os.listdir() yields entries in arbitrary order; sorting makes the
        # resulting training set reproducible between runs and machines.
        for file_name in sorted(os.listdir(path)):
            path_name = os.path.join(path, file_name)
            # Sub-directories cannot be opened as text files; skip them
            # instead of crashing inside preprocess().
            if not os.path.isfile(path_name):
                continue
            processed_textset.append(preprocess(path_name))
        allclasstags = [classtag] * len(processed_textset)
        return processed_textset, allclasstags
    
    
    def train():
        """Fit the CS-vs-CL text classifier on the documents under ``data/``.

        Documents are loaded from ``data/CS`` (label ``"CS"``) and ``data/CL``
        (label ``"CL"``), in that order.

        Returns:
            A ``(count_vector, clf)`` pair: the fitted ``CountVectorizer`` and
            the fitted ``MultinomialNB`` classifier.
        """
        cs_texts, cs_tags = loadtrainset("data/CS", "CS")
        cl_texts, cl_tags = loadtrainset("data/CL", "CL")
        corpus = cs_texts + cl_texts
        labels = cs_tags + cl_tags

        # Turn the documents into a term-count matrix: element [i][j] is the
        # count of term j in document i.
        vectorizer = CountVectorizer()
        term_counts = vectorizer.fit_transform(corpus)

        # IDF weighting is switched off (use_idf=False), so despite the class
        # name only the transformer's normalization of the counts is applied.
        normalizer = TfidfTransformer(use_idf=False)
        features = normalizer.fit_transform(term_counts)

        # Train the multinomial naive Bayes classifier on the weighted matrix.
        classifier = MultinomialNB()
        classifier.fit(features, labels)

        return vectorizer, classifier
    
    
    def isCyber(content):
        """Tell whether a text is classified as "CS" with high confidence.

        The language of ``content`` is detected with ``langid``. English text
        is tokenized with NLTK, vectorized with the vocabulary returned by
        ``train()`` and scored by the classifier returned by ``train()``. The
        predicted label, the probability matrix and the "CS" probability are
        printed.

        Args:
            content: Raw text to classify.

        Returns:
            ``True`` when the predicted label is ``'CS'`` and its probability
            is at least 0.7; ``False`` otherwise, including for any text that
            is not detected as English.
        """
        content_lang = langid.classify(content)[0]
        if content_lang == 'zh':
            # Placeholder kept from the original: Chinese is not handled yet
            # and only emits a blank line.
            print()
            return False
        if content_lang != 'en':
            # Return an explicit bool instead of falling through to None.
            return False

        # Whitespace-joined tokens, the same shape preprocess() feeds to the
        # vectorizer during training.
        text_with_spaces = " ".join(nltk.word_tokenize(content))
        testset = [text_with_spaces]

        # NOTE(review): the model is retrained on every call; cache the result
        # of train() if this function is called repeatedly.
        count_vector, clf = train()
        new_count_vector = count_vector.transform(testset)
        # With use_idf=False the transformer learns nothing from the data, so
        # fitting a fresh instance on the test row matches the training setup.
        new_tfidf = TfidfTransformer(use_idf=False).fit_transform(new_count_vector)

        predict_result = clf.predict(new_tfidf)
        probabilities = clf.predict_proba(new_tfidf)
        # Look the "CS" column up through classes_ rather than assuming it is
        # column 1 of the probability matrix.
        cs_column = list(clf.classes_).index('CS')
        cs_probability = probabilities[0][cs_column]

        print(predict_result)
        print(probabilities)
        print(cs_probability)

        return bool(predict_result[0] == 'CS' and cs_probability >= 0.7)
    
    if __name__=='__main__':
        # Demo run: classify one sample English paragraph. isCyber prints the
        # predicted label and the class probabilities; its return value is
        # not used here.
        content = '''These pandemic days flow by in waves of exhilaration and stillness. Who knew a trip to the grocery store could be so exciting? Bread-and-milk runs have become surgical raids: Sterilize the grocery cart with a disinfectant wipe, scout out the TP aisle, exchange sideways glances with the could-be infected, grab the essentials, and get the hell out of there. Later, as another news alert interrupts the Netflix stream, the group text explodes: “This is crazy,” everyone says from their respective couches. Few hasten to add that crazy is also sort of fun.'''
        isCyber(content)
  • 相关阅读:
    TextView 高亮
    Android 学习 第一章(环境搭建)
    从assets res 中读文件
    动态设置imageview 宽高
    android 算定义view 打包 jar(一次开发多次使用)
    Activity 跳转
    Android手机在开发调试时logcat不显示输出信息的解决办法
    弹出对话 AlertDialog 有按钮
    让划动 listview时 没有黑色背景
    Activity Service 数据相互操作
  • 原文地址:https://www.cnblogs.com/lxz123/p/14932053.html
Copyright © 2011-2022 走看看