zoukankan html css js c++ java

Python机器学习，载入样本集，对数据分类

import pandas,numpy,os,nltk,langid
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

#preprocess tokenizes one text document and returns the tokens as a single string.
def preprocess(path_name):
    """Tokenize the text file at ``path_name`` and return the tokens as one string.

    The file is read as UTF-8 and split with ``nltk.word_tokenize``. Every
    token is followed by exactly one space, so a non-empty result ends with a
    trailing space and an empty token list yields ``""``.

    Args:
        path_name: Path of the text file to read.

    Returns:
        str: The tokens of the file, each followed by a single space.
    """
    # The context manager closes the handle even if read() raises; the
    # previous open(...).read() left the file object to the garbage collector.
    with open(path_name,"r",encoding="utf-8") as textfile:
        raw_text=textfile.read()
    tokens=nltk.word_tokenize(raw_text)
    # Keep the trailing space after the last token so the output is unchanged.
    return "".join(token+" " for token in tokens)


#loadtrainset tokenizes every text document in a folder and loads the results
#as a training set; it returns the texts and one class label per text.
def loadtrainset(path,classtag):
    """Build a labelled training set from every entry in the folder ``path``.

    Each directory entry is passed to ``preprocess`` as ``path + "/" + name``,
    and ``classtag`` is recorded once per entry.

    Args:
        path: Folder whose entries are loaded.
        classtag: Class label assigned to every entry of the folder.

    Returns:
        tuple: ``(texts, labels)`` - two lists of equal length, in the order
        returned by ``os.listdir``.
    """
    file_names=os.listdir(path)
    texts=[preprocess(path+"/"+name) for name in file_names]
    labels=[classtag]*len(file_names)
    return texts,labels


def train():
    """Fit the two-class ("CS" / "CL") text classifier from the data folders.

    Documents are loaded from ``data/CS`` and ``data/CL`` via ``loadtrainset``,
    turned into term counts, weighted, and used to fit a multinomial naive
    Bayes model.

    Returns:
        tuple: ``(count_vector, clf)`` - the fitted ``CountVectorizer`` and the
        fitted ``MultinomialNB`` classifier.
    """
    cs_texts,cs_labels=loadtrainset("data/CS", "CS")
    cl_texts,cl_labels=loadtrainset("data/CL", "CL")
    corpus=cs_texts+cl_texts
    labels=cs_labels+cl_labels

    # Term-count matrix: one row per document, one column per vocabulary term.
    vectorizer=CountVectorizer()
    term_counts=vectorizer.fit_transform(corpus)

    # Weighting step. The transformer is created with use_idf=False, so no
    # inverse-document-frequency factor is applied to the counts.
    weighting=TfidfTransformer(use_idf=False)
    features=weighting.fit_transform(term_counts)

    # Train the multinomial naive Bayes classifier on the weighted features.
    classifier=MultinomialNB()
    classifier.fit(features,labels)

    return vectorizer,classifier


def isCyber(content):
    """Return True when ``content`` is English text confidently classified as "CS".

    The language is detected with ``langid``. English text is tokenized,
    vectorized with the vocabulary fitted by ``train`` and classified; the
    result is True only when the predicted label is "CS" and its predicted
    probability is at least 0.7. Every other case, including non-English
    input, returns False.

    Args:
        content: Raw text to classify.

    Returns:
        bool: True for a confident "CS" prediction, otherwise False.
    """
    content_lang = langid.classify(content)[0]
    if content_lang == 'zh':
        # Chinese is not handled; keep the placeholder blank-line output.
        print()
        return False
    if content_lang != 'en':
        # Previously fell off the end and returned None; be explicit.
        return False

    # Same format as the training texts: every token followed by one space.
    text_with_spaces = "".join(word+" " for word in nltk.word_tokenize(content))

    # NOTE(review): the model is retrained from disk on every call, which is
    # expensive; consider training once and reusing the result.
    count_vector,clf = train()
    new_count_vector = count_vector.transform([text_with_spaces])
    # With use_idf=False there is no IDF state to learn, so fitting on the
    # query itself should not leak anything - TODO confirm against sklearn docs.
    new_tfidf = TfidfTransformer(use_idf=False).fit_transform(new_count_vector)

    predict_result = clf.predict(new_tfidf)    # predicted label(s)
    probabilities = clf.predict_proba(new_tfidf)    # computed once, reused below

    # Look the "CS" column up by label instead of assuming it is column 1.
    classes = list(clf.classes_)
    if 'CS' in classes:
        cs_probability = probabilities[0][classes.index('CS')]
    else:
        cs_probability = 0.0

    print(predict_result)
    print(probabilities)
    print(cs_probability)

    return bool(predict_result[0] == 'CS' and cs_probability >= 0.7)

# Script entry point: run the classifier once on a sample English paragraph.
if __name__ == "__main__":
    sample_text = '''These pandemic days flow by in waves of exhilaration and stillness. Who knew a trip to the grocery store could be so exciting? Bread-and-milk runs have become surgical raids: Sterilize the grocery cart with a disinfectant wipe, scout out the TP aisle, exchange sideways glances with the could-be infected, grab the essentials, and get the hell out of there. Later, as another news alert interrupts the Netflix stream, the group text explodes: “This is crazy,” everyone says from their respective couches. Few hasten to add that crazy is also sort of fun.'''
    # The return value is discarded; isCyber prints its own diagnostics.
    isCyber(sample_text)

查看全文

相关阅读:
#454. 【UER #8】打雪仗
 6496. 【GDOI2020模拟03.08】圣痕
 6495. 【GDOI2020模拟03.08】死星
 6494. 【GDOI2020模拟03.08】勘探
 NOI Online划水记
 6482. 【GDOI2020模拟02.22】代数几何(algebraic)
6493. 【GDOI2020模拟03.04】迷宫
 6492. 【GDOI2020模拟03.04】多项式
 6491. 【GDOI2020模拟03.04】铺路
 #76. 【UR #6】懒癌

原文地址：https://www.cnblogs.com/lxz123/p/14932053.html