  • Assignment for 11.29


    text = '"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."'

    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    # download the NLTK resources the pipeline needs (tokenizer, stop words, WordNet)
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')

    # preprocessing: tokenize, drop stop words, lowercase, filter short tokens, lemmatize
    def preprocessing(text):
        # text = text.decode("utf-8")
        # split the text into sentences, then each sentence into word tokens
        tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
        stops = stopwords.words('english')
        tokens = [token for token in tokens if token not in stops]  # remove stop words

        tokens = [token.lower() for token in tokens if len(token) >= 3]  # lowercase, keep tokens of length >= 3
        lmtzr = WordNetLemmatizer()
        tokens = [lmtzr.lemmatize(token) for token in tokens]  # lemmatize each token
        preprocessed_text = ' '.join(tokens)
        return preprocessed_text

    preprocessing(text)

    import csv  # read the SMS data with csv, splitting out the label and the message text
    file_path = r'C:\Users\Administrator\Desktop\SMSSpamCollectionjsn.txt'
    sms = open(file_path, 'r', encoding='utf-8')
    sms_data = []
    sms_label = []
    csv_reader = csv.reader(sms, delimiter='\t')  # the corpus is tab-separated: label<TAB>message
    for line in csv_reader:
        sms_label.append(line[0])
        sms_data.append(preprocessing(line[1]))  # preprocess the message text before storing it
    sms.close()
    sms_label
    sms_data


    from sklearn.model_selection import train_test_split
    x_train,x_test,y_train,y_test = train_test_split(sms_data,sms_label,test_size=0.3,random_state=0,stratify=sms_label) # training set / test set

    # vectorize the text with TF-IDF
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(min_df = 2,ngram_range=(1,2),stop_words='english',strip_accents='unicode',norm='l2')
    x_train = vectorizer.fit_transform(x_train)
    x_test = vectorizer.transform(x_test)
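    # Sanity check of the vectorization (a minimal sketch, not part of the original code):
    # x_train is now a sparse matrix of shape (n_train_samples, n_features).
    # get_feature_names() is the scikit-learn API of that era; newer releases use
    # get_feature_names_out() instead.
    print(x_train.shape)
    print(vectorizer.get_feature_names()[:10])  # first few uni-/bi-gram features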

    # multinomial naive Bayes classifier
    from sklearn.naive_bayes import MultinomialNB
    clf = MultinomialNB().fit(x_train,y_train)
    y_nb_pred = clf.predict(x_test)

    # display the classification results
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report

    print(y_nb_pred.shape, y_nb_pred)  # predictions for x_test
    print('nb_confusion_matrix:')
    cm = confusion_matrix(y_test, y_nb_pred)  # confusion matrix
    print(cm)
    print('nb_classification_report:')
    cr = classification_report(y_test, y_nb_pred)  # text report of the main classification metrics
    print(cr)

    feature_names = vectorizer.get_feature_names()  # list of words seen in training
    coefs = clf.coef_  # log probabilities of each feature given the class
    intercept = clf.intercept_
    coefs_with_fns = sorted(zip(coefs[0], feature_names))  # pair the log probability p(x_i|y) with the word x_i

    n = 10
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])  # n lowest- and n highest-weighted features
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print('%.4f\t%-15s\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2))

    text='"As per your request Melle Melle Oru Minnaminunginte Nurungu Vettam has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"'

    import nltk  # tokenize with nltk
    for sent in nltk.sent_tokenize(text):  # split the text into sentences
        for word in nltk.word_tokenize(sent):  # split each sentence into words
            print(word)
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

    from nltk.corpus import stopwords  # remove stop words
    stops = stopwords.words('english')
    stops

    tokens = [token for token in tokens if token not in stops]
    s = set(tokens)-set(stops)
    print(len(tokens),len(set(tokens)),len(s))

    # nltk.download('wordnet')
    from nltk.stem import WordNetLemmatizer  # lemmatization (reduce words to their base form)
    lemmatizer = WordNetLemmatizer()
    lemmatizer.lemmatize('leavers')
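    # WordNetLemmatizer treats tokens as nouns by default; passing a part-of-speech
    # tag changes the result (a small illustration, not part of the original code):
    lemmatizer.lemmatize('leaves')           # noun reading -> 'leaf'
    lemmatizer.lemmatize('leaves', pos='v')  # verb reading -> 'leave'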

    import csv
    # read the data
    file_path = r'EmailData.txt'
    EmailData = open(file_path, 'r', encoding='utf-8')
    Email_data = []
    Email_target = []
    csv_reader = csv.reader(EmailData, delimiter='\t')
    # store the message text and the target label in separate lists
    for line in csv_reader:
        Email_data.append(line[1])
        Email_target.append(line[0])
    EmailData.close()
    
    # replace every meaningless (non-letter) character with a space
    Email_data_clear = []
    for line in Email_data:
        # line: 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet...'
        # strip non-letter characters from each line, then split on spaces
        newString = line
        for char in line:
            if not char.isalpha():
                # not a letter: replace it with a space
                newString = newString.replace(char, " ")
        tempList = newString.split(" ")
        # append the cleaned, tokenized line to the list of clean data
        Email_data_clear.append(tempList)
    
    # drop words of length 3 or less and tokens that are not purely alphabetic
    Email_data_clear2 = []
    for line in Email_data_clear:
        tempList = []
        for word in line:
            if word != '' and len(word) > 3 and word.isalpha():
                tempList.append(word)
        tempString = ' '.join(tempList)
        Email_data_clear2.append(tempString)
    Email_data_clear = Email_data_clear2
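    # A quick look at what the cleaning did (illustrative only; the actual content
    # depends on the data file being read):
    print(Email_data[0])        # raw message text
    print(Email_data_clear[0])  # cleaned message: space-joined words longer than 3 letters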
    
    # split the data into training and test sets
    from sklearn.model_selection import train_test_split
    x_train,x_test,y_train,y_test = train_test_split(Email_data_clear2,Email_target,test_size=0.3,random_state=0,stratify=Email_target)
    
    # build TF-IDF feature vectors for the data
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(x_train)
    X_test = tfidf.transform(x_test)
    
    # inspect the vectors
    import numpy as np
    X_train = X_train.toarray()  # GaussianNB needs dense input, so convert the sparse matrices
    X_test = X_test.toarray()
    X_train.shape
    # print the nonzero entries
    for i in range(X_train.shape[0]):
        for j in range(X_train.shape[1]):
            if X_train[i][j] != 0:
                print(i, j, X_train[i][j])
    
    # build the model
    from sklearn.naive_bayes import GaussianNB
    gnb = GaussianNB()
    module = gnb.fit(X_train,y_train)
    y_predict = module.predict(X_test)
    
    # print the main classification metrics of the model
    from sklearn.metrics import classification_report
    cr = classification_report(y_test, y_predict)  # classification_report expects (y_true, y_pred)
    print(cr)
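    # As in the first part, a confusion matrix can be printed for this model as well
    # (a small addition, reusing y_test and y_predict from above):
    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(y_test, y_predict))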