zoukankan      html  css  js  c++  java
  • sklearn——CountVectorizer详解

    关于sklearn——CountVectorizer的一篇详细讲解

    https://blog.csdn.net/weixin_38278334/article/details/82320307

    使用Keras设计全连接神经网络进行文本分类

     # Demo: build a fully connected neural network for text sentiment
     # classification (binary target read from the 'toxic' column).
     import re

     import pandas as pd

     # Inside [] a leading ^ negates the character class; outside brackets it
     # would anchor the match to the start of the string.
     _HTML_TAG = re.compile(r'<[^<]+?>')

     # Upper bound on the number of training rows held out for validation.
     VALIDATION_SIZE = 100


     def clean_comment(text):
         """Return *text* with HTML-like tags blanked out and double quotes removed.

         Each ``<...>`` tag is replaced by a single space and every ``"`` is
         dropped.  Non-string input (e.g. ``None`` or the NaN pandas uses for
         a missing cell) yields an empty string instead of raising.
         """
         if not isinstance(text, str):
             return ''
         return _HTML_TAG.sub(' ', text).replace('"', '')


     def main():
         """Train, evaluate and save the bag-of-words classifier."""
         # The heavy ML stack is imported lazily so that importing this module
         # just to reuse clean_comment() does not require it.
         import nltk
         import numpy as np
         from nltk.corpus import stopwords
         from sklearn.feature_extraction.text import CountVectorizer
         from sklearn.model_selection import train_test_split
         from tensorflow.keras.layers import Dense
         from tensorflow.keras.models import Sequential

         df = pd.read_csv('train_comment_small_50.csv', sep=',')
         df['cleaned_comment'] = df['comment_text'].apply(clean_comment)
         X_train, X_test, y_train, y_test = train_test_split(
             df['cleaned_comment'], df['toxic'], test_size=0.2)
         # Plain arrays make the positional slicing below unambiguous.
         y_train = y_train.to_numpy()
         y_test = y_test.to_numpy()

         # NLTK raises LookupError when the stop-word corpus is not installed.
         try:
             english_stopwords = stopwords.words('english')
         except LookupError:
             nltk.download('stopwords')
             english_stopwords = stopwords.words('english')

         vectorizer = CountVectorizer(
             binary=True, stop_words=english_stopwords, lowercase=True,
             min_df=3, max_df=0.9, max_features=5000)
         X_train_onehot = vectorizer.fit_transform(X_train)
         np.set_printoptions(threshold=np.inf)
         print(X_train_onehot)

         # get_feature_names() was removed from scikit-learn;
         # get_feature_names_out() is its replacement.
         print(vectorizer.get_feature_names_out())
         n_samples, n_features = X_train_onehot.shape

         nn = Sequential()
         nn.add(Dense(units=500, activation='relu', input_dim=n_features))
         nn.add(Dense(units=1, activation='sigmoid'))
         nn.compile(loss='binary_crossentropy', optimizer='adam',
                    metrics=['accuracy'])
         nn.summary()

         # Hold out the tail of the training set for validation.  The training
         # and validation slices must not overlap, otherwise the validation
         # metrics are computed on rows the model has already been fitted on.
         n_val = min(VALIDATION_SIZE, n_samples // 5)
         split = n_samples - n_val
         validation_data = None
         if n_val:
             validation_data = (X_train_onehot[split:], y_train[split:])
         nn.fit(X_train_onehot[:split], y_train[:split], epochs=5,
                batch_size=128, verbose=1, validation_data=validation_data)

         scores = nn.evaluate(vectorizer.transform(X_test), y_test, verbose=1)
         print('accuracy:', scores[1])
         # '.h5' is a file extension Keras recognises; '.hd5' is not.
         nn.save('nn.h5')


     if __name__ == '__main__':
         main()

    使用CNN对文本进行分类

    # Simple CNN applied to topic classification of Reuters newswires.

    import numpy as np
    from tensorflow import keras
    from tensorflow.keras import layers
    from tensorflow.keras.datasets import reuters
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    batch_size = 32
    epochs = 12
    max_words = 10000   # vocabulary cap: only the most frequent words are kept
    maxlen = 500        # every newswire is padded/truncated to this many tokens
    embedding_dim = 128
    num_filters = 64
    kernel_size = 5

    # Each sample is a list of integer word indices; indices at or above
    # num_words are replaced by the out-of-vocabulary index, so every index
    # fits the Embedding layer below.
    (x_train, y_train), (x_test, y_test) = reuters.load_data(
        num_words=max_words, test_split=0.2)
    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

    word_index = reuters.get_word_index(path="return_word_index.json")
    print('vocabulary size:', len(word_index))
    # Reverse lookup (index -> word).  NOTE: load_data() shifts word indices by
    # its index_from argument (default 3), so decoding a loaded sequence needs
    # index_to_word.get(i - 3).
    index_to_word = {index: word for word, index in word_index.items()}

    # Take both splits into account so no test label falls outside the one-hot
    # range.
    num_classes = int(max(np.max(y_train), np.max(y_test))) + 1

    # Embedding/Conv1D consume fixed-length integer token sequences, not a
    # bag-of-words indicator matrix, so pad the sequences to a common length.
    x_train = pad_sequences(x_train, maxlen=maxlen)
    print(x_train)
    x_test = pad_sequences(x_test, maxlen=maxlen)
    # Convert integer class labels to one-hot vectors.
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    model = Sequential()
    # input_dim must cover every token index that can occur (0 .. max_words - 1).
    model.add(layers.Embedding(max_words, embedding_dim, input_length=maxlen))
    model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
    # add() needs a layer instance, not the layer class itself.
    model.add(layers.GlobalAveragePooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                        verbose=1, validation_split=0.1)
    score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
    print("loss=", score[0])
    print("accuracy=", score[1])
  • 相关阅读:
    Azure 虚拟机安全加固整理
    AzureARM 使用 powershell 扩容系统磁盘大小
    Azure Linux 云主机使用Root超级用户登录
    Open edX 配置 O365 SMTP
    powershell 根据错误GUID查寻错误详情
    azure 创建redhat镜像帮助
    Azure Powershell blob中指定的vhd创建虚拟机
    Azure Powershell 获取可用镜像 PublisherName,Offer,Skus,Version
    Power BI 连接到 Azure 账单,自动生成报表,可刷新
    Azure powershell 获取 vmSize 可用列表的命令
  • 原文地址:https://www.cnblogs.com/henuliulei/p/13742269.html
Copyright © 2011-2022 走看看