1. 爬取京东商品评论
JD.py
import requests
from urllib.parse import quote
from urllib.parse import urlencode
from lxml import etree
import logging
import json
import time
class JDSpider:
    """Crawl JD.com product reviews for one search keyword.

    Construct an instance with a product category (e.g. "手机", "电脑"); the
    constructor immediately requests the JD search page and stores the product
    ids found there. Then call ``getData`` to download the reviews of those
    products and append them to a file under ``./data/``.
    """

    # Seconds to wait for a response before giving up on a request, so that a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, categlory):
        # JD search start page for the keyword.
        self.startUrl = "https://search.jd.com/Search?keyword=%s&enc=utf-8" % (quote(categlory))
        self.commentBaseUrl = "https://club.jd.com/comment/productPageComments.action?"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }
        # getId() needs startUrl and headers, so it must run after they are set.
        self.productsId = self.getId()
        # Maps the ``score`` argument of getData to the tag used in file names.
        self.comtype = {0: "nagetive", 1: "medium", 2: "positive"}
        self.categlory = categlory
        self.iplist = {
            'http': [],
            'https': []
        }

    def getParamUrl(self, productid, page, score):
        """Build the query parameters and full URL of one review page.

        Returns:
            ``(params, url)`` where ``url`` is ``commentBaseUrl`` followed by
            the url-encoded ``params``.
        """
        # NOTE(review): the comments in the original code disagree about the
        # score convention (1/2/3 here versus 0/1/2 in ``comtype``) - confirm
        # which value the endpoint expects for each review type.
        params = {
            "productId": "%s" % (productid),
            "score": "%s" % (score),
            "page": "%s" % (page),
            "sortType": "5",
            "pageSize": "10",
            "isShadowSku": "0",
            "rid": "0",
            "fold": "1"
        }
        url = self.commentBaseUrl + urlencode(params)
        return params, url

    def getHeaders(self, productid):
        """Return request headers whose Referer points at the product page.

        Unlike ``self.headers`` these headers carry the product id.
        """
        header = {
            "Referer": "https://item.jd.com/%s.html" % (productid),
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }
        return header

    def getId(self):
        """Fetch the search page and return the ``data-sku`` product ids on it.

        Returns an empty list when the page cannot be fetched or parsed.
        """
        response = requests.get(self.startUrl, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            logging.warning("状态码错误,爬虫异常!")
            return []
        html = etree.HTML(response.text)
        if html is None:
            # lxml yields None for an empty document; there is nothing to query.
            return []
        return html.xpath('//li[@class="gl-item"]/@data-sku')

    @staticmethod
    def _extractComments(res_json):
        """Pull the cleaned comment texts and their scores out of one response.

        Line breaks inside a comment are replaced by spaces so that each
        comment occupies exactly one line of the output file. A response
        without a usable ``comments`` list yields two empty lists.
        """
        contents, stars = [], []
        items = res_json.get('comments') if isinstance(res_json, dict) else None
        for cdit in items or []:
            contents.append(cdit['content'].replace("\n", ' ').replace('\r', ' '))
            stars.append(cdit['score'])
        return contents, stars

    def getData(self, maxPage, score):
        """Download reviews of every collected product and append them to a file.

        Args:
            maxPage: exclusive upper bound of the page numbers requested; pages
                ``1 .. maxPage - 1`` are fetched for each product, and paging
                of a product stops early at the first page with no comments.
            score: review type; must be a key of ``self.comtype`` because it
                selects the tag used in the output file name.
        """
        import os  # local import: os is not among this module's imports

        comments = []
        scores = []
        for j, productid in enumerate(self.productsId):
            header = self.getHeaders(productid)
            for i in range(1, maxPage):
                _, url = self.getParamUrl(productid, i, score)
                print(">>>>>>>>>>>>>>>>第:%d 个,第 %d 页" % (j, i))
                try:
                    # ``url`` already contains the encoded query string, so the
                    # parameters are not passed a second time via ``params=``.
                    response = requests.get(url, headers=header, timeout=self.REQUEST_TIMEOUT)
                except requests.RequestException as e:
                    logging.warning(e)
                    break
                # Throttle after every request, including failed ones.
                time.sleep(2)
                if response.status_code != 200:
                    logging.warning("状态码错误,爬虫连接异常")
                    continue
                if response.text == '':
                    logging.warning("未爬取到信息")
                    continue
                try:
                    res_json = json.loads(response.text)
                except ValueError as e:
                    logging.warning(e)
                    continue
                page_comments, page_scores = self._extractComments(res_json)
                if not page_comments:
                    logging.warning("页面次数已到:%d,超出范围" % (i))
                    break
                logging.info("正在爬取%s %s 第 %d" % (self.categlory, self.comtype[score], i))
                comments.extend(page_comments)
                scores.extend(page_scores)
                for comment in page_comments:
                    print(comment)
        savepath = './data/' + self.categlory + '_' + self.comtype[score] + '.csv'
        logging.warning("已爬取%d 条 %s 评价信息" % (len(comments), self.comtype[score]))
        # Create the output directory on first use instead of failing in open().
        os.makedirs(os.path.dirname(savepath), exist_ok=True)
        # One record per line: index, score, comment. A comment may itself
        # contain spaces, so a reader must split each line at most twice.
        with open(savepath, 'a+', encoding='utf8') as f:
            for i, (star, comment) in enumerate(zip(scores, comments)):
                f.write("%d %s %s\n" % (i, star, comment))
        logging.warning("数据已保存在 %s" % (savepath))
if __name__ == '__main__':
    # Product categories (JD search keywords) to crawl. Named so that the
    # builtin ``list`` is not shadowed.
    categories = ['电脑', '手机', '耳机']
    for category in categories:
        spider = JDSpider(category)
        spider.getData(10, 2)  # positive reviews
        spider.getData(10, 1)  # neutral reviews
        spider.getData(10, 0)  # negative reviews
上面代码中的列表是要爬取的商品类别(如手机、电脑);getData的参数是 (maxPage, score):
-
maxPage是爬取评论的最大页数,每页10条数据。差评和好评的最大一般页码不相同,一般情况下:好评>>差评>中评
-
maxPage遇到超出的页码会自动跳出,所以设大点也没有关系。
-
score是指哪种评价类型:好评2、中评1、差评0。
运行JD.py,爬取下来的文件存在data/目录下。
之后运行ProcessData.py将原始数据集文件进行划分,按8:1:1的比例划分为训练集、验证集和测试集,并将划分后的数据集以csv格式存在dataset/目录下。
ProcessData.py
import os
import random
import pandas as pd
def score_to_label(score):
    """Map an integer review score to a class id.

    Scores below 2 become 0 (negative), scores above 4 become 2 (positive),
    and everything in between becomes 1 (neutral).
    """
    if score < 2:
        return 0
    if score > 4:
        return 2
    return 1


def parse_line(line):
    """Turn one raw record line into ``[sentence, label]``.

    A record is ``index score comment`` separated by whitespace. The line is
    split at most twice so that a comment containing spaces is kept whole
    rather than truncated to its first word. Returns ``None`` for lines that
    do not have three fields or whose score is not an integer.
    """
    parts = line.rstrip('\r\n').split(None, 2)
    if len(parts) < 3:
        return None
    try:
        score = int(parts[1])
    except ValueError:
        return None
    return [parts[2], score_to_label(score)]


def load_samples(file_dir="data/"):
    """Read every regular file in ``file_dir`` and collect the parsable records."""
    data_list = []
    for single_csv in os.listdir(file_dir):
        path = os.path.join(file_dir, single_csv)
        if not os.path.isfile(path):
            continue
        with open(path, encoding="utf-8") as file:
            for line in file:
                sample = parse_line(line)
                if sample is not None:
                    data_list.append(sample)
    return data_list


def split_dataset(data_list):
    """Split the samples 1:1:8 into ``(test, dev, train)`` lists, in that order."""
    n = len(data_list) // 10
    return data_list[:n], data_list[n:n * 2], data_list[n * 2:]


def main():
    """Shuffle the crawled samples, split them and write the three CSV files."""
    data_list = load_samples("data/")
    random.shuffle(data_list)
    test_list, dev_list, train_list = split_dataset(data_list)
    print('训练集数量: {}'.format(str(len(train_list))))
    print('验证集数量: {}'.format(str(len(dev_list))))
    print('测试集数量: {}'.format(str(len(test_list))))
    # Create the output directory instead of failing inside to_csv().
    os.makedirs('dataset', exist_ok=True)
    name = ['Sentence', "Label"]
    outputs = (
        ('dataset/csv_train.csv', train_list),
        ('dataset/csv_dev.csv', dev_list),
        ('dataset/csv_test.csv', test_list),
    )
    for out_path, rows in outputs:
        pd.DataFrame(columns=name, data=rows).to_csv(out_path, encoding='utf8', index=False)


if __name__ == '__main__':
    main()
2. TorchText处理数据
-
用 data.Field 定义文本的处理方式:sequential(是否为序列数据)、tokenize(分词函数)、use_vocab(是否使用词表)以及 fix_length(该字段的所有样本都会被填充到这个固定长度)。
-
然后构建词表(build_vocab)并划分出训练、验证和测试迭代器(iterator)。
DataSet.py
import torch
from torchtext import data
import jieba
import re
# Fixed seed for torch's random number generator.
SEED = 1234
torch.manual_seed(SEED)
# Request deterministic cuDNN behaviour.
torch.backends.cudnn.deterministic = True
# Matches every character outside the range U+4E00..U+9FA5. The escapes must
# be written as \uXXXX: without the backslashes the class would instead keep
# ASCII characters and delete the Chinese text.
_NON_CHINESE = re.compile(r'[^\u4e00-\u9fa5]')


def keep_chinese(x):
    """Return ``x`` with every character outside U+4E00..U+9FA5 removed."""
    return _NON_CHINESE.sub("", x)


def x_tokenize(x):
    """Tokenize a sentence: drop non-Chinese characters, then segment with jieba.

    Returns the list of tokens produced by ``jieba.lcut``.
    """
    return jieba.lcut(keep_chinese(x))
# Field options used below:
#   sequential: whether the data is a sequence; if not, no tokenization is applied
#   use_vocab:  whether to use a Vocab object; if not, the raw data should already be numeric
#   fix_length: fixed length for sequence data
#   tokenize:   function applied to the raw string, e.g. tokenize = lambda x: x.split()
TEXT = data.Field(sequential=True, tokenize=x_tokenize, fix_length=100,
use_vocab=True)
LABEL = data.Field(sequential=False,
use_vocab=False)
# Load the three CSV splits from dataset/; the header row is skipped and the
# two columns are bound to the text and label fields.
train, dev, test = data.TabularDataset.splits(path='dataset',
train='csv_train.csv',
validation='csv_dev.csv',
test='csv_test.csv',
format='csv',
skip_header=True,
csv_reader_params={'delimiter' : ','},
fields=[('text', TEXT), ('label', LABEL)])
# The vocabulary is built from the training split only.
TEXT.build_vocab(train)
# Batch iterators over the three splits.
train_iter, val_iter, test_iter = data.BucketIterator.splits((train, dev, test),
batch_size = 256,
shuffle = True,
sort = False,
sort_within_batch = False,
repeat = False)
def getTEXT():
    """Return the module-level ``TEXT`` field."""
    return TEXT
def getLabel():
    """Return the module-level ``LABEL`` field."""
    return LABEL
def getIter():
    """Return the ``(train, validation, test)`` iterators as a tuple."""
    iterators = (train_iter, val_iter, test_iter)
    return iterators
3. 构建模型
- [x] TextCNN
- [x] TextRNN
- [x] TextRNN+Attention
- [x] Transformer
- [x] TextRCNN
- [ ] Some other attention
模型定义在 model/ 目录下,在forward最后返回的out的shape: [batch size, num_classes]
Transformer.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import copy
import sys
sys.path.append('./')
import DataSet
# device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
# Input Embedding: H = e + p
class Position_Encoding(nn.Module):
    """Add a fixed sine/cosine position table to the input, then apply dropout.

    Args:
        embed: size of each embedding vector.
        pad_size: number of positions in the table.
        dropout: dropout probability applied to the sum.
        device: stored as ``self.device`` for callers that read it; the table
            itself follows the device of the input passed to ``forward``.
    """

    def __init__(self, embed, pad_size, dropout, device):
        super(Position_Encoding, self).__init__()
        self.device = device
        # Entry [pos][i] starts as pos / 10000^(2 * (i // 2) / embed); even
        # columns are then replaced by their sine and odd columns by their cosine.
        self.pe = torch.tensor(
            [[pos / (10000.0 ** (i // 2 * 2.0 / embed)) for i in range(embed)] for pos in range(pad_size)]
        )
        self.pe[:, 0::2] = torch.sin(self.pe[:, 0::2])
        self.pe[:, 1::2] = torch.cos(self.pe[:, 1::2])
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``dropout(x + table)``; the result has the same shape as ``x``.

        ``x`` is expected to end in ``(..., seq_len, embed)`` with
        ``seq_len <= pad_size``; only the first ``seq_len`` rows of the table
        are added.
        """
        # Move the table once to wherever the input lives instead of wrapping
        # it in a new nn.Parameter and copying it on every call.
        if self.pe.device != x.device:
            self.pe = self.pe.to(x.device)
        out = x + self.pe[:x.size(-2)]
        return self.dropout(out)
# 一个Attention
class Scaled_Dot_Product_Attention(nn.Module):
    """Scaled dot-product attention."""

    def __init__(self):
        super(Scaled_Dot_Product_Attention, self).__init__()

    def forward(self, Q, K, V, scale=None, mask=None):
        """Compute ``softmax(Q @ K^T * scale) @ V``.

        Args:
            Q: query tensor ``[..., len_Q, dim]``.
            K: key tensor ``[..., len_K, dim]``.
            V: value tensor ``[..., len_K, dim_V]``.
            scale: optional factor multiplied into the raw scores (the caller
                passes ``dim ** -0.5``); ``None`` leaves the scores unscaled.
            mask: optional tensor broadcastable to the score matrix; positions
                where it equals 0 are excluded from the attention.

        Returns:
            The attended values ``[..., len_Q, dim_V]``. The attention weights
            themselves are not returned.
        """
        attention = torch.matmul(Q, K.transpose(-2, -1))
        if scale is not None:
            attention = attention * scale
        # ``if mask:`` would raise for any tensor with more than one element,
        # so presence is tested explicitly.
        if mask is not None:
            attention = attention.masked_fill(mask == 0, -1e9)
        attention = F.softmax(attention, dim=-1)
        context = torch.matmul(attention, V)
        return context
# Multi-Head Attention
class Multi_Head_Attention(nn.Module):
    """Multi-head self-attention followed by dropout, a residual add and LayerNorm.

    Args:
        embedding_dim: size of the input and output vectors; must be divisible
            by ``num_head``.
        num_head: number of attention heads.
        dropout: dropout probability applied to the projected output.
    """

    def __init__(self, embedding_dim, num_head, dropout=0.0):
        super(Multi_Head_Attention, self).__init__()
        assert embedding_dim % num_head == 0
        self.num_head = num_head
        # Width of one head: embedding_dim split into num_head equal parts.
        self.dim_head = embedding_dim // self.num_head
        # Four projections: query, key, value and the final output layer.
        self.linears = self.clones(nn.Linear(embedding_dim, embedding_dim), 4)
        self.attention = Scaled_Dot_Product_Attention()
        self.dropout = nn.Dropout(p=dropout)
        self.layer_norm = nn.LayerNorm(embedding_dim)

    def clones(self, module, N=4):
        """Return a ModuleList holding ``N`` deep copies of ``module``."""
        return nn.ModuleList(copy.deepcopy(module) for _ in range(N))

    def forward(self, x, mask=None):
        """Apply self-attention to ``x`` of shape ``[batch, seq_len, embedding_dim]``.

        ``mask``, when given, gets a head dimension inserted at position 1 and
        is forwarded to the attention module. The result has the shape of ``x``.
        """
        batch_size = x.size(0)
        # Project, then reshape
        # [batch, seq_len, num_head, dim_head] -> [batch, num_head, seq_len, dim_head].
        query, key, value = [
            layer(x).view(batch_size, -1, self.num_head, self.dim_head).transpose(1, 2)
            for layer in self.linears[:3]
        ]
        # ``if mask:`` would raise for a multi-element tensor.
        if mask is not None:
            mask = mask.unsqueeze(1)
        scale = key.size(-1) ** -0.5  # 1 / sqrt(dim_head)
        context = self.attention(query, key, value, scale=scale, mask=mask)
        # Merge the heads back: [batch, seq_len, embedding_dim].
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.dim_head * self.num_head)
        out = self.linears[-1](context)
        out = self.dropout(out)
        out = out + x  # residual connection
        out = self.layer_norm(out)
        return out
# Feed Forward + Add + LayerNorm
class Position_wise_Feed_Forward(nn.Module):
    """Two-layer feed-forward block with dropout, a residual add and LayerNorm.

    Args:
        embedding_dim: size of the input and output vectors.
        output_size: width of the hidden layer.
        dropout: dropout probability applied after the second linear layer.
    """

    def __init__(self, embedding_dim, output_size, dropout=0.0):
        super().__init__()
        self.fc1 = nn.Linear(embedding_dim, output_size)
        self.fc2 = nn.Linear(output_size, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        """Return ``LayerNorm(dropout(fc2(relu(fc1(x)))) + x)``."""
        hidden = F.relu(self.fc1(x))
        projected = self.dropout(self.fc2(hidden))
        # Residual connection, then normalisation.
        return self.layer_norm(projected + x)
# Encoder
class Encoder(nn.Module):
    """One encoder layer: multi-head attention followed by the feed-forward block."""

    def __init__(self, embedding_dim, num_head, output_size, dropout):
        super().__init__()
        self.attention = Multi_Head_Attention(embedding_dim, num_head, dropout)
        self.feed_forward = Position_wise_Feed_Forward(embedding_dim, output_size, dropout)

    def forward(self, x):
        """Run ``x`` through the attention sub-layer, then the feed-forward sub-layer."""
        return self.feed_forward(self.attention(x))
# Transformer
class Transformer(nn.Module):
    """Transformer-encoder text classifier.

    Token ids are embedded, position-encoded, passed through ``num_encoder``
    encoder layers, flattened and projected to ``n_class`` logits.
    """

    def __init__(self,
                 vocab_size = len(DataSet.getTEXT().vocab),  # vocabulary size
                 seq_len = 100,
                 n_class = 3,  # number of classes
                 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
                 embed_dim = 300,  # embedding size
                 dropout = 0.5,
                 num_head = 5,  # number of attention heads
                 output_size = 1024,
                 num_encoder = 2,  # number of encoder layers
                 ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Input representation: embedding plus position encoding.
        self.postion_embedding = Position_Encoding(embed_dim, seq_len, dropout, device)
        # Template layer that is deep-copied num_encoder times.
        # NOTE(review): the template stays assigned on the module while
        # forward() only runs the copies - confirm whether that is intended.
        self.encoder = Encoder(embed_dim, num_head, output_size, dropout)
        layer_copies = [copy.deepcopy(self.encoder) for _ in range(num_encoder)]
        self.encoders = nn.ModuleList(layer_copies)
        # Classification head over the flattened sequence.
        self.fc1 = nn.Linear(seq_len * embed_dim, n_class)

    def forward(self, x):
        """Map a batch of token ids to class logits ``[batch size, n_class]``."""
        hidden = self.postion_embedding(self.embedding(x))
        for layer in self.encoders:
            hidden = layer(hidden)
        flat = hidden.view(hidden.size(0), -1)  # [batch size, seq_len * embed_dim]
        return self.fc1(flat)
4. 运行模型
train.py
import torch
import torch.nn.functional as F
from torch import nn, optim
import DataSet
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import time
# from model.TextCNN import TextCNN
# from model.TextRCNN import TextRCNN
# from model.TextRNN import TextRNN
# from model.TextRNN_Attention import TextRNN_Attention
from model.Transformer import Transformer
# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Tag that becomes part of the checkpoint file name.
MODEL_NAME = 'transformer'
model = Transformer().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
# Train / validation / test iterators provided by the DataSet module.
train_iter, dev_iter, test_iter = DataSet.getIter()
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
def binary_acc(preds, target):
    """Return the fraction of rows whose arg-max class equals ``target``.

    ``preds`` holds one row of class scores per sample; the predicted class is
    the index of the largest score in each row.
    """
    predicted = preds.argmax(dim=1)
    hits = torch.eq(predicted, target)
    return hits.sum().item() / len(hits)
def train_model(model, train_iter, optimizer, criterion):
    """Train ``model`` for one pass over ``train_iter``.

    Returns:
        ``(avg_loss, avg_acc)``: the means of the per-batch loss and accuracy.
    """
    model.train()
    # Batches are moved to the device the model's parameters live on, so the
    # loop does not depend on how the iterator was configured.
    model_device = next(model.parameters()).device
    batch_losses = []
    batch_accs = []
    print('trainng..............')
    for batch in train_iter:
        # torch.t swaps the two dimensions of the text tensor before it is
        # fed to the model.
        feature = torch.t(batch.text).to(model_device)
        target = batch.label.to(model_device)
        pred = model(feature)
        loss = criterion(pred, target)
        acc = binary_acc(pred, target)
        batch_losses.append(loss.item())
        batch_accs.append(acc)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    avg_acc = np.array(batch_accs).mean()
    avg_loss = np.array(batch_losses).mean()
    return avg_loss, avg_acc
# 评估函数
def evaluate_model(model, iterator, criterion):
    """Evaluate ``model`` on ``iterator`` without updating it.

    Returns:
        ``(avg_loss, avg_acc)``: the means of the per-batch loss and accuracy.
    """
    batch_losses = []
    batch_accs = []
    model.eval()
    # Batches are moved to the device the model's parameters live on.
    model_device = next(model.parameters()).device
    with torch.no_grad():
        for batch in iterator:
            feature = torch.t(batch.text).to(model_device)
            target = batch.label.to(model_device)
            pred = model(feature)
            loss = criterion(pred, target)
            acc = binary_acc(pred, target)
            batch_losses.append(loss.item())
            batch_accs.append(acc)
    avg_loss = np.array(batch_losses).mean()
    avg_acc = np.array(batch_accs).mean()
    return avg_loss, avg_acc
def test_model(model, iterator, criterion):
    """Evaluate ``model`` on ``iterator`` and print a detailed report.

    Prints the confusion matrix and the per-class classification report.

    Returns:
        ``(avg_loss, avg_acc)``: the means of the per-batch loss and accuracy.
    """
    batch_losses = []
    batch_accs = []
    model.eval()
    # Batches are moved to the device the model's parameters live on.
    model_device = next(model.parameters()).device
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch in iterator:
            feature = torch.t(batch.text).to(model_device)
            target = batch.label.to(model_device)
            pred = model(feature)
            loss = criterion(pred, target)
            acc = binary_acc(pred, target)
            batch_losses.append(loss.item())
            batch_accs.append(acc)
            # Collect labels and predictions for the report below.
            y_true.extend(target.cpu().numpy())
            y_pred.extend(torch.argmax(pred, dim=1).cpu().numpy())
    avg_loss = np.array(batch_losses).mean()
    avg_acc = np.array(batch_accs).mean()
    conf_matrix = confusion_matrix(y_true, y_pred)
    print(conf_matrix)
    target_names = ['差评', '中评', '好评']
    print(classification_report(y_true, y_pred, target_names=target_names))
    return avg_loss, avg_acc
def saveModel(model, name):
    """Save the model's ``state_dict`` to ``done_model/<name>_model.pt``.

    The ``done_model`` directory is created when it does not exist yet.
    """
    import os  # local import: os is not among this module's imports

    os.makedirs('done_model', exist_ok=True)
    torch.save(model.state_dict(), 'done_model/' + name + '_model.pt')
def loadModel(model, name):
    """Load ``done_model/<name>_model.pt`` into ``model``, mapped onto ``device``."""
    checkpoint_path = 'done_model/' + name + '_model.pt'
    state = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(state)
def train():
    """Train the global model for 10 epochs.

    After each epoch the model is evaluated on the validation iterator; it is
    saved whenever the validation accuracy exceeds the best value seen so far.
    """
    best_valid_acc = float('-inf')
    for epoch in range(10):
        start_time = time.time()
        train_loss, train_acc = train_model(model, train_iter, optimizer, criterion)
        dev_loss, dev_acc = evaluate_model(model, dev_iter, criterion)
        elapsed = time.time() - start_time
        epoch_mins, epoch_secs = divmod(elapsed, 60)
        improved = dev_acc > best_valid_acc
        if improved:
            # Keep only the checkpoint with the best validation accuracy.
            best_valid_acc = dev_acc
            saveModel(model, MODEL_NAME)
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs:.2f}s')
        print(f' Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f' Val. Loss: {dev_loss:.3f} | Val. Acc: {dev_acc*100:.2f}%')
def test():
    """Load the saved checkpoint and print its loss and accuracy on the test iterator."""
    loadModel(model, MODEL_NAME)
    metrics = test_model(model, test_iter, criterion)
    test_loss, test_acc = metrics
    print(f'Test. Loss: {test_loss:.3f} | Test. Acc: {test_acc*100:.2f}%')
def predict():
    """Classify one hard-coded sample sentence with the global model and print the result.

    Prints '差评' for class 0, '好评' for class 2 and '中评' otherwise.
    """
    # Local import: this module never binds the name ``data``, so the calls
    # below would otherwise raise NameError.
    from torchtext import data

    sent1 = '垃圾,这个东西最好别买'
    fields = [('text', DataSet.getTEXT()), ('label', DataSet.getLabel())]
    # The label 0 is only a placeholder required by the example format.
    demo = [data.Example.fromlist(data=[sent1, 0], fields=fields)]
    demo_iter = data.BucketIterator(dataset=data.Dataset(demo, fields),
                                    batch_size=256,
                                    shuffle=True,
                                    sort_key=lambda x: len(x.text),
                                    sort_within_batch=False,
                                    device=device,
                                    repeat=False)
    names = {0: '差评', 2: '好评'}
    # Inference must not depend on whether a previous call left the model in
    # training mode, and needs no gradients.
    model.eval()
    with torch.no_grad():
        for batch in demo_iter:
            feature = torch.t(batch.text)
            out = model(feature)
            label = torch.argmax(out, dim=1).item()
            print(names.get(label, '中评'))
if __name__ == '__main__':
    # Run the whole pipeline in order: fit, score on the test split, then
    # classify the demo sentence.
    for step in (train, test, predict):
        step()
trainng..............
Epoch: 01 | Epoch Time: 0.0m 20.97s
Train Loss: 1.593 | Train Acc: 62.34%
Val. Loss: 0.663 | Val. Acc: 75.34%
trainng..............
Epoch: 02 | Epoch Time: 0.0m 20.81s
Train Loss: 0.582 | Train Acc: 74.62%
Val. Loss: 0.562 | Val. Acc: 80.54%
trainng..............
Epoch: 03 | Epoch Time: 0.0m 20.82s
Train Loss: 0.523 | Train Acc: 77.62%
Val. Loss: 0.465 | Val. Acc: 82.23%
trainng..............
Epoch: 04 | Epoch Time: 0.0m 20.81s
Train Loss: 0.480 | Train Acc: 79.32%
Val. Loss: 0.529 | Val. Acc: 81.80%
trainng..............
Epoch: 05 | Epoch Time: 0.0m 20.84s
Train Loss: 0.490 | Train Acc: 79.36%
Val. Loss: 0.461 | Val. Acc: 81.96%
trainng..............
Epoch: 06 | Epoch Time: 0.0m 20.77s
Train Loss: 0.427 | Train Acc: 81.62%
Val. Loss: 0.442 | Val. Acc: 82.23%
trainng..............
Epoch: 07 | Epoch Time: 0.0m 20.79s
Train Loss: 0.416 | Train Acc: 82.37%
Val. Loss: 0.491 | Val. Acc: 81.87%
trainng..............
Epoch: 08 | Epoch Time: 0.0m 20.82s
Train Loss: 0.372 | Train Acc: 83.98%
Val. Loss: 0.447 | Val. Acc: 83.88%
trainng..............
Epoch: 09 | Epoch Time: 0.0m 20.83s
Train Loss: 0.364 | Train Acc: 84.26%
Val. Loss: 0.433 | Val. Acc: 85.02%
trainng..............
Epoch: 10 | Epoch Time: 0.0m 20.79s
Train Loss: 0.346 | Train Acc: 84.89%
Val. Loss: 0.483 | Val. Acc: 82.18%
[[591 95 9]
[192 460 61]
[ 4 19 731]]
precision recall f1-score support
差评 0.75 0.85 0.80 695
中评 0.80 0.65 0.71 713
好评 0.91 0.97 0.94 754
accuracy 0.82 2162
macro avg 0.82 0.82 0.82 2162
weighted avg 0.82 0.82 0.82 2162
Test. Loss: 0.528 | Test. Acc: 82.37%
差评