1. Product crawling
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-02-02 08:59:40
# Project: oneDrug
from pyspider.libs.base_handler import *
from pymongo import MongoClient
import re


class Handler(BaseHandler):
    crawl_config = {
    }

    def __init__(self):
        self.client = MongoClient('mongodb://localhost:27017')
        self.drug = self.client.drug

    def insert_goods(self, data):
        # Upsert on goods_id so a re-crawl refreshes the record instead of duplicating it
        collection = self.drug['goods']
        collection.replace_one({'goods_id': data['goods_id']}, data, upsert=True)

    def insert_comments(self, data):
        collection = self.drug['comments']
        collection.insert_one(data)
    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('https://www.111.com.cn/categories/', callback=self.categories_page,
                   validate_cert=False, fetch_type='js')

    @config(age=10 * 24 * 60 * 60)
    def categories_page(self, response):
        for each in response.doc('.allsort em > a').items():
            self.crawl(each.attr.href, callback=self.category_list_page, validate_cert=False, fetch_type='js')

    @config(priority=1)
    def category_list_page(self, response):
        for each in response.doc('#itemSearchList a[target="_blank"][class="product_pic pro_img"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False, fetch_type='js')
        # Follow pagination only while a "next page" link actually exists
        next_page = response.doc('#search_table > div.turnPageBottom > a.page_next').attr.href
        if next_page:
            self.crawl(next_page, callback=self.category_list_page, validate_cert=False, fetch_type='js')
    @config(priority=2)
    def detail_page(self, response):
        goods_id = response.doc('#gallery_view > ul > li.item_number').text()
        # The breadcrumb holds the three category levels
        category_one = response.doc('body > div.wrap.clearfix > div > span:nth-child(3) > a').text()
        category_two = response.doc('body > div.wrap.clearfix > div > span:nth-child(5) > a').text()
        category_three = response.doc('body > div.wrap.clearfix > div > span:nth-child(7) > a').text()
        merchants = response.doc('div.middle_property > span:nth-child(1)').text()
        goods_name = response.doc('div.middle_property > h1').text()
        goods_desc = response.doc('div.middle_property > span.red.giftRed').text()
        goods_price = response.doc(
            'div.middle_property > div.shangpin_info > dl:nth-child(2) > dd > span.good_price').text()
        brand = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(2) > td:nth-child(2)').text()
        spec = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(2) > td:nth-child(4)').text()
        weight = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(3) > td:nth-child(2)').text()
        manufacturers = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(3) > td:nth-child(4)').text()
        approval_number = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(4) > td:nth-child(2)').text()
        drug_type = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(4) > td:nth-child(4)').text()
        # The instruction sheet is a two-column table: row header (th) -> value (td)
        instructions = {}
        if response.doc('#prodDetailCotentDiv > table > tbody > tr:nth-child(1) > th').text():
            for i in range(3, 22):
                instructions_key = response.doc(
                    '#prodDetailCotentDiv > table > tbody > tr:nth-child({}) > th'.format(i)).text().split(" ")[0]
                instructions_value = response.doc(
                    '#prodDetailCotentDiv > table > tbody > tr:nth-child({}) > td'.format(i)).text()
                instructions[instructions_key] = instructions_value
        total_comments = response.doc('#itemComments > span').text()
        good_comments = response.doc('#productExperience > div > ul > li:nth-child(2) > a > span').text()
        mid_comments = response.doc('#productExperience > div > ul > li:nth-child(3) > a > span').text()
        bad_comments = response.doc('#productExperience > div > ul > li:nth-child(4) > a > span').text()
        # The numeric product id is the second number in the detail-page URL
        url_id = re.findall(r'\d+', response.url)[1]
        goods_data = {
            'url_id': url_id,
            'goods_id': goods_id,
            'goods_name': goods_name,
            'goods_desc': goods_desc,
            'goods_price': goods_price,
            'merchants': merchants,
            'category': {
                '1': category_one,
                '2': category_two,
                '3': category_three
            },
            'drug_detail': {
                'brand': brand,
                'spec': spec,
                'weight': weight,
                'manufacturers': manufacturers,
                'approval_number': approval_number,
                'drug_type': drug_type
            },
            'instructions': instructions,
            'comments': {
                'total_comments': total_comments,
                'good_comments': good_comments,
                'mid_comments': mid_comments,
                'bad_comments': bad_comments
            },
            # Flag read by the comment crawler below; new goods start un-crawled
            'commspider': False
        }
        self.insert_goods(goods_data)
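
For a quick sanity check after the crawl has run, the stored goods can be read back directly. A minimal sketch, assuming a local mongod and the drug.goods collection written by insert_goods above:

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
# Print a few stored records to confirm the upsert worked
for doc in client.drug['goods'].find().limit(3):
    print(doc['goods_id'], doc['goods_name'], doc['goods_price'])
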
2. Comment crawling
from pymongo import MongoClient
import requests
from bs4 import BeautifulSoup
import re


class Drug:
    def __init__(self):
        self.client = MongoClient('mongodb://localhost:27017')
        self.drug = self.client.drug
        self.collection = self.drug['goods']
        self.comm_collection = self.drug['comments']
    def dbmodify(self):
        # Normalize the raw scraped fields: goods_id arrives as "商品编号:123456"
        # and goods_price as "¥12.34"; keep only the part after the separator.
        for data in self.collection.find({}, {"goods_id": 1, "goods_price": 1}):
            try:
                _id = data['_id']
                goods_id = data['goods_id'].split(":")[1]
                price = data['goods_price'].split("¥")[1]
                self.collection.update_one({'_id': _id}, {'$set': {'goods_id': goods_id, 'goods_price': price}})
                print(_id, goods_id, price)
            except IndexError:
                # Already normalized: nothing to split
                pass
    def getBaseArgument(self, goods_id):
        base_url = 'https://www.111.com.cn/interfaces/review/list/html.action'
        data = {
            'goodsId': goods_id,
            'pageIndex': 1,
            'score': '1&_19020301'
        }
        try:
            # Mark this goods as visited so it is skipped on the next run
            self.collection.update_one({'url_id': goods_id}, {'$set': {'commspider': True}})
            requests.packages.urllib3.disable_warnings()
            requests.adapters.DEFAULT_RETRIES = 5
            # Intended to disable keep-alive so each request uses a fresh connection
            s = requests.session()
            s.keep_alive = False
            r = s.get(base_url, params=data, timeout=5, verify=False)
            r.close()
            soup = BeautifulSoup(r.text, 'html.parser')
            if soup.find_all("div", class_="view_no_result"):
                return "No Comments!"
            else:
                # The pager shows "共N页" (N pages in total); extract N
                total_page_text = soup.find_all(text=re.compile(r'共\d+页'))[0]
                total_page = re.findall(r'\d+', total_page_text)
                return total_page[0]
        except requests.exceptions.RequestException as e:
            print(e)
    def getCommlist(self, goods_id, total_page):
        base_url = 'https://www.111.com.cn/interfaces/review/list/html.action'
        try:
            # Page indices are 1-based, so include the last page as well
            for i in range(1, int(total_page) + 1):
                data = {
                    'goodsId': goods_id,
                    'pageIndex': i,
                    'score': '1&_19020301'
                }
                try:
                    requests.packages.urllib3.disable_warnings()
                    requests.adapters.DEFAULT_RETRIES = 15
                    # Intended to disable keep-alive so each request uses a fresh connection
                    s = requests.session()
                    s.keep_alive = False
                    r = s.get(base_url, params=data, timeout=5, verify=False)
                    r.close()
                    soup = BeautifulSoup(r.text, 'html.parser')
                    # Each <tr> holds one review: the first <p> is the text,
                    # p.eval_date the review date
                    for tr in soup.find_all("tr"):
                        comments = {}
                        try:
                            comments['goodsId'] = goods_id
                            comments['content'] = tr.find('p').text.strip()
                            comments['date'] = tr.find('p', attrs={'class': 'eval_date'}).text.strip()
                            self.comm_collection.insert_one(comments)
                        except AttributeError:
                            # Row without the expected <p> elements
                            print(goods_id + " has a problem!\n")
                            print(comments)
                except requests.exceptions.RequestException as e:
                    print(e)
        except (ValueError, TypeError):
            # total_page was "No Comments!" or None, so there is nothing to fetch
            return "No Comments! Try next!"
    def getComments(self):
        i = 0
        goods_list = []
        # Only crawl goods whose comments have not been fetched yet
        for data in self.collection.find({'commspider': False}, {"url_id": 1}):
            goods_list.append(data['url_id'])
        length = len(goods_list)
        print("{} goods in total".format(length))
        for good in goods_list:
            total_page = self.getBaseArgument(good)
            comments = self.getCommlist(good, total_page)
            i = i + 1
            print("{} goods in total\nnow at No. {}\ngoods url_id {}\n".format(length, i, good))
            print(comments)


test = Drug().getComments()
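
Since getComments() only selects documents whose commspider flag is False, and the goods_id / goods_price strings need the dbmodify() cleanup before they are usable, a plausible entry point is to run the cleanup once before crawling. A sketch replacing the module-level call above (the original script only invokes getComments()):

if __name__ == '__main__':
    drug = Drug()
    drug.dbmodify()     # one-off normalization of goods_id / goods_price
    drug.getComments()  # then fetch comments for every un-crawled goods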