#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2020-04-07 08:14:57
# Project: tripadvisor
from pyspider.libs.base_handler import *
import pymongo
class Handler(BaseHandler):
    """pyspider handler that crawls a TripAdvisor attraction page.

    Starting from one attraction review URL, it follows the links found on
    that page, extracts a few fields from every linked page and stores each
    result in the ``paris`` collection of the local ``trip`` MongoDB database.
    """

    crawl_config = {
    }

    # Shared MongoDB connection, created once when the class is defined.
    client = pymongo.MongoClient('localhost')
    db = client['trip']

    @every(minutes=24 * 60)
    def on_start(self):
        """Schedule the start page."""
        # Put the start URL in the crawl call below.
        self.crawl('https://www.tripadvisor.cn/Attraction_Review-g187147-d188150-Reviews-Musee_d_Orsay-Paris_Ile_de_France.html', callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Schedule a detail crawl for every absolute link on the page.

        The original selected the ``<html>`` element, which has no ``href``
        attribute, so ``crawl`` only ever received ``None`` and no detail
        page was scheduled. Selecting anchors restores the intended fan-out.
        """
        for link in response.doc('a[href^="http"]').items():
            self.crawl(link.attr.href, callback=self.detail_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        """Extract the fields of interest from a fetched page.

        Returns a dict with the page URL, the text matched by the name,
        review-count and address selectors, and the page title.
        """
        # NOTE(review): these selectors depend on the site's markup at the
        # time of writing -- confirm they still match the live pages.
        name = response.doc('.shelf_row_4 .name > a').text()
        num = response.doc('.shelf_row_4 .review_count').text()
        address = response.doc('.adjust > .title').text()
        return {
            "url": response.url,
            "name": name,
            "num": num,
            "address": address,
            "title": response.doc('title').text(),
        }

    def on_result(self, result):
        """Store every non-empty callback result in MongoDB."""
        if result:
            self.save_to_mongo(result)

    def save_to_mongo(self, result):
        """Insert ``result`` into the ``paris`` collection and log it."""
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        outcome = self.db['paris'].insert_one(result)
        if outcome.inserted_id is not None:
            print("save to mongo", result)
# Result: