zoukankan      html  css  js  c++  java
  • pyspider爬取TripAdvisor

     1 #!/usr/bin/env python
     2 # -*- encoding: utf-8 -*-
     3 # Created on 2017-06-11 10:10:53
     4 # Project: london
     5 
     6 from pyspider.libs.base_handler import *
     7 import pymongo
     8 
     9 
    class Handler(BaseHandler):
        """pyspider handler that scrapes London attraction listings from TripAdvisor.

        The crawl starts at one attraction index page, follows every listing
        link and the pagination "next" link, extracts a handful of text fields
        from each detail page, and stores every non-empty result in MongoDB.
        """

        crawl_config = {
        }

        # Name of the MongoDB collection that receives the scraped records.
        collection_name = 'london'

        # The client and database handle are created once, when the class body
        # is executed, and shared by all callbacks.
        client = pymongo.MongoClient('localhost')
        db = client['trip']

        @every(minutes=24 * 60)
        def on_start(self):
            """Seed the crawl with the first attraction index page (scheduled daily)."""
            self.crawl('https://www.tripadvisor.cn/Attractions-g186338-Activities-c47-London_England.html', callback=self.index_page)

        @config(age=10 * 24 * 60 * 60)
        def index_page(self, response):
            """Queue every attraction linked from an index page, then the next page.

            Args:
                response: pyspider response for an attraction index page.
            """
            for each in response.doc('.listing_title > a').items():
                self.crawl(each.attr.href, callback=self.detail_page)

            # The last index page has no "next" link, so the href is None there;
            # only schedule a follow-up request when a link is actually present.
            next_page = response.doc('.pagination .nav.next').attr.href
            if next_page:
                self.crawl(next_page, callback=self.index_page)

        @config(priority=2)
        def detail_page(self, response):
            """Extract the fields of interest from one attraction detail page.

            Args:
                response: pyspider response for an attraction detail page.

            Returns:
                dict with the keys ``name``, ``url``, ``comment``, ``address``,
                ``phone``, ``duration`` and ``instruction``. Apart from ``url``,
                each value is the text of the elements matched by the
                corresponding CSS selector.
            """
            return {
                "name": response.doc('h1').text(),
                "url": response.url,
                'comment': response.doc('.heading_ratings .taLnk').text(),
                'address': response.doc('.addressReset > span.format_address').text(),
                'phone': response.doc('.phoneNumber').text(),
                'duration': response.doc('#MAP_AND_LISTING > div.main_section.listingbar > div > div.above_fold_listing_details > div > div:nth-child(5) > div > div:nth-child(1)').text(),
                'instruction': response.doc('#MAP_AND_LISTING > div.main_section.listingbar > div > div.above_fold_listing_details > div > div:nth-child(6) > div > b').text()
            }

        def on_result(self, result):
            """Persist a callback result to MongoDB, ignoring empty results.

            Args:
                result: value returned by a callback; falsy values are skipped.
            """
            if result:
                self.save_to_mongo(result)

        def save_to_mongo(self, result):
            """Insert one scraped record into the configured MongoDB collection.

            Args:
                result: dict produced by ``detail_page``.
            """
            # insert_one replaces Collection.insert, which PyMongo deprecated
            # in 3.0 and removed in 4.0.
            outcome = self.db[self.collection_name].insert_one(result)
            if outcome.inserted_id:
                print('saved to mongo', result)
    45     
  • 相关阅读:
    UML类图与类的关系详解
    hadoop中的Partition
    几种排序
    poj 1006
    Hadoop namenode无法启动
    String中intern的方法
    java
    模板方法模式
    里氏替换原则
    按字节数截取字符串
  • 原文地址:https://www.cnblogs.com/themost/p/6985282.html
Copyright © 2011-2022 走看看