Most of the main content on these pages is generated by JavaScript, so a plain Scrapy request only receives the empty page shell. To crawl the full rendered content of the page, we apply Selenium inside Scrapy.
wangyiPro.py: the Selenium browser is created on the spider, so the downloader middleware can reach it later through its spider argument.
# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver

from handle5 import items


class WangyiproSpider(scrapy.Spider):
    name = 'wangyiPro'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']

    def __init__(self):
        super().__init__()
        # One shared Chrome instance for the whole crawl; the downloader
        # middleware will reach it as spider.browser.
        self.browser = webdriver.Chrome(r"D:\chromedriver_win\chromedriver")
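The spider opens one browser but never closes it. Scrapy calls a spider's closed() method when the crawl ends, so a minimal cleanup hook (not part of the original code, sketched here) can quit Chrome there:

    def closed(self, reason):
        # Called by Scrapy once the spider finishes; quit the shared
        # browser so no chromedriver process is left behind.
        self.browser.quit()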
middlewares.py: add the Selenium operation to the downloader middleware, swapping the raw response for the browser-rendered page.
from scrapy import signals
from scrapy.http import HtmlResponse
import time


class Handle5DownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        url_list = ['http://news.163.com/world/',
                    'http://news.163.com/domestic/',
                    'http://war.163.com/',
                    'http://news.163.com/air/']
        if request.url in url_list:
            # These section pages are JS-rendered: load them in the
            # spider's shared browser instead of using the raw response.
            spider.browser.get(url=request.url)
            time.sleep(2)  # crude wait for the dynamic content to load
            page_text = spider.browser.page_source
            # Hand Scrapy the rendered HTML in place of the original response.
            return HtmlResponse(url=spider.browser.current_url,
                                body=page_text,
                                encoding='utf-8',
                                request=request)
        return response

    def spider_opened(self, spider):
        # Standard template hook that from_crawler connected above.
        spider.logger.info('Spider opened: %s' % spider.name)
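The middleware only runs if it is enabled in the project's settings.py. A minimal sketch, assuming the project package is named handle5 as in the imports above; the priority 543 is the value from Scrapy's generated template, not something the original text specifies:

# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    # 543 is an assumed priority, taken from Scrapy's default template
    'handle5.middlewares.Handle5DownloaderMiddleware': 543,
}

With this in place, process_response substitutes the browser-rendered HTML for the four section URLs and passes every other response through untouched.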