Based on the requests module
# Dynamically loaded JS data
import requests
import re
import json
from lxml import etree

url = 'https://temp.163.com/special/00804KVA/cm_war.js?callback=data_callback'
js_data = requests.get(url=url).text

# The response is JSONP: data_callback([...]); strip the callback wrapper first
ex = r'data_callback\((.*)\)'
list_str = re.findall(ex, js_data, re.S)[0]
list_obj = json.loads(list_str)

for dic in list_obj:
    title = dic['title']
    detail_url = dic['docurl']
    # Request each article's detail page and extract its text
    page_text = requests.get(url=detail_url).text
    tree = etree.HTML(page_text)
    content = tree.xpath('//*[@id="endText"]//text()')
    content = ''.join(content).replace('\n', '').replace('\t', '')
    print(content)
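The cm_war.js response is JSONP: a JSON array wrapped in a data_callback(...) call, which is why the regular expression above strips the wrapper before handing the string to json.loads. A minimal, self-contained illustration of just that step (the sample payload below is made up):

import re
import json

# Made-up sample of what the JSONP response looks like
sample = 'data_callback([{"title": "some headline", "docurl": "https://news.163.com/xxx.html"}])'

# Capture everything between 'data_callback(' and the final ')'
list_str = re.findall(r'data_callback\((.*)\)', sample, re.S)[0]
list_obj = json.loads(list_str)
print(list_obj[0]['title'])  # -> some headline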
Below is how Scrapy is used together with Selenium.
# spider.py
# -*- coding: utf-8 -*-
import scrapy
from Net163.items import Net163Item
from selenium import webdriver
from selenium.webdriver import ChromeOptions


class NetPageSpider(scrapy.Spider):
    option = ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Create a browser object (shared with the downloader middleware via spider.bro)
    bro = webdriver.Chrome(executable_path=r'C:\spider\scrapy1\chromedriver.exe', options=option)
    name = 'net_page'
    model_urls = []  # holds the URLs of the 4 news-section pages
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com']

    # Article detail page
    def content_parse(self, response):
        item = response.meta['item']
        # Parse the article body and store it in the item
        # extract() returns a list of strings
        content_lst = response.xpath('//div[@id="endText"]//text()').extract()
        # Join the list into one string and strip whitespace
        item['desc'] = ''.join(content_lst).replace('\n', '').replace('\t', '').replace('\u3000', '')
        yield item

    # Section (board) page
    def detail_parse(self, response):
        div_lst = response.xpath('//div[@class="ndi_main"]/div')
        for div in div_lst:
            item = Net163Item()
            title = div.xpath('./div/div[1]/h3/a/text()').extract_first()
            new_detail_url = div.xpath('./div/div[1]/h3/a/@href').extract_first()
            item['title'] = title
            # meta is a dict; every key/value pair in it is passed on to the specified callback
            yield scrapy.Request(url=new_detail_url, callback=self.content_parse, meta={'item': item})

    # Callback for start_urls
    def parse(self, response):
        li_lst = response.xpath('//div[@class="ns_area list"]/ul/li')
        indexs = [3, 4, 6, 7]  # positions of the 4 target sections in the nav list
        model_lst = []  # section <li> nodes
        for index in indexs:
            li = li_lst[index]
            model_lst.append(li)
        # Parse each section's URL
        for li in model_lst:
            model_url = li.xpath('./a/@href').extract_first()
            self.model_urls.append(model_url)
            # Request each section page to get its content
            yield scrapy.Request(url=model_url, callback=self.detail_parse)

    # Close the browser when the spider finishes
    def closed(self, spider):
        self.bro.quit()
# items.py
import scrapy


class Net163Item(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    desc = scrapy.Field()
# pipelines.py
# Pipelines handle persistence: txt, MySQL, Redis, MongoDB, etc.
# Multiple pipeline classes can be written and registered in settings.py.
class Net163Pipeline(object):
    def process_item(self, item, spider):
        print(item['title'], len(item['desc']))
        return item
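As the comment above notes, several pipeline classes can coexist. Here is a minimal sketch of an extra pipeline that persists each item to a text file; the class name Net163TxtPipeline and the file name net163.txt are made up for illustration, and it would also need its own entry in ITEM_PIPELINES:

# A hypothetical extra pipeline that writes items to a txt file
class Net163TxtPipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts
        self.fp = open('net163.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.fp.write(item['title'] + ':' + item['desc'] + '\n')
        return item  # pass the item on to the next pipeline

    def close_spider(self, spider):
        # called once when the spider closes
        self.fp.close()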
# Changes in settings.py
USER_AGENT = ''  # fill in a real browser User-Agent string to get past UA checks
ROBOTSTXT_OBEY = False  # ignore the robots.txt protocol
DOWNLOADER_MIDDLEWARES = {  # downloader middleware
    'Net163.middlewares.Net163DownloaderMiddleware': 543,
}
ITEM_PIPELINES = {  # item pipeline classes
    'Net163.pipelines.Net163Pipeline': 300,
}
LOG_LEVEL = 'ERROR'  # log level
# middlewares.py
# -*- coding: utf-8 -*-
from time import sleep
from scrapy import signals
from scrapy.http import HtmlResponse


class Net163DownloaderMiddleware(object):
    # Class method
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # Handle requests
    def process_request(self, request, spider):
        return None

    # This method intercepts every response object
    # (only the responses for certain requests need special handling)
    def process_response(self, request, response, spider):
        # Locate the responses that need processing: a response is identified
        # through its request, and the request is identified by its URL.
        model_urls = spider.model_urls
        bro = spider.bro
        if request.url in model_urls:
            # The original responses for the 4 section pages lack the dynamically
            # loaded data, so build new response objects by hand: load the page in
            # the browser, grab the rendered source, and wrap it in a new response
            # that replaces the original one.
            bro.get(request.url)  # open the section URL in the browser
            sleep(2)
            js = 'window.scrollTo(0,document.body.scrollHeight)'
            bro.execute_script(js)
            sleep(2)
            # The page source now includes the dynamically loaded data
            page_text = bro.page_source
            # Manually create a new response object with page_text as its body
            return HtmlResponse(url=bro.current_url, body=page_text, encoding='utf-8', request=request)
        # Return the original response unchanged
        return response

    # Handle exceptions
    def process_exception(self, request, exception, spider):
        pass

    # Called when the spider is opened
    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
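To run the whole project without typing the command in a terminal each time, a small launcher script can be placed next to scrapy.cfg. A minimal sketch (the file name main.py is arbitrary; it assumes the spider name net_page defined above):

# main.py -- equivalent to running `scrapy crawl net_page` on the command line
from scrapy import cmdline

cmdline.execute('scrapy crawl net_page'.split())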