Selenium + Scrapy: scraping NetEase News (网易新闻)
MongoDB GUI client:
https://robomongo.org/download (Robo 3T)
Setup guide:
https://www.jianshu.com/p/0578b636f1bf
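Once the pipeline (see the sketch at the end of these notes) has written items into MongoDB, the documents you would browse in Robo 3T can also be spot-checked from Python. A minimal sketch with pymongo, assuming a local server and placeholder database/collection names:

import pymongo

# "wangyipro" / "news" are placeholder names -- use whatever your pipeline actually writes to
client = pymongo.MongoClient('localhost', 27017)
collection = client['wangyipro']['news']

print(collection.count_documents({}))   # how many news items were stored
for doc in collection.find().limit(3):  # peek at a few documents
    print(doc.get('news_title'))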
Proxy test:
import random
import requests
import re
'''
Test proxies by requesting http://icanhazip.com/, which echoes back the IP that reached it.
If the echoed IP matches the proxy picked from the pool, the proxy works;
otherwise the request did not actually go out through the proxy.
'''
# proxy IP pool
PROXIES_NEW = {
    "https": [
        "https://113.226.18.243:80",
        "https://106.56.102.78:808"
    ]
}

lens = len(PROXIES_NEW['https'])
print(lens)

num = 1
while num <= lens:
    try:
        requests.adapters.DEFAULT_RETRIES = 3
        proxies = PROXIES_NEW['https']
        IP = random.choice(proxies)
        # digits of the proxy address we are about to test
        b = re.findall(r'//(\d+\.\d+\.\d+\.\d+):', IP)[0]
        b = b.replace('.', '')
        # send the request through the chosen proxy; icanhazip echoes the caller's IP
        res = requests.get(url="http://icanhazip.com/", timeout=8, proxies={"https": IP})
        proxyIP = res.text
        a = proxyIP.replace('.', '').strip()
        print(a)
        if int(a) == int(b):
            print("Proxy IP '" + proxyIP.strip() + "' works!")
        else:
            print("Echoed IP is not the one from the pool, the proxy is invalid!")
    except Exception:
        print("Proxy IP is invalid!")
    num += 1
Online proxy checker:
http://www.66ip.cn/yz/
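To actually route Scrapy requests through a proxy from such a pool, the usual approach is to set request.meta['proxy'] in a downloader middleware. A hypothetical sketch (this middleware is not part of the wangyipro project shown below):

import random

# placeholder pool, reusing the addresses from the test script above
PROXY_POOL = [
    "https://113.226.18.243:80",
    "https://106.56.102.78:808",
]

class RandomProxyMiddleware(object):
    def process_request(self, request, spider):
        # Scrapy's built-in HttpProxyMiddleware reads the proxy from request.meta['proxy']
        request.meta['proxy'] = random.choice(PROXY_POOL)
        return None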
spider.py :
A RedisSpider-based (scrapy-redis) variant of the spider, showing that the browser is instantiated once in __init__ and closed only after the whole crawl:

from scrapy_redis.spiders import RedisSpider
from selenium import webdriver

class WangyiSpider(RedisSpider):
    name = 'wangyi'
    # allowed_domains = ['www.xxxx.com']
    start_urls = ['https://news.163.com']

    def __init__(self):
        # instantiate a single browser object (only once)
        self.bro = webdriver.Chrome(executable_path='/Users/bobo/Desktop/chromedriver')

    # the browser must be closed only after the whole crawl has finished
    def closed(self, spider):
        print('crawl finished')
        self.bro.quit()

The standalone version used in these notes:

import scrapy
from wangyipro.items import WangyiproItem
from selenium import webdriver
class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']
    model_urls = []  # holds the URLs of the four target sections

    def __init__(self):
        self.bro = webdriver.Chrome(executable_path=r"C:\Users\Administrator\Desktop\爬虫\day06-爬虫\chromedriver.exe")

    def news_content_parse(self, response):
        """
        Parse the body of a news detail page; the item object is passed in via meta.
        :param response:
        :return:
        """
        item = response.meta['item']
        # extract() returns a list of strings
        content_list = response.xpath('//div[@id="endText"]//text()').extract()
        item['news_content'] = "".join(content_list)
        yield item

    def parse_detail(self, response):
        """
        Parse the news list of one section page; here we only grab the news titles.
        :param response:
        :return:
        """
        div_list = response.xpath('//div[@class="data_row news_article clearfix "]')
        for div in div_list:
            # at this point only the title is parsed, not the article body yet
            item = WangyiproItem()
            news_title = div.xpath('./div/div[1]//a/text()').extract_first()
            news_detail_url = div.xpath('./div/div[1]//a/@href').extract_first()
            item['news_title'] = news_title
            # request the detail page; pass the item to the next callback via meta
            yield scrapy.Request(url=news_detail_url, callback=self.news_content_parse, meta={'item': item})

    def parse(self, response):
        """
        Parse the URLs of the four sections.
        extract() may only be called on an xpath result when taking text or attributes.
        :param response:
        :return:
        """
        li_list = response.xpath('//div[@class="ns_area list"]/ul/li')
        indexs = [3, 4, 6, 7]
        model_li_list = []  # holds the li tags of the four sections
        for index in indexs:
            li = li_list[index]
            model_li_list.append(li)
        print(model_li_list)
        # extract the URL of each section
        for li in model_li_list:
            model_url = li.xpath('./a/@href').extract_first()
            self.model_urls.append(model_url)
            # request each section URL; its response will carry the rendered page source
            yield scrapy.Request(url=model_url, callback=self.parse_detail)
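The spider imports WangyiproItem and assigns two fields, news_title and news_content; the project's items.py is not shown in these notes. A minimal sketch matching the fields used above (the real file may define more):

import scrapy

class WangyiproItem(scrapy.Item):
    news_title = scrapy.Field()    # news headline
    news_content = scrapy.Field()  # full article text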
Downloader middleware (middlewares.py):
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import time
from scrapy import signals
from scrapy.http import HtmlResponse
class WangyiproDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def process_request(self, request, spider):
        return None

    # this method intercepts every response (we only need to tamper with some of them)
    def process_response(self, request, response, spider):
        """
        Pick out the responses that need special handling:
        a response is located through its request,
        and the request is located through its URL,
        i.e. the URLs stored in spider.model_urls.
        :param request:
        :param response:
        :param spider:
        :return:
        """
        bro = spider.bro
        if request.url in spider.model_urls:
            # the URL identifies the request,
            # the request identifies the original (unsatisfactory) response,
            # so we build four new response objects that carry the data we actually need
            # and return them in place of the originals
            bro.get(request.url)  # load the URL in the browser
            time.sleep(2)
            js = "window.scrollTo(0, document.body.scrollHeight)"
            bro.execute_script(js)  # scroll to the bottom to trigger lazy loading
            time.sleep(2)
            page_text = bro.page_source  # page source now includes the dynamically loaded data
            # build a new response object; the body parameter carries the response data
            return HtmlResponse(url=bro.current_url, body=page_text, encoding='utf-8', request=request)
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
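The spider_opened callback above only fires if it is connected to the spider_opened signal; Scrapy's generated middleware template normally does that in a from_crawler classmethod, which appears to have been trimmed from this snippet. The standard template boilerplate, for reference:

    # inside WangyiproDownloaderMiddleware (standard Scrapy template code)
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s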
Project settings (settings.py):
BOT_NAME = 'wangyipro'
SPIDER_MODULES = ['wangyipro.spiders']
NEWSPIDER_MODULE = 'wangyipro.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'wangyipro.middlewares.WangyiproDownloaderMiddleware': 543,
}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'wangyipro.pipelines.WangyiproPipeline': 300,
}
LOG_LEVEL = 'ERROR'
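ITEM_PIPELINES enables WangyiproPipeline, but the pipeline itself is not included in these notes. A minimal sketch that stores items into the local MongoDB instance mentioned at the top; the database and collection names ("wangyipro" / "news") are placeholders, not taken from the project:

import pymongo

class WangyiproPipeline(object):
    def open_spider(self, spider):
        # connect once when the spider starts
        self.client = pymongo.MongoClient('localhost', 27017)
        self.collection = self.client['wangyipro']['news']

    def process_item(self, item, spider):
        # insert each news item as a plain dict document
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()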