Randomly rotating the User-Agent
Switch to a new User-Agent on every URL request.
pip install fake-useragent
settings
DOWNLOADER_MIDDLEWARES = {
    # 'ArticleSpider.middlewares.MyCustomDownloaderMiddleware': 543,
    'ArticleSpider.middlewares.RandomUserAgentMiddleware': 400,
}
middlewares
from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):

    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        # Read RANDOM_UA_TYPE from settings; defaults to "random" when not set.
        # Valid values: random, ie, chrome, firefox, safari, opera, msie
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            """Pick a User-Agent of the type configured by RANDOM_UA_TYPE."""
            return getattr(self.ua, self.ua_type)

        request.headers.setdefault('User-Agent', get_ua())
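The middleware above looks up RANDOM_UA_TYPE in settings.py; to pin the generated agents to one browser family instead of fully random ones, a line like the following can be added (the value shown is just an example):

# settings.py
RANDOM_UA_TYPE = "chrome"   # one of: random, ie, chrome, firefox, safari, opera, msie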
IP proxies
Option 1: free proxies
Write a custom function that collects free proxy IPs from the web (a sketch of such a function follows the middleware code below).
settings
DOWNLOADER_MIDDLEWARES = {
    'ArticleSpider.middlewares.RandomProxyMiddleware': 400,
}
middlewares
class RandomProxyMiddleware(object):
    # Set a proxy dynamically for every request
    def process_request(self, request, spider):
        # get_random_ip() is a custom function returning a random "ip:port" proxy
        request.meta["proxy"] = get_random_ip()
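get_random_ip() itself is not shown in the original; a minimal sketch, assuming the free proxies have already been scraped into a local text file (the file name and format are my assumptions):

import random


def get_random_ip(proxy_file="proxy_list.txt"):
    # proxy_list.txt is assumed to contain one "http://ip:port" entry per line,
    # collected beforehand from a free-proxy site
    with open(proxy_file) as f:
        proxies = [line.strip() for line in f if line.strip()]
    return random.choice(proxies)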
Option 2: paid / off-the-shelf solutions
e.g. scrapy-proxies and similar projects on GitHub
Online CAPTCHA solving
Writing your own recognizer: CAPTCHA recognition is hard to implement and the CAPTCHAs themselves change often, so coding the recognition yourself is not recommended.
Online solving services: call the API of an existing online CAPTCHA-recognition service; recognition rates are above 90% and throughput is high (recommended; see the sketch after this list).
Manual solving: recognition rate close to 100%, but expensive (used for the hard cases).
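Calling such a service generally comes down to uploading the CAPTCHA image and reading the recognized text back; the endpoint, fields and response format below are placeholders, not any real provider's API:

import requests


def solve_captcha(image_path):
    # Hypothetical service URL and parameters -- swap in your provider's real API
    with open(image_path, "rb") as f:
        resp = requests.post("https://captcha-solver.example.com/solve",
                             files={"image": f},
                             data={"apikey": "your-key"})
    return resp.json().get("text")   # assumed response field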
Disabling cookies
Some sites track visitors through cookies. For sites that do not require login, disable cookies to lower the chance of being banned; Scrapy has cookies enabled by default.
COOKIES_ENABLED = False
Auto throttling
Tune the relevant settings, e.g.
AUTOTHROTTLE_ENABLED = True
DOWNLOAD_DELAY = 3
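AutoThrottle has a few more standard Scrapy settings worth pinning down explicitly; the values below are example starting points, not recommendations from the original text:

# settings.py -- additional AutoThrottle knobs
AUTOTHROTTLE_START_DELAY = 5           # initial download delay
AUTOTHROTTLE_MAX_DELAY = 60            # ceiling applied when the server responds slowly
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # average concurrent requests per remote host
AUTOTHROTTLE_DEBUG = False             # set True to log throttling stats for every response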
selenium
Official docs: http://selenium-python-docs-zh.readthedocs.io/zh_CN/latest/
Purpose: drive and automate a real browser
Install selenium
pip install selenium
Download the matching driver for your browser:
http://selenium-python.readthedocs.io/installation.html
Logging in to Zhihu through a third party (Weibo)
import time
from selenium import webdriver
from scrapy.selector import Selector

browser = webdriver.Chrome(executable_path="D:/Package/chromedriver.exe")
time.sleep(2)  # wait for the page to finish loading
browser.get("https://www.zhihu.com/#signin")
browser.find_element_by_css_selector(".qrcode-signin-cut-button").click()
browser.find_element_by_css_selector(".signup-social-buttons").click()
browser.find_element_by_css_selector(".js-bindweibo").click()
# browser.switch_to.window(browser.window_handles[-1])
browser.find_element_by_css_selector(".WB_iptxt").send_keys("xxx")
browser.find_element_by_css_selector("input[node-type='passwd']").send_keys("xxx")
browser.find_element_by_css_selector("a[node-type='submit']").click()
time.sleep(2)  # wait for the page to finish loading
browser.find_element_by_css_selector("a[node-type='submit']").click()
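The find_element_by_css_selector helpers used above were removed in recent Selenium 4 releases; on a new selenium the same calls are written with the By locator API, only the locator style changes:

from selenium.webdriver.common.by import By

browser.find_element(By.CSS_SELECTOR, ".qrcode-signin-cut-button").click()
browser.find_element(By.CSS_SELECTOR, "input[node-type='passwd']").send_keys("xxx")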
Logging in to Zhihu through a third party (QQ)
# -*- coding: utf-8 -*-
__author__ = 'hy'

import time
from selenium import webdriver
from scrapy.selector import Selector

browser = webdriver.Firefox(executable_path="D:/Package/geckodriver.exe")

browser.get("https://www.zhihu.com/#signin")
time.sleep(2)

# Click through to the QQ login
browser.find_element_by_css_selector(".qrcode-signin-cut-button").click()
browser.find_element_by_css_selector(".signup-social-buttons").click()
time.sleep(2)
browser.find_element_by_css_selector(".js-bindqq").click()
time.sleep(5)

browser.switch_to.window(browser.window_handles[-1])
browser.switch_to.frame("ptlogin_iframe")  # iframes have to be entered level by level

# Username / password login:
# hide the initial (QR-code) views
browser.execute_script('document.getElementById("qlogin").style="display: none;"')
browser.execute_script('document.getElementsByClassName("authLogin").style="display: none;"')
# show the username/password form
browser.execute_script('document.getElementById("web_qr_login").style="display: block;"')
# browser.evaluate_script('document.getElementById("batch_quto").contentEditable = true')
time.sleep(5)

# type username and password, then submit
elem_user = browser.find_element_by_name("u").send_keys("xxx")
elem_pwd = browser.find_element_by_name("p").send_keys("xxx")
elem_but = browser.find_element_by_id("login_button").click()
time.sleep(5)
Integrating selenium into Scrapy
Why integrate selenium?
Let selenium stand in for the downloader on the requests that are too hard to script by hand.
Pro: a real browser is much harder for anti-crawling measures to detect and block.
Con: selenium runs synchronously and is slow; making it efficient would require wiring it into Twisted as an asynchronous component.
Middleware approach
Variant 1
settings
DOWNLOADER_MIDDLEWARES = {
    'ArticleSpider.middlewares.JSPageMiddleware': 1,
}
middlewares
from selenium import webdriver
from scrapy.http import HtmlResponse
import time


class JSPageMiddleware(object):

    def __init__(self):
        # Keep one browser on the middleware instance so all spiders share a single Chrome
        self.browser = webdriver.Chrome(executable_path="D:/Package/chromedriver.exe")
        super(JSPageMiddleware, self).__init__()

    # Fetch dynamic pages through Chrome
    def process_request(self, request, spider):
        if spider.name == "jobbole":
            # self.browser = webdriver.Chrome(executable_path="D:/Package/chromedriver.exe")
            self.browser.get(request.url)
            time.sleep(1)
            print("Fetched: {0}".format(request.url))
            # browser.quit()
            return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source,
                                encoding="utf-8", request=request)
Variant 2
middlewares
from scrapy.http import HtmlResponse
import time


class JSPageMiddleware(object):

    # Fetch dynamic pages through the browser that lives on the spider
    def process_request(self, request, spider):
        if spider.name == "jobbole":
            spider.browser.get(request.url)
            time.sleep(1)
            print("Fetched: {0}".format(request.url))
            return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source,
                                encoding="utf-8", request=request)
spider
import scrapy
from selenium import webdriver
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def __init__(self):
        # One browser per spider instance
        self.browser = webdriver.Chrome(executable_path="D:/Package/chromedriver.exe")
        super(JobboleSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    # Called when the spider closes: quit the browser
    def spider_closed(self, spider):
        self.browser.quit()
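scrapy.xlib.pydispatch has been removed from newer Scrapy versions; the same spider_closed hookup can be done through the crawler's own signal manager. A sketch of that standard from_crawler pattern:

import scrapy
from selenium import webdriver
from scrapy import signals


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(JobboleSpider, cls).from_crawler(crawler, *args, **kwargs)
        # connect via the crawler's signal manager instead of scrapy.xlib.pydispatch
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def __init__(self, *args, **kwargs):
        super(JobboleSpider, self).__init__(*args, **kwargs)
        self.browser = webdriver.Chrome(executable_path="D:/Package/chromedriver.exe")

    def spider_closed(self, spider):
        self.browser.quit()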
Using selenium inside Scrapy for simulated login
Why not replace the native downloader with selenium entirely?
selenium is synchronous, so routing every page through it would make the crawler extremely slow, and there is currently no mature way to combine selenium with Scrapy's Twisted-based asynchronous model; replacing the native downloader with selenium is therefore not recommended.
So what is the selenium integration for?
Simulated login is very hard to solve in plain code, so selenium handles it; every other page keeps using the native asynchronous downloader.
# -*- coding: utf-8 -*-
import re
import datetime
try:
    import urlparse as parse
except ImportError:
    from urllib import parse

import scrapy
from selenium import webdriver
import time


class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['https://www.zhihu.com/']
    login_cookies = []

    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    # Log in with selenium and keep the cookies
    def get_cookies(self):
        browser = webdriver.Chrome(executable_path="D:/Package/chromedriver.exe")
        time.sleep(2)  # wait for the page to finish loading
        browser.get("https://www.zhihu.com/#signin")
        browser.find_element_by_css_selector(".qrcode-signin-cut-button").click()
        browser.find_element_by_css_selector(".signup-social-buttons").click()
        browser.find_element_by_css_selector(".js-bindweibo").click()
        # browser.switch_to.window(browser.window_handles[-1])
        browser.find_element_by_css_selector(".WB_iptxt").send_keys("xxx")
        browser.find_element_by_css_selector("input[node-type='passwd']").send_keys("xxx")
        browser.find_element_by_css_selector("a[node-type='submit']").click()
        time.sleep(2)  # wait for the page to finish loading
        browser.find_element_by_css_selector("a[node-type='submit']").click()
        self.login_cookies = browser.get_cookies()  # store on the spider, not in a local variable
        browser.close()

    # Step 1: runs before parse() and handles the login. The cookies passed here are
    # carried automatically by all subsequent requests.
    def start_requests(self):
        return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.headers,
                               cookies=self.login_cookies, callback=self.parse)]

    # Step 2: logic after login
    def parse(self, response):
        # Personal profile page -- only reachable when logged in
        my_url = 'https://www.zhihu.com/people/edit'
        yield scrapy.Request(my_url, headers=self.headers)
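Note that get_cookies() is never called in the spider as written; one way to wire it in, and avoid re-running the selenium login on every crawl, is to call it from start_requests and cache the cookies with pickle. The cache file name and flow below are my assumptions:

import os
import pickle

# inside ZhihuSpider, replacing the start_requests above
def start_requests(self):
    cookie_file = "zhihu_cookies.pkl"               # assumed local cache file
    if os.path.exists(cookie_file):
        with open(cookie_file, "rb") as f:
            self.login_cookies = pickle.load(f)
    else:
        self.get_cookies()                          # selenium login fills self.login_cookies
        with open(cookie_file, "wb") as f:
            pickle.dump(self.login_cookies, f)
    return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.headers,
                           cookies=self.login_cookies, callback=self.parse)]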
Crawling Zhihu articles and Q&A
Debugging with scrapy shell
scrapy shell -s USER_AGENT="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0" https://www.zhihu.com/question/56320032
Page analysis
Install the JSONView extension in Chrome.
Inspect the JSON returned by the XHR requests; pulling data from these APIs is much easier than parsing HTML.
Table design
To guard against fields that fail to parse or rows that fail to insert, give the table columns default values (an illustrative sketch follows).
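For instance, in a MySQL table the columns that may come back empty can carry DEFAULT clauses so the INSERT still succeeds; the table and column names below are illustrative only, not the project's actual schema:

# illustrative DDL kept as a Python string -- adjust names/types to your own items
CREATE_QUESTION_TABLE = """
CREATE TABLE zhihu_question (
    zhihu_id       BIGINT       NOT NULL PRIMARY KEY,
    title          VARCHAR(255) NOT NULL,
    answer_num     INT          NOT NULL DEFAULT 0,
    comments_num   INT          NOT NULL DEFAULT 0,
    watch_user_num INT          NOT NULL DEFAULT 0,
    content        LONGTEXT
)
"""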
Distributed crawling with scrapy-redis
Pros: use the bandwidth and the IP addresses of several machines to speed up crawling (a single machine has to throttle itself to keep its IP from being banned).
Cons: more work to implement than a single-machine crawler.
What a distributed crawler has to solve
Centralized management of the request queue
Centralized de-duplication
Installing Redis on Windows
https://github.com/MicrosoftArchive/redis/releases
Create the project
scrapy startproject ScrapyRedisTest
scrapy-redis: https://github.com/rmax/scrapy-redis
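Turning the project into a scrapy-redis crawler is mostly a settings.py change: point the scheduler and dupefilter at scrapy_redis and tell it where Redis runs. These are documented scrapy-redis settings; the host/port values assume a local default instance:

# settings.py
SCHEDULER = "scrapy_redis.scheduler.Scheduler"               # request queue kept in Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"   # request fingerprints shared in Redis
SCHEDULER_PERSIST = True                                     # keep queue/dupefilter between runs

REDIS_HOST = "localhost"
REDIS_PORT = 6379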
scrapy-redis source code walkthrough
defaults.py — default key names and Redis connection parameters:

import redis

# For standalone use.
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'

PIPELINE_KEY = '%(spider)s:items'

REDIS_CLS = redis.StrictRedis
REDIS_ENCODING = 'utf-8'
# Sane connection defaults.
REDIS_PARAMS = {
    'socket_timeout': 30,
    'socket_connect_timeout': 30,
    'retry_on_timeout': True,
    'encoding': REDIS_ENCODING,
}

SCHEDULER_QUEUE_KEY = '%(spider)s:requests'
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'
SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'

START_URLS_KEY = '%(name)s:start_urls'
START_URLS_AS_SET = False
connection.py — builds the Redis client from the Scrapy settings:

import six

from scrapy.utils.misc import load_object

from . import defaults


# Shortcut maps 'setting name' -> 'parmater name'.
SETTINGS_PARAMS_MAP = {
    'REDIS_URL': 'url',
    'REDIS_HOST': 'host',
    'REDIS_PORT': 'port',
    'REDIS_ENCODING': 'encoding',
}


def get_redis_from_settings(settings):
    """Returns a redis client instance from given Scrapy settings object.

    This function uses ``get_client`` to instantiate the client and uses
    ``defaults.REDIS_PARAMS`` global as defaults values for the parameters.
    You can override them using the ``REDIS_PARAMS`` setting.

    Parameters
    ----------
    settings : Settings
        A scrapy settings object. See the supported settings below.

    Returns
    -------
    server
        Redis client instance.

    Other Parameters
    ----------------
    REDIS_URL : str, optional
        Server connection URL.
    REDIS_HOST : str, optional
        Server host.
    REDIS_PORT : str, optional
        Server port.
    REDIS_ENCODING : str, optional
        Data encoding.
    REDIS_PARAMS : dict, optional
        Additional client parameters.

    """
    # Merge the defaults with whatever REDIS_PARAMS the settings file provides
    params = defaults.REDIS_PARAMS.copy()
    params.update(settings.getdict('REDIS_PARAMS'))
    # XXX: Deprecate REDIS_* settings.
    for source, dest in SETTINGS_PARAMS_MAP.items():
        val = settings.get(source)
        if val:
            params[dest] = val

    # Allow ``redis_cls`` to be a path to a class.
    if isinstance(params.get('redis_cls'), six.string_types):
        params['redis_cls'] = load_object(params['redis_cls'])

    return get_redis(**params)  # delegate to get_redis


# Backwards compatible alias for get_redis_from_settings; it shows this module
# is meant to be imported and used from the other scrapy-redis files.
from_settings = get_redis_from_settings


# Connect to Redis
def get_redis(**kwargs):
    """Returns a redis client instance.

    Parameters
    ----------
    redis_cls : class, optional
        Defaults to ``redis.StrictRedis``.
    url : str, optional
        If given, ``redis_cls.from_url`` is used to instantiate the class.
    **kwargs
        Extra parameters to be passed to the ``redis_cls`` class.

    Returns
    -------
    server
        Redis client instance.

    """
    redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS)
    url = kwargs.pop('url', None)
    if url:
        return redis_cls.from_url(url, **kwargs)
    else:
        return redis_cls(**kwargs)
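Per START_URLS_KEY above, a RedisSpider reads its start URLs from the '%(name)s:start_urls' list in Redis; seeding that list from Python might look like this (the spider name 'zhihu' and a local Redis are assumptions):

import redis

r = redis.StrictRedis(host="localhost", port=6379)
# key format follows START_URLS_KEY = '%(name)s:start_urls' from defaults.py
r.lpush("zhihu:start_urls", "https://www.zhihu.com/")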