  • python3.5 crawler example: working around a site's anti-crawler policy by switching to proxy IPs so the crawler does not get banned

    How the site blocks crawlers: an IP that visits too frequently is added to a blacklist.
    Strategy one: limit per-IP request frequency and drop connections that exceed it. Countermeasures: slow the crawler down with a time.sleep before each request, or switch to a different IP.
    Strategy two: count requests per User-Agent on the backend and block any UA that exceeds a threshold. This causes heavy collateral damage, so most sites avoid it.
    Strategy three: cookie-based blocking. Also rarely used by most sites.

    This example applies these countermeasures to scrape jokes from Qiushibaike. A minimal throttling sketch comes first; the full downloader follows it.
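    Before the full downloader, here is a minimal sketch of the two simplest countermeasures named above: sleeping before each request and rotating the User-Agent header. The delay value, the small UA pool, and the polite_get name are illustrative choices, not from the original post.

    import random
    import time

    import requests

    # A small, hypothetical UA pool; the full downloader below carries a longer one.
    UA_POOL = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    ]

    def polite_get(url, delay=2.0, timeout=3):
        time.sleep(delay)  # sleep before every request to stay under the per-IP rate limit
        headers = {"User-Agent": random.choice(UA_POOL)}  # rotate the UA so no single UA trips the threshold
        return requests.get(url, headers=headers, timeout=timeout)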
    #Site anti-crawler measure: an IP that visits too frequently gets blacklisted first.
    #Strategy one: limit per-IP request frequency and drop connections above it; counter it by slowing down (time.sleep before each request) or switching IPs.
    #Strategy two: count requests per User-Agent on the backend and block above a threshold; heavy collateral damage, so most sites avoid it.
    #Strategy three: cookie-based blocking; also rarely used.

    import requests
    import re
    import random
    import time

    #First, find a site that publishes proxy IPs and scrape a list of them; when the local IP gets blocked, fall back to a proxy.

    class download(object):
        def __init__(self):
            self.ip_list = []   # list that stores the scraped proxy IPs
            html = requests.get("http://haoip.cc/tiqu.htm")
            # grab everything between "r/>" and "<b"; re.S lets "." also match newlines
            iplistn = re.findall(r'r/>(.*?)<b', html.text, re.S)
            for ip in iplistn:
                i = re.sub("\n", "", ip)   # re.sub replaces every newline with the empty string
                self.ip_list.append(i.strip())   # append the cleaned IP to the list

            self.user_agent_list = [
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
                "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
                "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
                "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
                "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
                "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
            ]

        def get(self, url, timeout, proxy=None, num_retries=6):
            ua = random.choice(self.user_agent_list)   # pick a random User-Agent string from the pool
            # print(ua)
            header = {"User-Agent": ua}   # build the request header

            if proxy is None:    # no proxy yet: fetch the response directly
                try:
                    response = requests.get(url, headers=header, timeout=timeout)
                    return response
                except Exception:
                    if num_retries > 0:
                        time.sleep(10)
                        print("Error fetching the page; retrying in 10s, retries left:", num_retries)
                        # recurse with one fewer retry, passed by keyword so it does not land in the proxy slot
                        return self.get(url, timeout, num_retries=num_retries - 1)
                    else:
                        print("Switching to a proxy")
                        time.sleep(10)
                        IP = "".join(str(random.choice(self.ip_list)).strip())
                        proxy = {"http": IP}
                        return self.get(url, timeout, proxy)

            else:
                try:
                    IP = "".join(str(random.choice(self.ip_list)).strip())   # pick a random IP and strip whitespace
                    proxy = {"http": IP}   # build the proxies dict
                    response = requests.get(url, headers=header, proxies=proxy, timeout=timeout)   # fetch through the proxy
                    return response
                except Exception:
                    if num_retries > 0:
                        time.sleep(10)
                        print("Changing proxy; retrying in 10s, retries left:", num_retries)
                        print("Current proxy:", proxy)
                        return self.get(url, timeout, proxy, num_retries - 1)   # the recursive call picks a fresh IP
                    else:
                        print("Proxy failed; dropping the proxy")
                        return self.get(url, 3)

    request = download()
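    A quick usage sketch of the downloader above, assuming the file is saved as Download.py (as the second listing imports it); example.com is a placeholder URL.

    from Download import request

    resp = request.get("http://example.com", 3)   # direct fetch with a 3-second timeout
    print(resp.status_code)
    print(resp.text[:200])   # first 200 characters of the page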

    Implementing the joke scraper

    #Scrape the jokes from Qiushibaike
    import requests
    from bs4 import BeautifulSoup
    from Download import request
    def qsbk(url):
        # header={
        #     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        #     'Accept-Encoding': 'gzip, deflate, sdch',
        #     'Accept-Language': 'zh-CN,zh;q=0.8',
        #     'Cache-Control': 'max-age=0',
        #     'Connection': 'keep-alive',
        #     'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
        # }
        # rep=requests.get(url,headers=header)
        # html=rep.text
        # bs=BeautifulSoup(html,"html.parser")
        # body=bs.body   # grab the body of the html document
        # data=body.find_all("div",{"class":"content"})    # result set of matching divs
        # for joke in data:
        #     joke_duan=joke.find("span")
        #     if "<br/>" not in str(joke_duan):    # if a joke contains <br/>, .string becomes None
        #         print(joke_duan.string)
        #         print("")
        #         # with open("joke.txt","w") as f:
        #         #     f.write(joke_duan.string)
        html = request.get(url, 3)
        dz = BeautifulSoup(html.text, "html.parser").find_all("div", {"class": "content"})   # result set of matching divs
        # print(dz)
        # print(len(dz))
        for joke in dz:   # each joke is a chunk of HTML
            duanzi = joke.get_text()
            print(duanzi)

    if __name__ == "__main__":
        url = "http://www.qiushibaike.com/"
        qsbk(url)
    The function above contains two fetch variants: the commented-out first one does not use the anti-ban strategy, while the active second one does (it fetches through the download class).
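    The commented-out lines also hint at writing the jokes to a file; a minimal sketch of that idea, added to the same file (the function name, append mode, and encoding are my choices), could be:

    def qsbk_to_file(url, path="joke.txt"):
        # variant of qsbk() that appends each joke to a file instead of printing it
        html = request.get(url, 3)
        dz = BeautifulSoup(html.text, "html.parser").find_all("div", {"class": "content"})
        with open(path, "a", encoding="utf-8") as f:
            for joke in dz:
                f.write(joke.get_text().strip() + "\n\n")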


  • Original article: https://www.cnblogs.com/eric8899/p/6122759.html