  • Python 3.5 crawler example: using proxies to keep the crawler from being banned under a site's anti-crawler policy

    Anti-crawler measure: when a single IP visits too frequently, the site blacklists that IP.
    Strategy 1: limit per-IP request frequency and drop connections that exceed it. Counter-measure: slow the crawler down by calling time.sleep before each request (a minimal throttling sketch follows this list), or switch to a different IP.
    Strategy 2: count requests per User-Agent on the back end and block any that exceed a threshold. This causes too much collateral damage, so most sites avoid it.
    Strategy 3: cookie-based blocking. Also rarely used.
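
    To make the counter-measure for Strategy 1 concrete, here is a minimal throttling sketch (the helper name polite_get and the delay bounds are illustrative, not from the original post):

        import random
        import time
        import requests

        def polite_get(url):
            # sleep 1-3 seconds (illustrative values) before each request so the
            # per-IP request frequency stays below the site's ban threshold
            time.sleep(random.uniform(1.0, 3.0))
            return requests.get(url, timeout=10)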

    This example applies these counter-measures to scrape jokes from Qiushibaike.
    import requests
    import re
    import random
    import time

    # First, find a site that publishes proxy IPs and scrape them from it;
    # when the local IP stops working, switch to one of these proxies.

    class Download(object):
        def __init__(self):
            self.ip_list = []  # holds the proxy IPs scraped below
            html = requests.get("http://haoip.cc/tiqu.htm")
            # grab everything between "r/>" and "<b"; re.S lets "." match newlines too
            iplistn = re.findall(r'r/>(.*?)<b', html.text, re.S)
            for ip in iplistn:
                i = re.sub("\n", "", ip)  # re.sub replaces every newline with nothing
                self.ip_list.append(i.strip())  # add the cleaned IP to the list

            self.user_agent_list = [
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
                "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
                "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
                "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
                "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
                "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
            ]

        def get(self, url, timeout, proxy=None, num_retries=6):
            ua = random.choice(self.user_agent_list)  # pick a random User-Agent string
            header = {"User-Agent": ua}  # build the request header

            if proxy is None:  # no proxy given: try a direct request first
                try:
                    response = requests.get(url, headers=header, timeout=timeout)
                    return response
                except requests.RequestException:
                    if num_retries > 0:
                        time.sleep(10)
                        print("Fetch failed, retrying in 10s;", num_retries, "attempts left")
                        # retry without a proxy; pass num_retries by keyword so the
                        # counter is not mistaken for the proxy argument
                        return self.get(url, timeout, num_retries=num_retries - 1)
                    else:
                        print("Switching to a proxy")
                        time.sleep(10)
                        IP = "".join(str(random.choice(self.ip_list)).strip())
                        proxy = {"http": IP}
                        return self.get(url, timeout, proxy)

            else:  # a proxy was requested: pick a fresh one for each attempt
                try:
                    IP = "".join(str(random.choice(self.ip_list)).strip())  # random IP, whitespace stripped
                    proxy = {"http": IP}  # build the proxies dict
                    response = requests.get(url, headers=header, proxies=proxy, timeout=timeout)  # fetch through the proxy
                    return response
                except requests.RequestException:
                    if num_retries > 0:
                        time.sleep(10)
                        IP = "".join(str(random.choice(self.ip_list)).strip())
                        print("Changing proxy, retrying in 10s;", num_retries, "attempts left")
                        print("Current proxy:", proxy)
                        return self.get(url, timeout, proxy, num_retries - 1)
                    else:
                        print("Proxy failed, dropping the proxy")
                        return self.get(url, 3)

    request = Download()
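
    Once instantiated, callers only need request.get(url, timeout); retries and the proxy fallback happen internally. A hypothetical usage sketch (the URL is just a placeholder):

        # fetch a page with a 3-second timeout; on repeated failures the
        # downloader switches to a random proxy from ip_list by itself
        resp = request.get("http://www.qiushibaike.com/", 3)
        print(resp.status_code)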

    Scraping the jokes

    # Scrape jokes from Qiushibaike
    import requests
    from bs4 import BeautifulSoup
    from Download import request  # the Download instance defined in the module above

    def qsbk(url):
        # --- Version 1 (commented out): plain request, no counter-measures ---
        # header={
        #     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        #     'Accept-Encoding': 'gzip, deflate, sdch',
        #     'Accept-Language': 'zh-CN,zh;q=0.8',
        #     'Cache-Control': 'max-age=0',
        #     'Connection': 'keep-alive',
        #     'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
        # }
        # rep=requests.get(url,headers=header)
        # html=rep.text
        # bs=BeautifulSoup(html,"html.parser")
        # body=bs.body   # the <body> part of the HTML document
        # data=body.find_all("div",{"class":"content"})    # a ResultSet of divs
        # for joke in data:
        #     joke_duan=joke.find("span")
        #     if "<br/>" not in str(joke_duan):    # if the span contains <br/>, .string becomes None
        #         print(joke_duan.string)
        #         print("")
        #         # with open("joke.txt","w") as f:
        #         #     f.write(joke_duan.string)

        # --- Version 2: fetch through the retry/proxy downloader ---
        html = request.get(url, 3)
        dz = BeautifulSoup(html.text, "html.parser").find_all("div", {"class": "content"})  # ResultSet of joke divs
        for joke in dz:  # each joke is a chunk of HTML
            duanzi = joke.get_text()
            print(duanzi)

    if __name__ == "__main__":
        url = "http://www.qiushibaike.com/"
        qsbk(url)
    The listing above contains two versions of the scraper: the first (commented out) makes a plain request with no counter-measures; the second fetches through the proxy-enabled downloader.
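
    The commented-out code also hints at writing the jokes to joke.txt; a minimal sketch of that step (append mode and utf-8 encoding are assumptions, not in the original):

        def save_jokes(jokes, path="joke.txt"):
            # append each joke followed by a blank line
            with open(path, "a", encoding="utf-8") as f:
                for joke in jokes:
                    f.write(joke.strip() + "\n\n")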

