zoukankan      html  css  js  c++  java
  • Python爬虫之反爬虫(随机user-agent,获取代理ip,检测代理ip可用性)

    python爬虫之反爬虫(随机user-agent,获取代理ip,检测代理ip可用性)

    目录

    随机User-Agent
    
    获取代理ip
    
    检测代理ip可用性
    

      

    随机User-Agent

    fake_useragent库,伪装请求头

    from fake_useragent import UserAgent

    # The fake_useragent library fabricates realistic browser User-Agent
    # strings for request headers.
    ua = UserAgent()

    # One representative user agent per major browser
    # (ie, opera, chrome, firefox, safari).
    for browser in ("ie", "opera", "chrome", "firefox", "safari"):
        print(getattr(ua, browser))

    # Most practical for crawlers: ua.random returns a different user agent
    # each call, so the request headers keep changing.
    for _ in range(3):
        print(ua.random)
    
    

    获取代理ip

    在免费的代理网站爬取代理ip,免费代理的采集也很简单,无非就是:访问页面 —> 正则/xpath提取 —> 保存

    代理ip网站
    有代理:https://www.youdaili.net/Daili/guonei/
    66代理:http://www.66ip.cn/6.html
    西刺代理:https://www.xicidaili.com/
    快代理:https://www.kuaidaili.com/free/
    
    #根据网页结果,使用正则表达式匹配
    #这种方法适合翻页的网页
    import re
    import requests
    import  time
    
    def get_ip():
        """Scrape free proxy addresses from kuaidaili's paginated listing.

        Returns:
            list[str]: collected proxies formatted as "ip:port".
        """
        url = 'https://www.kuaidaili.com/free/inha/'
        url_list = [url + str(i + 1) for i in range(5)]  # crawl pages 1..5
        print(url_list)
        ip_list = []
        # BUG FIX: the original pattern lost its backslashes ("d{1,3}.d{1,3}...")
        # and so matched literal 'd' characters instead of digits. Use a raw
        # string with \d / \. and compile once, outside the loop.
        matcher = re.compile(
            r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>',
            re.S,
        )
        for page_url in url_list:
            html = requests.get(url=page_url).text
            ipstr = matcher.findall(html)
            time.sleep(1)  # throttle requests so the free-proxy site is not hammered

            for ip, port in ipstr:
                ip_list.append(ip + ':' + port)  # ip+port
        print(ip_list)
        print('共收集到%d个代理ip' % len(ip_list))
        return ip_list
    if __name__ == '__main__':
        get_ip()
    

      

    #先获取特定标签
    #解析
    import requests
    from bs4 import BeautifulSoup
    def get_ip_list(obj):
        """Extract "ip:port" proxy strings from a parsed proxy-list page.

        obj: a BeautifulSoup-style document; table rows with class "odd"
        hold one proxy each, IP in column 1 and port in column 2.
        """
        rows = obj.findAll('tr', {'class': 'odd'})   # every row that carries a proxy entry
        ip_list = []
        for row in rows:
            cells = row.findAll('td')
            # Join the IP (cell 1) and port (cell 2) into one string.
            ip_list.append(cells[1].get_text() + ':' + cells[2].get_text())
        print("共收集到了{}个代理IP".format(len(ip_list)))
        print(ip_list)
        return ip_list
    # Fetch the xicidaili front page with a browser-like User-Agent and hand
    # the parsed DOM to get_ip_list.
    url = 'http://www.xicidaili.com/'
    headers = {
        'User-Agent': 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}
    page_html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(page_html, 'lxml')     # parse the fetched html
    lists = get_ip_list(soup)
    
    

     

     

     

    检测代理ip可用性

    第一种方法:通过返回的状态码判断

    import requests
    import random
    import re
    import time
    
    
    
    def get_ip():
        """Scrape one page of free proxies from kuaidaili.

        Returns:
            list[str]: collected proxies formatted as "ip:port".
        """
        url = 'https://www.kuaidaili.com/free/inha/'
        url_list = [url + str(i + 1) for i in range(1)]  # only page 1
        print(url_list)
        ip_list = []
        # BUG FIX: the original pattern lost its backslashes ("d{1,3}.d{1,3}...")
        # and so matched literal 'd' characters instead of digits. Use a raw
        # string with \d / \. and compile once, outside the loop.
        matcher = re.compile(
            r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>',
            re.S,
        )
        for page_url in url_list:
            html = requests.get(url=page_url).text
            ipstr = matcher.findall(html)
            time.sleep(1)  # throttle requests to the free-proxy site

            for ip, port in ipstr:
                ip_list.append(ip + ':' + port)
        print('共收集到%d个代理ip' % len(ip_list))
        print(ip_list)
        return ip_list
    def valVer(proxys):
        """Check each "ip:port" proxy by fetching baidu through it.

        A proxy counts as good when the request succeeds with HTTP 200
        within 2 seconds; everything else (non-200 or any exception)
        counts as bad. Results are printed; nothing is returned.
        """
        badNum = 0
        goodNum = 0
        good = []
        for proxy in proxys:
            try:
                # BUG FIX: the original keyed the dict on whether "https"
                # appeared in "ip:port" (never true) and omitted the scheme
                # prefix that requests needs. We fetch an http:// URL, so map
                # the 'http' scheme to 'http://ip:port'.
                proxies = {'http': 'http://' + proxy}
                print('现在正在测试的IP:', proxies)
                response = requests.get('http://www.baidu.com', proxies=proxies, timeout=2)
                if response.status_code != 200:
                    badNum += 1
                    print(proxy, 'bad proxy')
                else:
                    goodNum += 1
                    good.append(proxies)
                    print(proxy, 'success proxy')
            except Exception as e:
                # Connection errors / timeouts: record as bad and move on.
                print(e)
                badNum += 1
                continue
        print('success proxy num : ', goodNum)
        print('bad proxy num : ', badNum)
        print(good)
    
    if __name__ == '__main__':
        # Scrape one page of free proxies, then probe each one for usability.
        ip_list=get_ip()
        valVer(ip_list)
    

      

    第二种方法:使用requests包来进行验证

    import requests
    import random
    import re
    import time
    
    
    
    def get_ip():
        """Scrape one page of free proxies from kuaidaili.

        Returns:
            list[str]: collected proxies formatted as "ip:port".
        """
        url = 'https://www.kuaidaili.com/free/inha/'
        url_list = [url + str(i + 1) for i in range(1)]  # only page 1
        print(url_list)
        ip_list = []
        # BUG FIX: the original pattern lost its backslashes ("d{1,3}.d{1,3}...")
        # and so matched literal 'd' characters instead of digits. Use a raw
        # string with \d / \. and compile once, outside the loop.
        matcher = re.compile(
            r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>',
            re.S,
        )
        for page_url in url_list:
            html = requests.get(url=page_url).text
            ipstr = matcher.findall(html)
            time.sleep(1)  # throttle requests to the free-proxy site

            for ip, port in ipstr:
                ip_list.append(ip + ':' + port)
        print(ip_list)
        print('共收集到%d个代理ip' % len(ip_list))
        return ip_list
    def valVer(proxys):
        """Check each "ip:port" proxy by issuing a request through it.

        A proxy counts as good when requests.get completes without raising
        within 2 seconds; any exception counts as bad. Results are printed;
        nothing is returned.
        """
        badNum = 0
        goodNum = 0
        good = []
        for proxy in proxys:
            print("现在正在检测ip", proxy)
            try:
                requests.get('http://wenshu.court.gov.cn/', proxies={"http": "http://" + str(proxy)}, timeout=2)
            except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt
                badNum += 1
                print('connect failed')
            else:
                # BUG FIX: was "goodNum=1", which reset the counter to 1 on
                # every success instead of accumulating.
                goodNum += 1
                good.append(proxy)
                print('success')

        print('success proxy num : ', goodNum)
        print('bad proxy num : ', badNum)
        print(good)
    
    if __name__ == '__main__':
        # Scrape one page of free proxies, then probe each one for usability.
        ip_list=get_ip()
        valVer(ip_list)

    第三种方法:使用telnet

    import telnetlib
    
    
    # Reachability check: try to open a TCP connection to the proxy address.
    # BUG FIX: the original used Python 2 print statements, which are a
    # SyntaxError in Python 3 (the rest of the article is Python 3 code).
    # NOTE(review): telnetlib is deprecated and removed in Python 3.13;
    # socket.create_connection is the modern replacement.
    try:

        telnetlib.Telnet('127.0.0.1', port='80', timeout=20)

    except Exception:  # connection refused / timed out

        print('connect failed')

    else:

        print('success')
    
    

      

  • 相关阅读:
    316 Remove Duplicate Letters 去除重复字母
    315 Count of Smaller Numbers After Self 计算右侧小于当前元素的个数
    313 Super Ugly Number 超级丑数
    312 Burst Balloons 戳气球
    309 Best Time to Buy and Sell Stock with Cooldown 买股票的最佳时间含冷冻期
    Java 类成员的初始化顺序
    JavaScript 全局
    HTML字符实体
    Java中的toString()方法
    JavaScript 弹窗
  • 原文地址:https://www.cnblogs.com/-wenli/p/10211942.html
Copyright © 2011-2022 走看看