zoukankan      html  css  js  c++  java
  • Python爬虫之反爬虫(随机user-agent,获取代理ip,检测代理ip可用性)

    python爬虫之反爬虫(随机user-agent,获取代理ip,检测代理ip可用性)

    目录

    随机User-Agent
    
    获取代理ip
    
    检测代理ip可用性
    

      

    随机User-Agent

    fake_useragent库,伪装请求头

    from fake_useragent import UserAgent
    
    ua = UserAgent()
    # ie浏览器的user agent
    print(ua.ie)
    
    # opera浏览器
    print(ua.opera)
    
    # chrome浏览器
    print(ua.chrome)
    
    # firefox浏览器
    print(ua.firefox)
    
    # safri浏览器
    print(ua.safari)
    
    # 最常用的方式
    # 写爬虫最实用的是可以随意变换headers,一定要有随机性。支持随机生成请求头
    print(ua.random)
    print(ua.random)
    print(ua.random)
    

    获取代理ip

    在免费的代理网站爬取代理ip,免费代理的采集也很简单,无非就是:访问页面页面 —> 正则/xpath提取 —> 保存

    代理ip网站
    有代理:https://www.youdaili.net/Daili/guonei/
    66代理:http://www.66ip.cn/6.html
    西刺代理:https://www.xicidaili.com/
    快代理:https://www.kuaidaili.com/free/
    
    #根据网页结果,适用正则表达式匹配
    #这种方法适合翻页的网页
    import re
    import requests
    import  time
    
    def get_ip():
        url='https://www.kuaidaili.com/free/inha/'
        url_list=[url+str(i+1) for i in range(5)] #生成url列表,5代表只爬取5页
        print(url_list)
        ip_list = []
        for i in range(len(url_list)):
            url =url_list[i]
            html = requests.get(url=url,).text
            regip = '<td.*?>(d{1,3}.d{1,3}.d{1,3}.d{1,3})</td>.*?<td.*?>(d{1,5})</td>'
            matcher = re.compile(regip,re.S)
            ipstr = re.findall(matcher, html)
            time.sleep(1)
    
            for j in ipstr:
                ip_list.append(j[0]+':'+j[1])  #ip+port
        print(ip_list)
        print('共收集到%d个代理ip' % len(ip_list))
        return ip_list
    if __name__=='__main__':
        get_ip()
    

      

    #先获取特定标签
    #解析
    import requests
    from bs4 import BeautifulSoup
    def get_ip_list(obj):
        ip_text = obj.findAll('tr', {'class': 'odd'})   # 获取带有IP地址的表格的所有行
        ip_list = []
        for i in range(len(ip_text)):
            ip_tag = ip_text[i].findAll('td')   
            ip_port = ip_tag[1].get_text() + ':' + ip_tag[2].get_text() # 提取出IP地址和端口号
            ip_list.append(ip_port)
        print("共收集到了{}个代理IP".format(len(ip_list)))
        print(ip_list)
        return ip_list
    url = 'http://www.xicidaili.com/'
    headers = {
        'User-Agent': 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}
    request = requests.get(url, headers=headers)
    response =request.text
    bsObj = BeautifulSoup(response, 'lxml')     # 解析获取到的html
    lists=get_ip_list(bsObj)
    

     

     

     

    检测代理ip可用性

    第一种方法:通过返回的状态码判断

    import requests
    import random
    import re
    import time
    
    
    
    def get_ip():
        url='https://www.kuaidaili.com/free/inha/'
        url_list=[url+str(i+1) for i in range(1)]
        print(url_list)
        ip_list = []
        for i in range(len(url_list)):
            url =url_list[i]
            html = requests.get(url=url,).text
            regip = '<td.*?>(d{1,3}.d{1,3}.d{1,3}.d{1,3})</td>.*?<td.*?>(d{1,5})</td>'
            matcher = re.compile(regip,re.S)
            ipstr = re.findall(matcher, html)
            time.sleep(1)
    
            for j in ipstr:
                ip_list.append(j[0]+':'+j[1])
        print('共收集到%d个代理ip' % len(ip_list))
        print(ip_list)
        return ip_list
    def valVer(proxys):
        badNum = 0
        goodNum = 0
        good=[]
        for proxy in proxys:
            try:
                proxy_host = proxy
                protocol = 'https' if 'https' in proxy_host else 'http'
                proxies = {protocol: proxy_host}
                print('现在正在测试的IP:',proxies)
                response = requests.get('http://www.baidu.com', proxies=proxies, timeout=2)
                if response.status_code != 200:
                    badNum += 1
                    print (proxy_host, 'bad proxy')
                else:
                    goodNum += 1
                    good.append(proxies)
                    print (proxy_host, 'success proxy')
            except Exception as e:
                print( e)
                # print proxy_host, 'bad proxy'
                badNum += 1
                continue
        print ('success proxy num : ', goodNum)
        print( 'bad proxy num : ', badNum)
        print(good)
    
    if __name__ == '__main__':
        ip_list=get_ip()
        valVer(ip_list)
    

      

    第二种方法:使用requests包来进行验证

    import requests
    import random
    import re
    import time
    
    
    
    def get_ip():
        url='https://www.kuaidaili.com/free/inha/'
        url_list=[url+str(i+1) for i in range(1)]
        print(url_list)
        ip_list = []
        for i in range(len(url_list)):
            url =url_list[i]
            html = requests.get(url=url,).text
            regip = '<td.*?>(d{1,3}.d{1,3}.d{1,3}.d{1,3})</td>.*?<td.*?>(d{1,5})</td>'
            matcher = re.compile(regip,re.S)
            ipstr = re.findall(matcher, html)
            time.sleep(1)
    
            for j in ipstr:
                ip_list.append(j[0]+':'+j[1])
        print(ip_list)
        print('共收集到%d个代理ip' % len(ip_list))
        return ip_list
    def valVer(proxys):
        badNum = 0
        goodNum = 0
        good=[]
        for proxy in proxys:
            print("现在正在检测ip",proxy)
            try:
                requests.get('http://wenshu.court.gov.cn/', proxies={"http":"http://"+str(proxy)}, timeout=2)
            except:
                badNum+=1
                print('connect failed')
            else:
                goodNum=1
                good.append(proxy)
                print('success')
    
        print ('success proxy num : ', goodNum)
        print( 'bad proxy num : ', badNum)
        print(good)
    
    if __name__ == '__main__':
        ip_list=get_ip()
        valVer(ip_list)

    第三种方法:使用telnet

    import telnetlib
    
    
    try:
    
        telnetlib.Telnet('127.0.0.1', port='80', timeout=20)
    
    except:
    
        print 'connect failed'
    
    else:
        
        print 'success'
    

      

  • 相关阅读:
    leetcode100
    leetcode237
    leetcode171
    leetcode122
    leetcode387
    2018-8-10-win10-uwp-如何打包Nuget给其他人
    2018-8-10-win10-uwp-如何打包Nuget给其他人
    2019-11-13-如何在国内发布-UWP-应用
    2019-11-13-如何在国内发布-UWP-应用
    2019-2-21-PowerShell-通过-WMI-获取设备厂商
  • 原文地址:https://www.cnblogs.com/-wenli/p/10211942.html
Copyright © 2011-2022 走看看