zoukankan      html  css  js  c++  java
  • python3获取代理IP

    在 GitHub 上找了个获取代理 IP 的脚本,发现已经失效了,所以自己改了一下。
    运行环境:Python 3.8。
    # -*- coding:UTF-8 -*-
    from bs4 import BeautifulSoup
    import subprocess as sp
    import requests
    import random
    import re
    import logging

    # Configure the root logger: INFO and above, one timestamped line per record.
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s- %(message)s',
        level=logging.INFO,
    )


    def get_proxys():
        """Scrape the proxy listing page and return the IPv4 addresses it shows.

        Fetches ``http://www.xiladaili.com/gaoni/``, parses the first ``<tbody>``
        on the page and extracts a dotted-quad address from the first ``<td>``
        of every row.

        Returns:
            list[str]: Addresses in page order. Rows whose first cell holds no
            address are skipped, and the list is empty when the page has no
            table body.

        Raises:
            requests.RequestException: If the HTTP request fails or times out.
        """
        target_url = 'http://www.xiladaili.com/gaoni/'
        # Send a browser-like User-Agent with the request.
        target_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                        'Chrome/88.0.4324.146 Safari/537.36'}
        # Compiled once, outside the row loop. The dots are escaped so that only
        # literal "." separators are accepted between the digit groups.
        ip_pattern = re.compile(r'\d+\.\d+\.\d+\.\d+')

        # A Session keeps cookies automatically; the context manager closes
        # its pooled connections when we are done.
        with requests.Session() as session:
            # The timeout keeps an unresponsive site from hanging the script.
            target_response = session.get(url=target_url, headers=target_headers, timeout=10)
        target_response.encoding = 'utf-8'
        target_html = target_response.text

        table_body = BeautifulSoup(target_html, 'lxml').find('tbody')
        if table_body is None:
            logging.warning("no <tbody> found at %s", target_url)
            return []

        proxys_list = []
        for row in table_body.find_all('tr'):
            first_cell = row.find('td')
            if first_cell is None:
                continue
            # get_text() always yields a string, even when the cell wraps its
            # content in nested tags (where .string would be None).
            found = ip_pattern.search(first_cell.get_text())
            if found is None:
                # One malformed row should not discard the rest of the table.
                continue
            proxys_list.append(found.group())

        logging.debug(proxys_list)
        return proxys_list


    def check_ip(ip, lose_time, waste_time, count=4, timeout_ms=4):
        """Ping *ip* and return its average round-trip time in milliseconds.

        Runs ``ping -n <count> -w <timeout_ms> <ip>`` (Windows option syntax),
        decodes the output as GBK and parses it with the supplied patterns.

        Args:
            ip: Address to ping.
            lose_time: Compiled pattern whose first group captures the number
                of lost packets.
            waste_time: Compiled pattern whose first group captures the
                average time.
            count: Number of echo requests to send (``-n``).
            timeout_ms: Per-reply timeout in milliseconds (``-w``).
                NOTE(review): the default of 4 ms is kept from the original
                command but looks very tight -- confirm the intended value.

        Returns:
            int: The parsed average time, or 1000 when ping cannot be started,
            the loss count cannot be parsed, more than two packets were lost,
            or the average time cannot be parsed.
        """
        unreachable_ms = 1000
        max_lost = 2

        # Pass an argument list without a shell: the address comes from a
        # scraped web page and must never be interpreted as shell syntax.
        cmd = ["ping", "-n", str(count), "-w", str(timeout_ms), str(ip)]
        try:
            # run() waits for the child and drains its output, so no pipe is
            # left unread and no process is left behind.
            completed = sp.run(cmd, stdin=sp.DEVNULL, stdout=sp.PIPE, stderr=sp.DEVNULL)
        except OSError:
            # ping is missing or could not be executed.
            return unreachable_ms

        # Undecodable bytes are replaced rather than raising UnicodeDecodeError.
        out = completed.stdout.decode("gbk", errors="replace")

        # No loss figure in the output is treated like a fully lost run.
        lost_matches = lose_time.findall(out)
        if not lost_matches or int(lost_matches[0]) > max_lost:
            return unreachable_ms

        # No average figure is likewise treated as a timeout.
        average_matches = waste_time.findall(out)
        if not average_matches:
            return unreachable_ms
        return int(average_matches[0])


    def initpattern():
        """Compile the patterns used to parse ping statistics.

        The patterns look for the Chinese labels "丢失 = N" (lost) and
        "平均 = Nms" (average) and capture the number N in group 1.

        Returns:
            tuple: ``(lose_time, waste_time)`` -- the compiled lost-packet
            pattern and the compiled average-time pattern.
        """
        # \d+ (not a literal "d") so that the digits themselves are captured.
        lose_time = re.compile(r"丢失 = (\d+)", re.IGNORECASE)
        waste_time = re.compile(r"平均 = (\d+)ms", re.IGNORECASE)
        return lose_time, waste_time


    def proxyip(max_latency_ms=300):
        """Return a randomly chosen scraped proxy IP that answers ping quickly.

        Candidates from ``get_proxys()`` are tried in random order, each one
        at most once, and the first whose average ping time is below
        *max_latency_ms* is returned.

        Args:
            max_latency_ms: Exclusive upper bound on the average time returned
                by ``check_ip`` for an address to be accepted.

        Returns:
            str | None: The selected address, or ``None`` when no candidates
            were scraped or none of them was fast enough.
        """
        lose_time, waste_time = initpattern()

        scraped = get_proxys()
        if not scraped:
            # Covers both None and an empty list from the scraper.
            logging.warning("no proxy candidates available")
            return None

        # Shuffling a copy and walking it once gives a random pick without
        # replacement, so a rejected address is never tested again and the
        # search always terminates.
        candidates = list(scraped)
        random.shuffle(candidates)
        for ip in candidates:
            logging.debug(ip)
            average_time = check_ip(ip, lose_time, waste_time)
            logging.debug(average_time)
            if average_time < max_latency_ms:
                return ip

        logging.warning("none of the %d proxy candidates responded in time", len(candidates))
        return None

    if __name__ == '__main__':
        # Pick a proxy and report it; the call is now properly closed and its
        # result is no longer thrown away.
        selected_proxy = proxyip()
        logging.info("selected proxy: %s", selected_proxy)
     
  • 相关阅读:
    matplotlib 进阶之origin and extent in imshow
    Momentum and NAG
    matplotlib 进阶之Tight Layout guide
    matplotlib 进阶之Constrained Layout Guide
    matplotlib 进阶之Customizing Figure Layouts Using GridSpec and Other Functions
    matplotlb 进阶之Styling with cycler
    matplotlib 进阶之Legend guide
    Django Admin Cookbook-10如何启用对计算字段的过滤
    Django Admin Cookbook-9如何启用对计算字段的排序
    Django Admin Cookbook-8如何在Django admin中优化查询
  • 原文地址:https://www.cnblogs.com/fanpiao/p/15273086.html
Copyright © 2011-2022 走看看