  • Python crawler: building a Redis IP proxy pool with hundreds of thousands of usable IPs

    The script below crawls the free proxy listings on kuaidaili.com page by page, tests each IP against a live site, and stores the working proxies both in a Redis set and in a backup file.

    from lxml import etree
    import requests, os, time, random, redis

    conn = redis.Redis(host='127.0.0.1', port=6379, db=0, decode_responses=True)

    def get_ip(page_url, headers, cookies, sui_ji_time):
        """
        Crawl one listing page and assemble each IP into proxy-URL format.
        :param page_url: URL of the listing page to crawl
        :param headers: request headers
        :param cookies: request cookies
        :param sui_ji_time: random sleep interval (seconds) chosen by the caller
        :return:
        """
        try:
            print('{}--{}--{}--{}'.format('sleep interval', sui_ji_time, 'crawling', page_url))
            response = requests.get(page_url, headers=headers, cookies=cookies).text
            json_lxml = etree.HTML(response)
            table = json_lxml.xpath('//*[@id="list"]/table/tbody/tr')
            for i in table:
                html_ip = i.xpath('.//td[1]/text()')[0]       # IP address
                html_ip_port = i.xpath('.//td[2]/text()')[0]  # port
                html_ip_lei = i.xpath('.//td[4]/text()')[0]   # type reported by the site: HTTP or HTTPS
                daili_ip = 'http://{}:{}'.format(html_ip, html_ip_port)
                # The original if/else branched on html_ip_lei but called ceshi_ip
                # identically in both branches, so a single call suffices.
                ceshi_ip(headers, cookies, sui_ji_time, daili_ip, html_ip_lei)
        except Exception:
            print('{}--{}--{}--{} ========= failed'.format('sleep interval', sui_ji_time, 'crawling', page_url))

    def ceshi_ip(headers, cookies, sui_ji_time, daili_ip, html_ip_lei):
        """
        Test whether a crawled IP is usable; store the ones that are.
        :param headers:
        :param cookies:
        :param sui_ji_time:
        :param daili_ip: proxy address, e.g. 'http://1.2.3.4:8080'
        :param html_ip_lei: proxy type reported by the site ('HTTP' or 'HTTPS')
        :return:
        """
        print(daili_ip, '@@@@@@@@@@@@')
        try:
            # requests expects lowercase scheme keys in the proxies dict;
            # the timeout keeps a dead proxy from hanging the whole run.
            requests.get('http://wenshu.court.gov.cn/',
                         proxies={html_ip_lei.lower(): daili_ip}, timeout=10)
        except Exception:
            print('{}>>{}'.format(daili_ip, 'unusable'))
        else:
            print('{}>>{}'.format(daili_ip, 'usable'))
            # Store in the Redis set 'proxy'
            try:
                conn.sadd('proxy', '{}+{}'.format(html_ip_lei, daili_ip))
                print('stored in redis')
            except Exception:
                print('failed to store in redis')
            # Raw string so the backslashes are not treated as escapes
            root_dir = r'D:\web_xiangmu\biquge_tushu\代理'
            if not os.path.exists(root_dir):
                os.mkdir(root_dir)
                print('directory created')
            # Also append to a file so the data survives a Redis loss
            try:
                with open(os.path.join(root_dir, 'daili.text'), 'a+') as mon:
                    mon.write('{}+{}\n'.format(html_ip_lei, daili_ip))
                print('{}>>>{}'.format(daili_ip, 'written'))
            except Exception:
                print('write failed')




    def main():
        """
        Crawl every page of the proxy site: read the last page number
        from the pagination bar, then iterate over all page URLs.
        :return:
        """
        url = 'https://www.kuaidaili.com/free/inha/'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
            'Referer': 'https://www.kuaidaili.com/free/inha/',
        }
        cookies = {
            'Cookie': 'channelid=0; sid=1575640807483263; _ga=GA1.2.757045199.1575642271; _gid=GA1.2.1903168241.1575642271; _gat=1; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1575642272,1575686420; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1575686420',
        }
        try:
            response = requests.get(url, headers=headers, cookies=cookies).text
            json_lxml = etree.HTML(response)
            ip_page = json_lxml.xpath('//ul/li[9]/a/text()')[0]      # number of the last page
            ip_page_href = json_lxml.xpath('//ul/li[9]/a/@href')[0]  # its href, e.g. '/free/inha/.../'
            sui_ji_time = random.choice(list_time_sleep)
            for page in range(1, int(ip_page) + 1):
                page_url = 'https://www.kuaidaili.com/{}/{}/{}'.format(
                    ip_page_href.split('/')[1], ip_page_href.split('/')[2], page)
                time.sleep(sui_ji_time)
                get_ip(page_url, headers, cookies, sui_ji_time)
        except Exception:
            print('program crashed')

    if __name__ == '__main__':
        list_time_sleep = [5, 10, 15]
        zhu_sui_ji_time = random.choice(list_time_sleep)
        print('main program sleeping a random {} seconds'.format(zhu_sui_ji_time))
        time.sleep(zhu_sui_ji_time)
        main()
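
    Once the crawler has run, each working proxy sits in the Redis set 'proxy' as a 'TYPE+http://ip:port' string. A minimal sketch for inspecting the pool, assuming the same local Redis instance as above:

    import redis

    conn = redis.Redis(host='127.0.0.1', port=6379, db=0, decode_responses=True)

    # total number of proxies collected so far
    print(conn.scard('proxy'))

    # a few random members, e.g. 'HTTP+http://1.2.3.4:8080'
    for member in conn.srandmember('proxy', 5):
        print(member)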


    """
    import redis,requests
    conn = redis.Redis(host='127.0.0.1',port=6379,db=0,decode_responses=True)
    ip = conn.srandmember('proxy')
    ip_add = ''.join(ip).split('+')
    zhen_ip = ip_add
    dict1 = {}
    # # 使用IP代理访问百度,测试代理地址是否有效
    try:
    requests.get('http://wenshu.court.gov.cn/', proxies={zhen_ip[0]: zhen_ip[1]})
    print('{}---{}>>>{}'.format(zhen_ip[0],zhen_ip[1],'可用'))
    except:
    #删除没用的ip
    conn.srem('proxy',zhen_ip[1] )
    print('{}---{}>>>{}'.format(zhen_ip[0], zhen_ip[1], '不可用'))
    dict1 = {zhen_ip[0]:zhen_ip[1]}}


    print(dict1)

    #<<<proxies=dict1>>>在请求头部添加这个参数就可以正常使用了
    """
  • Original post: https://www.cnblogs.com/duanlinxiao/p/12001618.html