zoukankan      html  css  js  c++  java
  • python_day06(ip代理池)

    from urllib.request import Request, ProxyHandler
    from urllib.request import build_opener
    from bs4 import BeautifulSoup
    import MySQLdb;
    import redis
    from urllib.request import urlopen
    from lxml import etree
    from lxml import etree
    import re;
    # Site root; get_next_page() prepends it to the relative "next page" href.
    urlfront = "http://www.xicidaili.com"
    # First listing page to crawl; the crawl loop rebinds this name as it
    # advances through the pagination.
    url = urlfront + "/nn/1"
    # Shared redis client: 'mylist' holds scraped candidates and 'usable_ip'
    # holds proxies that passed the connectivity check.
    result = redis.Redis(
        host='127.0.0.1',
        port=6379,
        db=0,
    )
    
    # def spider_IP(url):
    # Fetch the full HTML of a page
    def get_allcode(url):
        """Download ``url`` and return the response body decoded as UTF-8 text.

        The request is sent through an opener configured with a fixed proxy
        mapping and a browser-like ``User-agent`` header.

        Args:
            url: Address of the page to download.

        Returns:
            The page content as ``str``.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        # NOTE(review): the mapping only registers a proxy for the 'https'
        # scheme, while the URLs built in this file start with http:// --
        # confirm whether the proxy is meant to apply to them.
        proxy = {'https': '110.73.0.45:8123'}
        opener = build_opener(ProxyHandler(proxy))
        # Present a browser User-agent instead of urllib's default one.
        opener.addheaders = [
            ('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')]
        # The context manager closes the response (and its socket) even when
        # reading or decoding raises.
        with opener.open(url) as response:
            return response.read().decode("UTF-8")
    # Extract the IPs with lxml
    def find_ip(s):
        """Extract ``ip:port`` pairs from listing HTML and push them to redis.

        Each ``<tr>`` whose class is ``"odd"`` or ``""`` is inspected on its
        own, so an address is only ever combined with a port found in the
        same table row. Rows that lack either value are skipped instead of
        shifting the pairing of every following row.

        Args:
            s: HTML text of a listing page.

        Returns:
            None. Every pair found is pushed onto the redis list ``'mylist'``.
        """
        selector = etree.HTML(s)
        rows = selector.xpath('//tr[@class="odd"]|//tr[@class=""]')
        for row in rows:
            ip = None
            port = None
            for text in row.xpath('td/text()'):
                # Surrounding whitespace would defeat isdigit() and would end
                # up inside the stored proxy address.
                text = text.strip()
                if '-' in text:
                    # Cells containing '-' are ignored, as before.
                    continue
                if text.isdigit():
                    if port is None:
                        port = text
                elif '.' in text:
                    if ip is None:
                        ip = text
            if ip is not None and port is not None:
                # Store the candidate in the redis list.
                result.lpush('mylist', ip + ":" + port)
    def get_next_page(s):
        """Return the absolute URL of the next listing page.

        Args:
            s: HTML text of the current listing page.

        Returns:
            ``urlfront`` joined with the first ``href`` of an
            ``<a class="next_page">`` inside ``<div class="pagination">``,
            or ``None`` when the page has no such link.
        """
        selector = etree.HTML(s)
        hrefs = selector.xpath('//div[@class="pagination"]/a[@class="next_page"]/@href')
        # Only the first match is used; an empty result means this is the
        # last page.
        if not hrefs:
            return None
        return urlfront + hrefs[0]
    def get_allcode_ip(url,ip):
        """Check whether proxy ``ip`` can fetch ``url`` and record it if so.

        The page is requested through the proxy with a 5 second timeout. If
        the request and the UTF-8 decoding of the body both succeed, the
        proxy is pushed onto the redis list ``'usable_ip'``. Any failure is
        printed and otherwise ignored, so one bad proxy never stops the crawl.

        Args:
            url: Page to request through the proxy.
            ip: Proxy address as ``host:port``; either ``bytes`` (as read
                back from redis) or ``str``.

        Returns:
            None.
        """
        try:
            # Values popped from redis arrive as bytes; accept str as well so
            # the function also works with already-decoded input.
            if isinstance(ip, bytes):
                ip = str(ip, encoding="utf-8")
            timeout = 5
            opener = build_opener(ProxyHandler({'http': ip}))
            # Present a browser User-agent instead of urllib's default one.
            opener.addheaders = [
                ('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')]
            # Bounded wait so a dead proxy cannot hang the crawl; the context
            # manager closes the response whatever happens.
            with opener.open(url, None, timeout) as response:
                # Reading and decoding the body is part of the check: a proxy
                # that returns undecodable data is treated as unusable.
                response.read().decode("UTF-8")
            print('+++++++++++++++')
            # Record the working proxy in the redis list of usable proxies.
            result.lpush('usable_ip', ip)
            print(ip)
            print('+++++++++++++++')
        except Exception as err:
            # Deliberate best effort: report the failure and move on.
            print(err)
    # Crawl the listing page by page: scrape candidates from each page, then
    # drain the candidate queue, checking every proxy.
    while 1:
        print(url)
        s = get_allcode(url)
        # Harvest the current page before looking at the pagination, so the
        # proxies on the last page (which has no "next" link) are not lost.
        find_ip(s)
        next_url = get_next_page(s)
        print(next_url)
        while 1:
            ip = result.lpop('mylist')
            print(ip)
            if ip is None:
                # Queue drained: every candidate from this page was checked.
                break
            # Check against the page that was just downloaded; on the last
            # page there is no next URL to use as a target.
            get_allcode_ip(url, ip)
        if next_url is None:
            break
        url = next_url
  • 相关阅读:
    'IDataObject': ambiguous symbol的解决方法
    捕获windows系统的sleep或hibernate状态
    CallingConvention理解
    Exception from HRESULT: 0x8001010D (RPC_E_CANTCALLOUT_ININPUTSYNCCALL))
    .Net Managed C++如何获取当前线程id和当前进程id
    div垂直居中于div中
    父级是relative,子级为absolute的情况下,子级宽度自适应
    background-img高度固定,图片自适应
    如何让两个input紧挨着.
    C# 调用百度短链接api
  • 原文地址:https://www.cnblogs.com/qieyu/p/7846110.html
Copyright © 2011-2022 走看看