zoukankan      html  css  js  c++  java
  • python_day06(ip代理池)

    from urllib.request import Request, ProxyHandler
    from urllib.request import build_opener
    from bs4 import BeautifulSoup
    import MySQLdb;
    import redis
    from urllib.request import urlopen
    from lxml import etree
    from lxml import etree
    import re;
    # Site root; get_next_page() prefixes it to the "next page" href it extracts.
    urlfront = "http://www.xicidaili.com"
    # First listing page; the crawl loop at the bottom of the script starts here
    # and rebinds this name as it moves from page to page.
    url = "http://www.xicidaili.com/nn/1"
    # Redis client (127.0.0.1:6379, db 0). The script pushes to and pops from the
    # 'mylist' list and pushes to the 'usable_ip' list through it.
    result = redis.Redis(host='127.0.0.1', port=6379,db=0)
    
    # def spider_IP(url):
    # Fetch the entire page
    def get_allcode(url):
        """Download *url* and return its body decoded as UTF-8 text.

        The request is sent with a desktop-browser User-agent header. A fixed
        proxy is registered for the ``https`` scheme only, so URLs using any
        other scheme (including plain ``http://``) are fetched directly.

        Args:
            url: Address of the page to fetch.

        Returns:
            The response body as a ``str``.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        # Proxy configuration (applies to https:// requests only).
        proxy = {'https': '110.73.0.45:8123'}
        opener = build_opener(ProxyHandler(proxy))
        # Set the HTTP request header so the request looks like a browser.
        opener.addheaders = [
            ('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')]
        # The context manager closes the response (and its socket) even when
        # reading or decoding fails; the original left it open.
        with opener.open(url) as response:
            return response.read().decode("UTF-8")
    # Extract the IPs using lxml
    def find_ip(s):
        """Extract "ip:port" proxy entries from a listing page and queue them in Redis.

        The text nodes of every ``<td>`` in table rows whose class is "odd" or
        empty are scanned in the order lxml returns them. A cell containing a
        dot is remembered as an IP address; the next all-digit cell is taken
        as its port, and the resulting "ip:port" string is pushed onto the
        Redis list ``mylist``.

        An address is only queued once its port has been seen. The original
        collected addresses and ports in two separate lists and joined them by
        index, which raised IndexError (or paired the wrong values) whenever
        the two counts differed.

        Args:
            s: HTML source of the listing page.
        """
        selector = etree.HTML(s)
        cells = selector.xpath('//tr[@class="odd"]/td/text()|//tr[@class=""]/td/text()')

        proxies = []
        pending_ip = None  # address still waiting for its port cell
        for cell in cells:
            if '-' in cell:
                # Cells containing a dash are never treated as IP or port
                # (presumably the date columns of the table -- confirm).
                continue
            if cell.isdigit():
                # A digit-only cell with no address in front of it is ignored.
                if pending_ip is not None:
                    proxies.append(pending_ip + ":" + cell)
                    pending_ip = None
            elif '.' in cell:
                pending_ip = cell

        # Store the candidates in the Redis list, one push per entry.
        for entry in proxies:
            result.lpush('mylist', entry)
    def get_next_page(s):
        """Return the absolute URL of the next listing page, or None if there is none.

        Looks for ``<a class="next_page">`` inside ``<div class="pagination">``
        and prefixes its ``href`` with the module-level ``urlfront``.

        Args:
            s: HTML source of the current listing page.

        Returns:
            ``urlfront`` + the first matching href, or ``None`` when the page
            has no such link.
        """
        selector = etree.HTML(s)
        hrefs = selector.xpath('//div[@class="pagination"]/a[@class="next_page"]/@href')
        # Only the first match is ever used, so index it directly instead of
        # looping and returning on the first iteration.
        if not hrefs or hrefs[0] is None:
            return None
        return urlfront + hrefs[0]
    def get_allcode_ip(url, ip):
        """Probe one proxy by fetching *url* through it and record it if it works.

        The proxy counts as usable when the page can be downloaded within the
        timeout and its body decodes as UTF-8. Usable proxies are pushed onto
        the Redis list ``usable_ip``.

        This is a best-effort check: any failure is printed and the proxy is
        skipped; nothing is raised to the caller.

        Args:
            url: Page to request through the proxy.
            ip: Proxy address as "host:port". May be ``bytes`` (the caller
                passes the value returned by ``result.lpop``) or ``str``.
        """
        timeout = 5  # seconds allowed for the request
        try:
            # Convert bytes to str. A str is passed through unchanged; the
            # original called str(ip, encoding=...) unconditionally, which
            # raises TypeError for str input and so rejected such proxies.
            if isinstance(ip, (bytes, bytearray)):
                ip = str(ip, encoding="utf-8")
            opener = build_opener(ProxyHandler({'http': ip}))
            # Set the HTTP request header so the request looks like a browser.
            opener.addheaders = [
                ('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')]
            # Bounded by the timeout; the context manager closes the response.
            with opener.open(url, None, timeout) as response:
                # The decoded text is not needed; reading and decoding only
                # confirms that the proxy delivered a complete UTF-8 page.
                response.read().decode("UTF-8")
            print('+++++++++++++++')
            # Store the working proxy in the Redis list 'usable_ip'.
            result.lpush('usable_ip', ip)
            print(ip)
            print('+++++++++++++++')
        except Exception as err:
            # Deliberately broad: a failing proxy must not stop the crawl.
            print(err)
    # Crawl loop: fetch each listing page, queue the proxies found on it, then
    # probe every queued proxy, following "next page" links until none is left.
    while url is not None:
        print(url)
        s = get_allcode(url)
        # Harvest this page before looking for the next one. The original broke
        # out of the loop first, so the last page's proxies were never collected.
        find_ip(s)
        next_url = get_next_page(s)
        print(next_url)
        # Drain the candidate queue; lpop returns None once 'mylist' is empty.
        while True:
            ip = result.lpop('mylist')
            print(ip)
            if ip is None:
                break
            # Probe against the page just fetched. The original used the next
            # page's URL, which does not exist on the final page.
            get_allcode_ip(url, ip)
        url = next_url
  • 相关阅读:
    将博客搬至CSDN
    Java 知识点
    Java--Socket通信
    java中判断list是否为空的用法
    Subversion代码提交中的org.apache.subversion.javahl.ClientException: svn: E200007: Commit failed异常解决
    springMVC3学习(二)--ModelAndView对象
    深入理解HTTP Session
    很不错的 VBA 网址
    delphi raised exception class EConvertError
    SQL INSERT INTO 语句
  • 原文地址:https://www.cnblogs.com/qieyu/p/7846110.html
Copyright © 2011-2022 走看看