zoukankan      html  css  js  c++  java
  • scrapy实战9动态设置ip代理从数据库中随机获取一个可用的ip:

    在目录下创建tools(python package) 在tools中创建crawl_xici_ip.py文件写入代码如下:

    #coding=utf-8
    import requests
    from scrapy.selector import Selector
    import pymysql
    
    conn = pymysql.connect(host="127.0.0.1", user="username", passwd="userpassword", db="proxy_ip", charset="utf8")
    cursor = conn.cursor()
    
    def crawl_ips():
        """Crawl free proxy IPs from xicidaili.com and store them in MySQL.

        Walks the paginated /nn/ listing, parses each table row into
        (ip, port, proxy_type, speed) and inserts the rows with a
        parameterized query. Relies on the module-level `cursor`/`conn`.

        Fixes over the original:
        - `split("")` raised ValueError (empty separator); the speed title
          looks like "0.181秒", so we split on the unit suffix instead.
        - `speed` could be unbound when the title attribute was empty.
        - `extract()[0]` could IndexError on malformed rows.
        - The INSERT was built by string formatting (SQL injection) and
          hard-coded proxy_type='HTTP' while discarding the scraped value.
        """
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"}
        for i in range(1568):
            # Don't shadow the stdlib name `re` with the HTTP response.
            resp = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers)

            selector = Selector(text=resp.text)
            all_trs = selector.css("#ip_list tr")

            ip_list = []
            for tr in all_trs[1:]:  # first row is the table header
                speed = 0.0
                speed_strs = tr.css(".bar::attr(title)").extract()
                if speed_strs and speed_strs[0]:
                    # title is e.g. "0.181秒" — strip the unit suffix.
                    speed = float(speed_strs[0].split("秒")[0])

                all_texts = tr.css("td::text").extract()
                if len(all_texts) < 6:
                    continue  # malformed/ad row: skip instead of crashing

                ip = all_texts[0]
                port = all_texts[1]
                proxy_type = all_texts[5]

                ip_list.append((ip, port, proxy_type, speed))

            for ip_info in ip_list:
                # Parameterized query: safe against SQL injection and
                # keeps the scraped proxy_type instead of hard-coding it.
                cursor.execute(
                    "INSERT INTO proxy(ip, port, speed, proxy_type) VALUES(%s, %s, %s, %s)",
                    (ip_info[0], ip_info[1], ip_info[3], ip_info[2])
                )
            # Commit once per page rather than once per row.
            conn.commit()
    
    
    
    class GetIP(object):
        """Validate proxies stored in MySQL and hand out a random working one.

        Uses the module-level `cursor`/`conn` connection; bad proxies are
        deleted from the table as they are discovered.
        """

        def delete_ip(self, ip):
            """Delete an unusable proxy row from the database.

            Uses a parameterized query — the original string-formatted SQL
            was open to injection.
            """
            cursor.execute("DELETE FROM proxy WHERE ip=%s", (ip,))
            conn.commit()
            return True

        def judge_ip(self, ip, port, timeout=5):
            """Return True if the proxy answers a test request, else delete it.

            A request timeout (default 5s, new keyword-only-compatible
            parameter) keeps a dead proxy from hanging the caller forever —
            the original call had no timeout at all.
            """
            http_url = "http://www.baidu.com"
            proxy_url = "http://{0}:{1}".format(ip, port)
            try:
                proxy_dict = {
                    "http": proxy_url,
                }
                response = requests.get(http_url, proxies=proxy_dict, timeout=timeout)
            except Exception:
                print("invalid ip and port")
                self.delete_ip(ip)
                return False
            else:
                code = response.status_code
                if 200 <= code < 300:
                    print("effective ip")
                    return True
                print("invalid ip and port")
                self.delete_ip(ip)
                return False

        def get_random_ip(self):
            """Return "http://ip:port" for a random working proxy, or None.

            Iterates instead of recursing: the original recursed on every
            failed proxy, which could exhaust the recursion stack, and fell
            off the end (implicit None) when the table was empty — here the
            empty-table case returns None explicitly. Each failed proxy is
            deleted by judge_ip, so the loop terminates.
            """
            random_sql = """
                  SELECT ip, port FROM proxy
                ORDER BY RAND()
                LIMIT 1
                """
            while True:
                cursor.execute(random_sql)
                rows = cursor.fetchall()
                if not rows:
                    return None  # proxy table is empty
                ip, port = rows[0][0], rows[0][1]
                if self.judge_ip(ip, port):
                    return "http://{0}:{1}".format(ip, port)
    
    
    
    if __name__ == "__main__":
        # The original called print(crawl_ips()) at module level, so merely
        # importing this module (e.g. from the Scrapy middleware) kicked off
        # a full crawl of 1568 pages; it also printed crawl_ips()'s return
        # value, which is always None. The crawl now runs only when the
        # module is executed as a script.
        crawl_ips()
        get_ip = GetIP()
        get_ip.get_random_ip()

    在middlewares.py中添加代码如下:

    from tools.crawl_xici_ip import GetIP
    
    
    class RandomProxyMiddleware(object):
        """Scrapy downloader middleware that routes each request through a
        random proxy pulled from the database."""

        def process_request(self, request, spider):
            # A fresh GetIP lookup per outgoing request, exactly as before.
            request.meta["proxy"] = GetIP().get_random_ip()

    在settings.py的DOWNLOADER_MIDDLEWARES中启用该中间件，例如：DOWNLOADER_MIDDLEWARES = {"项目名.middlewares.RandomProxyMiddleware": 543}

  • 相关阅读:
    HDU 3081 Marriage Match II
    HDU 4292 Food
    HDU 4322 Candy
    HDU 4183 Pahom on Water
    POJ 1966 Cable TV Network
    HDU 3605 Escape
    HDU 3338 Kakuro Extension
    HDU 3572 Task Schedule
    HDU 3998 Sequence
    Burning Midnight Oil
  • 原文地址:https://www.cnblogs.com/huwei934/p/7150795.html
Copyright © 2011-2022 走看看