zoukankan      html  css  js  c++  java
  • IP_POOL

    #!/usr/bin/python
    # -*-coding:utf-8-*-
    """
    @file : demo.py
    @time: 2017/11/15 15:58
    """


    from multiprocessing import Process

    import requests
    import redis
    import time
    try:
    from aiohttp.errors import ProxyConnectionError,ServerDisconnectedError,ClientResponseError,ClientConnectorError
    except:
    from aiohttp import ClientProxyConnectionError as ProxyConnectionError,ServerDisconnectedError,ClientResponseError,ClientConnectorError
    from fake_useragent import UserAgent
    import re


# Redis connection settings for the proxy pool backend.
HOST = 'localhost'
PORT = 6379

# URL fetched through each candidate proxy to decide whether it works.
TEST_API = 'http://www.baidu.com'


POOL_LOWER_THRESHOLD = 300 # below this size the crawler is triggered to refill the pool; above it, wait
POOL_UPPER_THRESHOLD = 10000 # crawl loop stops once the pool grows past this size

POOL_LEN_CHECK_CYCLE = 10 # seconds between pool-size checks
VALID_CHECK_CYCLE = 5*60 # seconds between proxy-validity re-checks


    def get_page(url, options={}):
    ua = UserAgent()
    base_headers = {
    'User-Agent': ua.random,
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    headers = dict(base_headers, **options)
    print('Getting', url)
    try:
    r = requests.get(url, headers=headers)
    if r.status_code == 200:
    return r.text
    except ConnectionError:
    print('Crawling Failed', url)
    return None


    class ResourceDepletionError(Exception):

    def __init__(self):
    Exception.__init__(self)

    def __str__(self):
    return repr('The proxy source is exhausted')


    class FreeProxyGetter(object):

    def get_raw_proxies(self):
    proxies = []

    start_url = 'http://localhost:8080'
    html = get_page(start_url)
    ip_address = re.compile(r'd+.d+.d+.d+:d+')
    re_ip_address = ip_address.findall(html)
    for address in re_ip_address:
    proxies.append(address)

    return proxies


    class RedisClient(object):
    def __init__(self, host=HOST, port=PORT):
    self._db = redis.Redis(host=host, port=port)

    def get(self, count=1):
    proxies = self._db.lrange("proxies", 0, count-1)
    self._db.ltrim("proxies", count, -1) # ltrim对一个列表进行修剪,让列表只保留指定区间内的元素,不在指定区间之内的元素都将被删除
    return proxies

    def put(self, proxy):
    self._db.rpush("proxies", proxy)

    @property
    def queue_len(self):
    return self._db.llen("proxies")


    class PoolAdder(object):

    def __init__(self, threshold):
    self._threshold = threshold # 极限,临界值
    self._conn = RedisClient()
    self._tester = ValidityTester()
    self._crawler = FreeProxyGetter()

    def is_over_threshold(self):
    if self._conn.queue_len >= self._threshold:
    return True
    else:
    return False

    def add_to_queue(self):
    proxy_count = 0
    while not self.is_over_threshold():
    raw_proxies = self._crawler.get_raw_proxies() # return proxies
    self._tester.set_raw_proxies(raw_proxies)
    self._tester.test()
    proxy_count += len(raw_proxies)
    if proxy_count == 0:
    raise ResourceDepletionError


    class ValidityTester(object):
    test_api = TEST_API

    def __init__(self):
    self._raw_proxies = None
    self._usable_proxies = []

    def set_raw_proxies(self, proxies):
    self._raw_proxies = proxies
    self._conn = RedisClient()

    def test(self):
    for proxy in self._raw_proxies:
    if isinstance(proxy, bytes):
    proxy = proxy.decode('utf-8')
    real_proxy = 'http://' + proxy
    response = requests.get(self.test_api, proxies={'http:':real_proxy})
    if response.status_code == 200:
    self._conn.put(proxy)
    print('Valid proxy', proxy)
    else:
    print('Invalid proxy', proxy)


    class Schedule(object):
    @staticmethod
    def valid_proxy(cycle=VALID_CHECK_CYCLE):
    conn = RedisClient()
    tester = ValidityTester()

    count = int(0.5 * conn.queue_len)
    if count == 0:
    print('Waiting for adding')
    time.sleep(cycle)

    raw_proxies = conn.get(count=count) # ***

    tester.set_raw_proxies(raw_proxies)
    tester.test()
    time.sleep(cycle) # sleep 10s

    @staticmethod
    def check_pool(upper_threshold=POOL_UPPER_THRESHOLD,
    lower_threshold=POOL_LOWER_THRESHOLD,
    cycle=POOL_LEN_CHECK_CYCLE):
    conn = RedisClient()
    adder = PoolAdder(upper_threshold)
    num = 1
    while True:
    if conn.queue_len < lower_threshold:
    adder.add_to_queue()
    time.sleep(cycle)
    num = num + 1

    def run(self):
    print('IP processing running')
    valid_process = Process(target=Schedule.valid_proxy)
    check_process = Process(target=Schedule.check_pool)
    valid_process.start()
    check_process.start()


    def main():
    s = Schedule()
    s.run()


    if __name__ == '__main__':
    main()
  • 相关阅读:
    node.js(八 --- express)
    node.js(六 --- 文件系统模块fs)
    node.js( 五 --- 常用工具util)
    node.js(四 --- 全局对象)
    python 判断变量是否存在 防止报错
    python requests 的cookie 操作
    DDOS 攻击的防范
    python图片识别
    php常见问题-foreach和引用造成的问题。
    数据库数据类型选择
  • 原文地址:https://www.cnblogs.com/liyugeng/p/7845567.html
Copyright © 2011-2022 走看看