zoukankan      html  css  js  c++  java
  • 爬虫策略

    import time
    import queue
    import random
    import threading
    import requests
    
    class V2ProxyItem(object):
        """A single proxy address plus the earliest time it may be used again.

        After every request the item updates ``start_timestamp`` and hands
        itself back to ``handler`` via ``handler.push(item, code)``:
        code 200 after a successful fetch, code 503 after a failure.
        """

        # User-Agent headers tried in order until one yields HTTP 200.
        # Each entry must be a single-line string: a header value may not
        # contain a line break.
        USER_AGENT_LIST = [
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
        ]

        def __init__(self, handler, proxy, start_timestamp=None):
            """Create a proxy item.

            Args:
                handler: Object exposing ``push(item, code)``; receives this
                    item back after each request.
                proxy: Proxy address passed to ``requests`` for both http and
                    https, or ``None`` for a placeholder item.
                start_timestamp: Epoch seconds before which the proxy should
                    not be used. Defaults to "now".
            """
            self.handler = handler
            self.proxy = proxy
            # Explicit None check so that a caller-supplied 0 is respected.
            if start_timestamp is None:
                start_timestamp = time.time()
            self.start_timestamp = start_timestamp

        def request(self, url):
            """Fetch ``url`` through this proxy.

            Returns:
                ``(True, body)`` on an HTTP 200 response, otherwise
                ``(False, text)`` where ``text`` is the exception message or
                the body of the last non-200 response.
            """
            proxies = {
                "http": self.proxy,
                "https": self.proxy,
            }
            # Body of the most recent non-200 response; stays empty when
            # USER_AGENT_LIST is empty so the final return is always defined.
            last_body = ""
            for agent in self.USER_AGENT_LIST:
                headers = {"User-Agent": agent}
                try:
                    res = requests.get(url=url, headers=headers, proxies=proxies, timeout=10)
                    try:
                        # Read everything we need before releasing the connection.
                        status_code = res.status_code
                        last_body = res.text
                    finally:
                        res.close()
                except Exception as e:
                    # Any failure (connect error, timeout, bad proxy, ...)
                    # benches the proxy for 10 minutes.
                    self.start_timestamp = time.time() + 10 * 60
                    self.handler.push(self, 503)
                    return False, str(e)

                if status_code == 200:
                    # Short random cool-down (2-5 s) before the proxy is reused.
                    self.start_timestamp = time.time() + random.randint(2, 5)
                    self.handler.push(self, 200)
                    return True, last_body

            # No User-Agent produced a 200: bench the proxy for 10 minutes.
            self.start_timestamp = time.time() + 10 * 60
            self.handler.push(self, 503)
            return False, last_body
    
    
    class V2ProxyHandler(object):
        """Pool of proxy items split into a ready queue and a benched queue.

        ``QUEUE_200`` holds items that may be handed out; ``QUEUE_503`` holds
        items that failed and are waiting for their ``start_timestamp`` to
        pass before being moved back.
        """

        # NOTE: these queues are class attributes, so every handler instance
        # shares the same two queues.
        QUEUE_200 = queue.Queue()
        QUEUE_503 = queue.Queue()

        def __init__(self):
            self.lock_status = False
            self.lock = threading.RLock()

        def initial(self, proxy_list):
            """Wrap each address in ``proxy_list`` and put it on the ready queue."""
            for ele in proxy_list:
                self.QUEUE_200.put(V2ProxyItem(self, ele))

        def pop(self):
            """Return the next usable proxy item.

            If the ready queue is empty, benched items whose cool-down has
            expired are first moved back into it. Items on the ready queue
            that are still cooling down are re-queued and skipped.

            Returns:
                A ready item, or a placeholder ``V2ProxyItem`` with
                ``proxy=None`` if nothing arrives on the ready queue within
                10 seconds.
            """
            # qsize is a method; the check must call it (empty() does so).
            # Check, lock, re-check so only one thread performs the refill.
            if self.QUEUE_200.empty():
                with self.lock:
                    if self.QUEUE_200.empty():
                        self.get_503_to_200()

            first_cooling = None
            try:
                while True:
                    item = self.QUEUE_200.get(timeout=10)
                    if item.start_timestamp <= time.time():
                        return item
                    self.push(item, 200)
                    if first_cooling is None:
                        first_cooling = item
                    elif item is first_cooling:
                        # Seen the same cooling item twice, i.e. a full lap
                        # with nothing ready: pause briefly instead of
                        # spinning at full CPU.
                        time.sleep(0.05)
            except queue.Empty:
                return V2ProxyItem(self, None)

        def push(self, item, code):
            """Return ``item`` to the pool; placeholder items (no proxy) are dropped.

            Args:
                item: The proxy item to re-queue.
                code: 200 puts the item on the ready queue; any other value
                    puts it on the benched queue.
            """
            if not item.proxy:
                return
            target = self.QUEUE_200 if code == 200 else self.QUEUE_503
            target.put(item)

        def get_503_to_200(self):
            """Move benched items whose cool-down has expired to the ready queue.

            Only as many items as were queued on entry are examined, so items
            put back on the benched queue are not inspected again in this pass.
            """
            for _ in range(self.QUEUE_503.qsize()):
                try:
                    item = self.QUEUE_503.get(block=False)
                except queue.Empty:
                    # Another consumer drained the queue in the meantime.
                    break
                if item.start_timestamp < time.time():
                    self.QUEUE_200.put(item)
                else:
                    self.QUEUE_503.put(item)
    
    
    # Demo: build a pool from two proxies, take one and fetch a page through it.
    # Guarded so that importing this module does not trigger network traffic.
    if __name__ == "__main__":
        proxy_object = V2ProxyHandler()
        proxy_object.initial(["47.22.1.20:8080", "127.0.0.1:5000"])
        proxy_item_object = proxy_object.pop()
        flag, text = proxy_item_object.request("https://www.amazon.cn/dp/B07XZR8GJZ/ref=lp_1397971071_1_14?s=music-players&ie=UTF8&qid=1599142215&sr=1-14")
        print(flag, text)
  • 相关阅读:
    webpack-dev-server 找不到目录
    Sublime text 实用插件 包推荐
    github for windows 安装
    很有创意的广告
    介绍一个软件SnippetCompiler
    Notepad++ ftp github for windows组合开发php
    c#读取写入文本文件
    什么是临时,什么是长久,什么是永久?
    c#操作xml(读,写)
    php连接mssql数据库的几种方式
  • 原文地址:https://www.cnblogs.com/xuqidong/p/13611227.html
Copyright © 2011-2022 走看看