
    Web Scraping Tips

    Using PyCharm to add quotes to request-header strings

    • Copy the headers that need quoting

    • Paste them into PyCharm and run a regex find-and-replace (see the sketch below)
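
    A common way to do the replace (this exact regex is my assumption; the original does not spell it out): open Replace with Ctrl+R, enable the Regex option, search for (.*?):\s(.*) and replace with '$1': '$2',. For example, header lines copied from the browser's Network panel:

    Host: www.example.com
    Connection: keep-alive

    come out dict-ready after the replace:

    'Host': 'www.example.com',
    'Connection': 'keep-alive',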

    Fixing "Paused in debugger" when debugging a page with Chrome F12

    • Solution: in the DevTools Sources panel, click the "Deactivate breakpoints" icon (or press Ctrl+F8) and resume the script; the page will no longer pause on the site's anti-debugging breakpoints.

    Manually switching the proxy on your machine

    # Sometimes while writing a scraper and analyzing a page, the target site may ban your IP,
    # so the browser can no longer load the page you want to analyze.
    # In that case you can manually switch to a proxy and carry on analyzing the page.
    
    # Proxy sites (search for your own; a few recommendations below)
    - Free proxies:
        - 全网代理IP 	www.goubanjia.com 
        - 快代理     	https://www.kuaidaili.com/
        - 西祠代理   	https://www.xicidaili.com/nn/
        - 代理精灵   	http://http.zhiliandaili.cn/
    
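    Before switching the whole machine over, it is worth sanity-checking a proxy first. A minimal sketch (the httpbin.org echo endpoint is my own choice, and the sample address is a placeholder, not a live proxy):

    import requests

    proxy = {'http': 'http://182.99.184.248:25236'}  # sample address; substitute a live one

    try:
        # httpbin echoes back the requesting IP; if it matches the proxy, the proxy works.
        resp = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5)
        print('Proxy OK, exit IP:', resp.json()['origin'])
    except Exception as e:
        print('Proxy unusable:', e)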

    Building and using a User-Agent pool

    import random

    class Spider(object):
        def __init__(self):
            # Pool of User-Agent strings to rotate through (note the comma between
            # entries; without it Python silently concatenates the two strings).
            self.user_agent = [
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
            ]
            self.headers = {}

        def run(self):
            # Pick a random UA on each run so requests look less uniform.
            self.headers['User-Agent'] = random.choice(self.user_agent)
            print(self.headers)

    obj = Spider()
    obj.run()
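
    In real use you would pass these headers to the request itself, e.g. requests.get(url, headers=self.headers), so each outgoing request carries a randomly chosen User-Agent.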
    

    Reporting how long a program takes

    import time
    import random
    import datetime
    
    def func():
        # Stand-in for real work: sleep a random 1-5 seconds.
        time.sleep(random.randint(1, 5))
        return None
    
    if __name__ == '__main__':
        st = datetime.datetime.now()
        print('{} Task started!'.format(st.strftime('%Y-%m-%d %H:%M:%S')))
        func()
        et = datetime.datetime.now()
        print('{} Task finished! Elapsed: {}'.format(et.strftime('%Y-%m-%d %H:%M:%S'), et - st))
    
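    If several functions need this timing, a small decorator (my own sketch, not from the original post) avoids repeating the boilerplate:

    import time
    import random
    import datetime
    import functools

    def timed(func):
        """Print start/end timestamps and the elapsed time around a call."""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            st = datetime.datetime.now()
            print('{} Task started!'.format(st.strftime('%Y-%m-%d %H:%M:%S')))
            result = func(*args, **kwargs)
            et = datetime.datetime.now()
            print('{} Task finished! Elapsed: {}'.format(et.strftime('%Y-%m-%d %H:%M:%S'), et - st))
            return result
        return wrapper

    @timed
    def func():
        time.sleep(random.randint(1, 5))

    if __name__ == '__main__':
        func()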

    Important request parameters for a scraper

    import requests

    session = requests.session()
    # UA header
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
    }
    # Query-string parameters (use params= for GET; data= is for POST bodies)
    params = {
        "ajaxAction": True
    }
    # Proxy
    proxies = {'http': 'http://182.99.184.248:25236'}  # or {'https': 'http://182.99.184.248:25236'} for HTTPS URLs


    def spider():
        try:
            response = session.get(url='http://www.xxx.com', headers=headers, params=params,
                                   proxies=proxies, timeout=8)  # timeout=8: abort the request after 8 seconds
            if response.status_code == 200:  # status_code is an int, not the string '200'
                return response
            return None
        except Exception as e:
            tb = e.__traceback__
            msg = (f"{e}\n"
                   f"File where the exception occurred: {tb.tb_frame.f_globals['__file__']}\n"
                   f"Line of the error: {tb.tb_lineno}")
            print('No data retrieved!', msg)
            return None
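    Note that timeout only bounds a single attempt; retrying on failure is a separate concern. A minimal retry wrapper around spider() (my own sketch; the three attempts and two-second pause are arbitrary):

    import time

    def spider_with_retry(attempts=3, wait=2):
        """Call spider() up to `attempts` times, pausing `wait` seconds between tries."""
        for _ in range(attempts):
            response = spider()
            if response is not None:
                return response
            time.sleep(wait)
        return None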

    Adding one day to a date string

    import datetime

    time_str = '2020-08-01'
    # Parse the string into a datetime, add one day, then format it back.
    date = datetime.datetime.strptime(time_str, '%Y-%m-%d')
    # print(date)  # 2020-08-01 00:00:00
    new_date = date + datetime.timedelta(days=1)
    expect_time = new_date.strftime('%Y-%m-%d')
    # print(expect_time)  # 2020-08-02
    
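    Because the arithmetic happens on a datetime object rather than on the string, timedelta handles month and year rollovers automatically: '2020-08-31' plus one day yields '2020-09-01'.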

    Thread pools and running a program at a fixed interval

    import time
    import datetime
    import threading
    from concurrent.futures import ThreadPoolExecutor

    import requests
    from spider import MM  # scraper class
    from spider.proxy import get_proxyList  # proxy-pool refresher


    def get_task_information():
        """Fetch the list of query tasks."""
        try:
            url = 'http://www.xxx.com'
            rep = requests.get(url).json()
            return rep
        except Exception as e:
            print(e)
            return None


    def save_function(future):
        """Done-callback used to save the data a worker produced."""
        try:
            data = future.result()
            if data:
                url = 'http://www.xoo.com'
                rep = requests.post(url, data=data).text
                print(rep, 'Data saved!')
            else:
                return None
        except Exception as e:
            print(e)
            return None


    def execute_function(task):
        """Worker function: run the scraper for one task."""
        try:
            obj = MM()  # your own scraper class
            data = obj.run(obj.delay_info, task)
            return data
        except Exception as e:
            print(e)
            return None


    def main_function():
        """Main program."""
        try:
            response = get_task_information()
            if response and response.get('result'):
                task_list = [{'task': i} for i in response.get('result')]
                # Leaving the with-block waits for every submitted task;
                # the results themselves are handled by the done-callbacks.
                with ThreadPoolExecutor(max_workers=30) as tp:
                    for task in task_list:
                        tp.submit(execute_function, task).add_done_callback(save_function)
                return 'All tasks processed!'
        except Exception as e:
            print(e)
            return None


    t = threading.Thread(target=get_proxyList)
    t.start()
    while True:
        # A finished Thread object cannot be restarted; create a fresh one instead.
        if not t.is_alive():
            t = threading.Thread(target=get_proxyList)
            t.start()
        try:
            st = datetime.datetime.now()
            print('{} Task started!'.format(st.strftime('%Y-%m-%d %H:%M:%S')))
            main_function()
            et = datetime.datetime.now()
            print('{} Task finished! Elapsed: {}'.format(et.strftime('%Y-%m-%d %H:%M:%S'), et - st))
            time.sleep(300)  # wait five minutes before the next round
        except BaseException as e:
            print(e)
    
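    Since MM and get_proxyList are project-specific imports, here is a self-contained sketch of the same submit-plus-done-callback pattern (the sleeping worker stands in for real scraping work):

    import time
    import random
    from concurrent.futures import ThreadPoolExecutor

    def worker(task):
        # Stand-in for the real scraping work.
        time.sleep(random.random())
        return {'task': task, 'status': 'done'}

    def on_done(future):
        # Runs as soon as the corresponding task finishes.
        print('saved:', future.result())

    with ThreadPoolExecutor(max_workers=5) as tp:
        for task in range(10):
            tp.submit(worker, task).add_done_callback(on_done)
    # Exiting the with-block waits for every task and callback to finish.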
    Author: 郭楷丰
    Original post: https://www.cnblogs.com/guokaifeng/p/11860436.html