
    Web Scraping Tips

    Using PyCharm to add quotes to request-header strings

    • Copy the headers that need quoting

    • Paste them into PyCharm and run a regex find-and-replace (see the sketch below)
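
    A common way to do the replace (this exact regex is my assumption; the original does not spell it out): open Replace with Ctrl+R, enable the Regex option, search for (.*?):\s(.*) and replace with '$1': '$2',. For example, header lines copied from the browser's Network panel:

    Host: www.example.com
    Connection: keep-alive

    come out dict-ready after the replace:

    'Host': 'www.example.com',
    'Connection': 'keep-alive',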

    Fixing "Paused in debugger" when debugging a page with Chrome F12

    • Solution: in the DevTools Sources panel, click the "Deactivate breakpoints" icon (or press Ctrl+F8) and resume the script; the page will no longer pause on the site's anti-debugging breakpoints.

    Manually switching the proxy on your machine

    # Sometimes while writing a scraper and analyzing a page, the target site may ban your IP,
    # so the browser can no longer load the page you want to analyze.
    # In that case you can manually switch to a proxy and carry on analyzing the page.
    
    # Proxy sites (search for your own; a few recommendations below)
    - Free proxies:
        - 全网代理IP 	www.goubanjia.com 
        - 快代理     	https://www.kuaidaili.com/
        - 西祠代理   	https://www.xicidaili.com/nn/
        - 代理精灵   	http://http.zhiliandaili.cn/
    
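    Before switching the whole machine over, it is worth sanity-checking a proxy first. A minimal sketch (the httpbin.org echo endpoint is my own choice, and the sample address is a placeholder, not a live proxy):

    import requests

    proxy = {'http': 'http://182.99.184.248:25236'}  # sample address; substitute a live one

    try:
        # httpbin echoes back the requesting IP; if it matches the proxy, the proxy works.
        resp = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5)
        print('Proxy OK, exit IP:', resp.json()['origin'])
    except Exception as e:
        print('Proxy unusable:', e)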

    Building and using a User-Agent pool

    import random

    class Spider(object):
        def __init__(self):
            # Pool of User-Agent strings to rotate through (note the comma between
            # entries; without it Python silently concatenates the two strings).
            self.user_agent = [
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
            ]
            self.headers = {}

        def run(self):
            # Pick a random UA on each run so requests look less uniform.
            self.headers['User-Agent'] = random.choice(self.user_agent)
            print(self.headers)

    obj = Spider()
    obj.run()
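
    In real use you would pass these headers to the request itself, e.g. requests.get(url, headers=self.headers), so each outgoing request carries a randomly chosen User-Agent.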
    

    Reporting how long a program takes

    import time
    import random
    import datetime
    
    def func():
        # Stand-in for real work: sleep a random 1-5 seconds.
        time.sleep(random.randint(1, 5))
        return None
    
    if __name__ == '__main__':
        st = datetime.datetime.now()
        print('{} Task started!'.format(st.strftime('%Y-%m-%d %H:%M:%S')))
        func()
        et = datetime.datetime.now()
        print('{} Task finished! Elapsed: {}'.format(et.strftime('%Y-%m-%d %H:%M:%S'), et - st))
    
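    If several functions need this timing, a small decorator (my own sketch, not from the original post) avoids repeating the boilerplate:

    import time
    import random
    import datetime
    import functools

    def timed(func):
        """Print start/end timestamps and the elapsed time around a call."""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            st = datetime.datetime.now()
            print('{} Task started!'.format(st.strftime('%Y-%m-%d %H:%M:%S')))
            result = func(*args, **kwargs)
            et = datetime.datetime.now()
            print('{} Task finished! Elapsed: {}'.format(et.strftime('%Y-%m-%d %H:%M:%S'), et - st))
            return result
        return wrapper

    @timed
    def func():
        time.sleep(random.randint(1, 5))

    if __name__ == '__main__':
        func()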

    Important request parameters for a scraper

    import requests

    session = requests.session()
    # UA header
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
    }
    # Query-string parameters (use params= for GET; data= is for POST bodies)
    params = {
        "ajaxAction": True
    }
    # Proxy
    proxies = {'http': 'http://182.99.184.248:25236'}  # or {'https': 'http://182.99.184.248:25236'} for HTTPS URLs


    def spider():
        try:
            response = session.get(url='http://www.xxx.com', headers=headers, params=params,
                                   proxies=proxies, timeout=8)  # timeout=8: abort the request after 8 seconds
            if response.status_code == 200:  # status_code is an int, not the string '200'
                return response
            return None
        except Exception as e:
            tb = e.__traceback__
            msg = (f"{e}\n"
                   f"File where the exception occurred: {tb.tb_frame.f_globals['__file__']}\n"
                   f"Line of the error: {tb.tb_lineno}")
            print('No data retrieved!', msg)
            return None
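    Note that timeout only bounds a single attempt; retrying on failure is a separate concern. A minimal retry wrapper around spider() (my own sketch; the three attempts and two-second pause are arbitrary):

    import time

    def spider_with_retry(attempts=3, wait=2):
        """Call spider() up to `attempts` times, pausing `wait` seconds between tries."""
        for _ in range(attempts):
            response = spider()
            if response is not None:
                return response
            time.sleep(wait)
        return None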

    Adding one day to a date string

    import datetime

    time_str = '2020-08-01'
    # Parse the string into a datetime, add one day, then format it back.
    date = datetime.datetime.strptime(time_str, '%Y-%m-%d')
    # print(date)  # 2020-08-01 00:00:00
    new_date = date + datetime.timedelta(days=1)
    expect_time = new_date.strftime('%Y-%m-%d')
    # print(expect_time)  # 2020-08-02
    
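    Because the arithmetic happens on a datetime object rather than on the string, timedelta handles month and year rollovers automatically: '2020-08-31' plus one day yields '2020-09-01'.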

    Thread pools and running a program at a fixed interval

    import time
    import datetime
    import threading
    from concurrent.futures import ThreadPoolExecutor

    import requests
    from spider import MM  # scraper class
    from spider.proxy import get_proxyList  # proxy-pool refresher


    def get_task_information():
        """Fetch the list of query tasks."""
        try:
            url = 'http://www.xxx.com'
            rep = requests.get(url).json()
            return rep
        except Exception as e:
            print(e)
            return None


    def save_function(future):
        """Done-callback used to save the data a worker produced."""
        try:
            data = future.result()
            if data:
                url = 'http://www.xoo.com'
                rep = requests.post(url, data=data).text
                print(rep, 'Data saved!')
            else:
                return None
        except Exception as e:
            print(e)
            return None


    def execute_function(task):
        """Worker function: run the scraper for one task."""
        try:
            obj = MM()  # your own scraper class
            data = obj.run(obj.delay_info, task)
            return data
        except Exception as e:
            print(e)
            return None


    def main_function():
        """Main program."""
        try:
            response = get_task_information()
            if response and response.get('result'):
                task_list = [{'task': i} for i in response.get('result')]
                # Leaving the with-block waits for every submitted task;
                # the results themselves are handled by the done-callbacks.
                with ThreadPoolExecutor(max_workers=30) as tp:
                    for task in task_list:
                        tp.submit(execute_function, task).add_done_callback(save_function)
                return 'All tasks processed!'
        except Exception as e:
            print(e)
            return None


    t = threading.Thread(target=get_proxyList)
    t.start()
    while True:
        # A finished Thread object cannot be restarted; create a fresh one instead.
        if not t.is_alive():
            t = threading.Thread(target=get_proxyList)
            t.start()
        try:
            st = datetime.datetime.now()
            print('{} Task started!'.format(st.strftime('%Y-%m-%d %H:%M:%S')))
            main_function()
            et = datetime.datetime.now()
            print('{} Task finished! Elapsed: {}'.format(et.strftime('%Y-%m-%d %H:%M:%S'), et - st))
            time.sleep(300)  # wait five minutes before the next round
        except BaseException as e:
            print(e)
    
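    Since MM and get_proxyList are project-specific imports, here is a self-contained sketch of the same submit-plus-done-callback pattern (the sleeping worker stands in for real scraping work):

    import time
    import random
    from concurrent.futures import ThreadPoolExecutor

    def worker(task):
        # Stand-in for the real scraping work.
        time.sleep(random.random())
        return {'task': task, 'status': 'done'}

    def on_done(future):
        # Runs as soon as the corresponding task finishes.
        print('saved:', future.result())

    with ThreadPoolExecutor(max_workers=5) as tp:
        for task in range(10):
            tp.submit(worker, task).add_done_callback(on_done)
    # Exiting the with-block waits for every task and callback to finish.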
    Author: 郭楷丰
    Original post: https://www.cnblogs.com/guokaifeng/p/11860436.html