zoukankan      html  css  js  c++  java
  • 异步爬取爱卡汽车论坛信息

    一、获取各车型论坛对应的fid

    import asyncio
    import time
    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin
    import json
    import re
    
    # Browser-like request headers sent with the forum index request.
    headers = {
        "Host": "www.xcar.com.cn",
        "Pragma": "no-cache",
        "Proxy-Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    }


    # Download the forum index page. The timeout keeps a stalled connection
    # from hanging the script, and raise_for_status() stops an error page from
    # being parsed as if it were the real index.
    res = requests.get(r"https://www.xcar.com.cn/bbs/", headers=headers, timeout=10)
    res.raise_for_status()
    # apparent_encoding can be None when detection fails, so fall back to UTF-8.
    # errors="replace" keeps one undecodable byte from aborting the whole run.
    html = res.content.decode(res.apparent_encoding or "utf-8", errors="replace")

    soup = BeautifulSoup(html, 'lxml')
    span_list = soup.find_all("span", id="w959")
    # Stop at the next query separator so trailing parameters (or a fragment)
    # are not swallowed into the fid.
    fid_pattern = re.compile(r"fid=([^&#]+)")
    car_ids = {}

    for span in span_list:
        link = span.find("a")
        if link is None:
            # Span without an anchor: nothing to extract.
            continue
        match = fid_pattern.search(link.attrs.get('href') or "")
        if match is None:
            # Anchor whose href is missing or carries no fid parameter.
            continue
        fid = match.group(1)
        name = link.text
        car_ids[fid] = name

    # Save the fid -> name mapping as a local JSON file for later use.
    car_ids_str = json.dumps(car_ids, ensure_ascii=False)
    with open("car.json", "w", encoding="utf-8") as f:
        f.write(car_ids_str)

    二、使用asyncio异步爬取论坛信息

    # Request headers that imitate a desktop Chrome browser.
    headers = {
        "Host": "www.xcar.com.cn",
        "Pragma": "no-cache",
        "Proxy-Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    }

    # Read the JSON document stored in car.json into car_ids.
    with open("car.json", "rb") as car_file:
        car_ids = json.load(car_file)
    
    # Asynchronous crawl function
    async def more_details(fid, page):
        """Fetch one page of a forum's thread list and return the decoded JSON.

        ``requests`` is a blocking library, so the HTTP call is handed to the
        event loop's default executor. Awaiting it lets the requests for the
        other pages make progress at the same time instead of one by one.

        Args:
            fid: Forum id, sent as the ``fid`` query parameter.
            page: Page number, sent as the ``page`` query parameter.

        Returns:
            The response body as parsed by ``Response.json()``.

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                non-success HTTP status.
            ValueError: If the response body is not valid JSON.
        """
        url = r"https://www.xcar.com.cn/bbs/xbbsapi/forumdisplay/get_thread_list.php"
        params = {
            "fid": fid,
            "orderby": "lastpost",
            "filter": "",
            "ondigest": "0",
            "page": page,
            # Current time in whole milliseconds (presumably a cache-busting
            # value; sent as an integer rather than a float string).
            "_": str(int(time.time() * 1000)),
        }

        def _fetch():
            # Runs in a worker thread; the timeout bounds how long it can block.
            res = requests.get(url=url, params=params, headers=headers, timeout=10)
            res.raise_for_status()
            return res.json()

        # Inside a coroutine get_event_loop() returns the running loop.
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, _fetch)
    
    def run(car_name, max_page=49):
        """Crawl and print the thread-list pages of every forum matching a name.

        Every entry of ``car_ids`` whose value contains ``car_name`` is crawled.
        For each match, pages 1 through ``max_page`` are requested concurrently
        via ``more_details`` and each result is printed in page order.

        Args:
            car_name: Substring to look for in the values of ``car_ids``.
            max_page: Last page number to request (inclusive). Defaults to 49,
                the range the original script used.

        Raises:
            Exception: Whatever ``more_details`` raises for a failed page is
                propagated by ``asyncio.gather``.
        """
        matched_fids = [fid for fid, name in car_ids.items() if car_name in name]
        if not matched_fids:
            return

        async def _fetch_all(car_id):
            # gather() returns results in argument order, i.e. by page number.
            return await asyncio.gather(
                *(more_details(car_id, page) for page in range(1, max_page + 1))
            )

        # One private loop serves every matched forum and is closed exactly
        # once at the end; closing it per match broke the second iteration.
        loop = asyncio.new_event_loop()
        try:
            for car_id in matched_fids:
                for res in loop.run_until_complete(_fetch_all(car_id)):
                    print(res)
        finally:
            loop.close()
    run("奥迪A4L")
  • 相关阅读:
    剑指Offer-Python(6-10)
    Python对MySQL进行增删查改
    剑指Offer-Python(1-5)
    转载:Python中collections模块
    读取单词文件并查某个单词出现的次数
    Python正则表达式-换行的匹配
    Python爬虫-换行的匹配
    转载:Pycharm的常用快捷键
    Python 正则表达式
    Python的类与对象
  • 原文地址:https://www.cnblogs.com/angelyan/p/14216566.html
Copyright © 2011-2022 走看看