zoukankan      html  css  js  c++  java
  • 爬虫之爬取B站关键字

    # import  requests
    # r = requests.get('http://192.168.8.176:8089/mappuat_sit/user/login?loginNo=13068702992&loginPassword=1111112')
    # print(r.text)
    import requests
    import json
    import time
    # Search API endpoint.
    url = "https://api.bilibili.com/x/web-interface/search/all/v2"

    # Request headers: a desktop-browser user agent plus a referer that points
    # at the first result page of the same endpoint.
    headers = {
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/78.0.3904.97 Safari/537.36"
        ),
        "referer": url + (
            "?context=&page=1&order=&keyword=%E7%BC%96%E7%A8%8B"
            "&duration=&tids_1=&tids_2=&__refresh__=true"
            "&highlight=1&single_column=0&jsonp=jsonp&callback=__jp2"
        ),
    }
    
    # Query-string parameters sent with every search request.
    # "page" is overwritten by the driver loop before each request.
    parameters = dict(
        context="",
        page="3",
        order="",
        keyword="Django",
        duration="",
        tids_1="",
        tids_2="",
        __refresh__="true",
        search_type="video",
        highlight="1",
        single_column="0",
        jsonp="jsonp",
        callback="__jp1",
    )
    
    # 格式化时间戳 转换成时间格式
    def Transformation_time(pud_times):
        """Render a Unix timestamp as ``YYYY-MM-DD HH:MM:SS`` in local time.

        Args:
            pud_times: Seconds since the epoch, as accepted by ``time.localtime``.

        Returns:
            The formatted date-time string.
        """
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(pud_times))
    
    # 去除数据中的多余的杂乱字符
    def data_fromat(data):
        """Strip separator characters and keyword markup from a string.

        Removes every ``#``, ``;``, ``,``, space, ``【``, ``】``, newline, tab and
        carriage return, and then the ``<em class="keyword">`` / ``</em>`` tags.

        Args:
            data: Text to clean (the caller passes titles and descriptions).

        Returns:
            The cleaned string.
        """
        # NOTE(review): the published listing had raw line breaks inside two of
        # these literals, which is a syntax error. They are restored here as the
        # "\n" and "\r" escapes -- confirm the second one against the original.
        limit = (
            "#", ";", ",", " ", "【", "】", "\n", "\t", "\r",
            # Spaces are removed before this entry is reached, so the opening
            # tag has already collapsed to '<emclass=...' when it is matched.
            '<emclass="keyword">', "</em>",
        )
        # Replace each unwanted token with the empty string, in order.
        for token in limit:
            data = data.replace(token, "")
        return data
    
    
    # # 筛选数据并写入
    def screening_data(data, output_path="d:/bilibili_programming11.txt"):
        """Append the entries of one search-result page to a text file.

        Each entry is written as one tab-separated line (uploader, URL, title,
        description, play count, danmaku count, favorites, tags, comment count,
        publish date) followed by a second line holding the duration.

        Args:
            data: Decoded JSON response of the search API.
            output_path: File to append to; defaults to the path the script
                has always used.

        Raises:
            KeyError, IndexError: If the response does not have the expected
                ``data -> result -> [8] -> data`` layout or an entry lacks a field.
        """
        # All entries of the page live in this list.
        # NOTE(review): index 8 is hard-coded; presumably the position of the
        # video section in the API's result array -- confirm against the API.
        information = data["data"]["result"][8]["data"]
        # Open the file once per page instead of once per entry.
        with open(output_path, "a", encoding="utf-8") as file:
            for item in information:
                record = "\t".join([
                    "up主:" + item["author"],
                    "url:" + item["arcurl"],
                    # Title and description go through data_fromat to drop
                    # separators and markup that would break the line layout.
                    "标题:" + data_fromat(item["title"]),
                    "描述:" + data_fromat(item["description"]),
                    "播放量:" + str(item["play"]),
                    "弹幕量:" + str(item["video_review"]),
                    "收藏量:" + str(item["favorites"]),
                    "标签:" + item["tag"],
                    "评论量:" + str(item["review"]),
                    # The publish timestamp is converted to a readable date-time.
                    "发布日期:" + Transformation_time(item["pubdate"]),
                ])
                file.write(record + "\n")
                file.write("时长(分):" + str(item["duration"]) + "\n")
    
    if __name__ == '__main__':
        # Walk result pages 1..50, pausing three seconds between requests.
        for i in range(1, 51):
            if i >= 2:
                # From the second page on, present the previous page as the referer.
                headers["referer"] = "https://api.bilibili.com/x/web-interface/search/all/v2?context=&page="+str(i-1)+"&order=&keyword=%E7%BC%96%E7%A8%8B&duration=&tids_1=&tids_2=&__refresh__=true&highlight=1&single_column=0&jsonp=jsonp&callback=__jp2"
            parameters["page"] = i
            # A timeout keeps one stalled connection from hanging the whole crawl.
            response = requests.get(url, params=parameters, headers=headers, timeout=10)
            response.raise_for_status()
            # The response is expected to be JSONP: callback(<json>). Strip only
            # that wrapper; deleting every ")" in the text would also remove
            # parentheses inside titles and descriptions.
            body = response.text.strip()
            prefix = parameters["callback"] + "("
            if body.startswith(prefix) and body.endswith(")"):
                body = body[len(prefix):-1]
            datas = json.loads(body)
            screening_data(datas)
            time.sleep(3)
    

      

  • 相关阅读:
    Leetcode665.Non-decreasing Array非递减数组
    在MyEclipse中把多行代码用一快捷键注释掉
    struts2中addFieldError()方法
    [bzoj2588][Spoj10628]Count on a tree_主席树
    [bzoj3123][Sdoi2013]森林_主席树_启发式合并
    [bzoj1500][NOI2005]维修数列_非旋转Treap
    [bzoj1452][JSOI2009]Count_树状数组
    [bzoj1369][Baltic2003]Gem_树形dp_结论题
    [bzoj1195][HNOI2006]最短母串_动态规划_状压dp
    [bzoj2242][Sdoi2011]计算器_exgcd_BSGS
  • 原文地址:https://www.cnblogs.com/w770762632/p/13100575.html
Copyright © 2011-2022 走看看