zoukankan      html  css  js  c++  java
  • 今日头条app数据爬虫demo

    import json
    import time
    from urllib.parse import quote
    from urllib import request
    import requests
    """
    1.综合
    2.视频
    3.资讯
    4.小视频
    5.图片
    6.用户
    7.音乐
    8.问答
    9.微头条
    10.话题
    """
    # Query-string fragments that select a search tab. The position of an
    # entry in this list is the `tab` index that queryList() expects.
    tab_list = [
        "pd=synthesis&from=search_tab",    # general (all results)
        "pd=video&from=video",             # video
        "pd=information&from=news",        # news
        "pd=xiaoshipin&from=xiaoshipin",   # short video
        "pd=atlas&from=gallery",           # pictures
        "pd=user&from=media",              # users
        "pd=music&from=music",             # music
        "pd=question&from=question",       # Q&A
        "pd=weitoutiao&from=weitoutiao",   # micro-headlines
        "pd=huati&from=huati",             # topics
    ]

    # Request headers shared by every HTTP call in this script; the
    # User-Agent is written as adjacent literals that join into one string.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Linux; U; Android 6.0.1; zh-cn; MI 5s Build/MXB48T) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 "
            "Chrome/53.0.2785.146 Mobile Safari/537.36 "
            "XiaoMi/MiuiBrowser/8.7.1"
        ),
    }
    
    #keyWords关键词,page分页,tab模块
    def queryList(keyWords,page,tab):
        """Run one search request and return the decoded JSON response.

        Args:
            keyWords: Search keyword (plain, un-encoded text).
            page: 1-based page number; converted to ``offset = (page - 1) * 10``.
            tab: Index into the module-level ``tab_list`` choosing the search tab.

        Returns:
            The object produced by ``json.loads`` on the response body.

        Raises:
            IndexError: If ``tab`` is not a valid index into ``tab_list``.
            ValueError: If the response body is not valid JSON.
            requests.RequestException: On network failure or timeout.
        """
        # The keyword is a single query-parameter value, so every reserved
        # character must be escaped. Leaving "&", "=" or "+" as-is (the former
        # `safe` set) let a keyword such as "a&b" or "C++" corrupt the query.
        keyWords = quote(keyWords, safe="", encoding="utf-8")
        # seconds, milliseconds
        time_second, time_second_min = get_time()
        count = 10
        offset = (page - 1) * count
        tab_str = tab_list[tab]
        # NOTE(review): iid/device_id/openudid/as/mas and the ab_version list
        # are fixed values, presumably captured from a real app session --
        # confirm they are still accepted by the server.
        url = (
            "http://ic.snssdk.com/api/search/content/?qc_query=&offset=" + str(offset)
            + "&action_type=input_keyword_search&has_count=&is_from_native=1&count=" + str(count)
            + "&format=json&source=input&keyword_type=&search_id=&search_position=search_bar&" + tab_str
            + "&keyword=" + str(keyWords)
            + "&from_search_subtab=1&iid=57820401425&device_id=54550815314&ac=wifi&channel=xiaomi&aid=13&app_name=news_article&version_code=707&version_name=7.0.7&device_platform=android&ab_version="
            + "475404%2C680425%2C687252%2C684578%2C571130%2C665173%2C674056%2C639003%2C612193%2C691933%2C170988%2C643891%2C374117%2C687462%2C688267%2C655402%2C702095%2C613176%2C550042%2C686297%2C690816%2C687745%2C690975%2C649426%2C614097%2C677129%2C685523%2C522766%2C701302%2C416055%2C684977%2C703944%2C689886%2C693247%2C558140%2C586260%2C555254%2C471406%2C603441%2C700492%2C596392%2C660510%2C598626%2C701730%2C700540%2C686885%2C701724%2C677898%2C603383%2C603401%2C603403%2C603405%2C638928%2C699227%2C696109%2C703265%2C686031%2C661904%2C662644%2C703737%2C668775%2C673945%2C692060%2C693468%2C629151%2C645714%2C607361%2C609338%2C666965%2C698916%2C635529%2C669649%2C662099%2C696796%2C701078%2C693364%2C703077%2C697038%2C703339%2C689538%2C697022%2C668774%2C683805%2C698097%2C698380%2C688105%2C554836%2C694759%2C549647%2C699616%2C31240%2C572465%2C656568%2C644058%2C615291%2C606547%2C681183%2C703370%2C673168%2C702884%2C671426%2C546701%2C702195%2C641190%2C281297%2C678046%2C325620%2C678477%2C665474%2C696624%2C669034%2C700459%2C625065%2C652953%2C696373%2C696990%2C698915%2C693900%2C703230%2C680284%2C638336%2C467514%2C679100%2C697663%2C702714%2C702994%2C699109%2C702878%2C699036%2C595556%2C697759%2C702757%2C670151%2C661453%2C654127%2C698630%2C660830%2C688723%2C690189%2C691671%2C686376%2C699478%2C677774%2C697104%2C700437%2C486951%2C701439%2C662176%2C662350%2C633486%2C662684%2C661781%2C457480%2C649403%2C655988%2C648317%2C654049"
            + "&ab_client=a1%2Cc4%2Ce1%2Cf1%2Cg2%2Cf7&ab_feature=94563%2C102749&abflag=3&ssmix=a&device_type=MI+8&device_brand=Xiaomi&language=zh&os_api=27&os_version=8.1.0&openudid=1a16ce94f2005274&manifest_version_code=707&resolution=1080*2118&dpi=440&update_version_code=70714&_rticket=" + str(time_second_min)
            + "&plugin=26958&fp=9lT_FSDqFYPZFlwIFrU1FYwIPM4q&tma_jssdk_version=1.10.3.4&rom_version=miui_v10_8.8.31&ts=" + str(time_second)
            + "&as=a2c555b4d565fcd9004533&mas=005bc89b119dd3e1d3f552f76df48fc2a6f6cdc4e4660e08ab"
        )
        response = requests.post(url=url, timeout=100, headers=headers)
        response_str = str(response.content, encoding="utf-8")
        # Echo the raw body so the run can be inspected on the console.
        print(response_str)
        result_json = json.loads(response_str)
        return result_json
    
    def test():
        """Send one fixed, pre-recorded search request and print the raw reply.

        Manual smoke test: the URL (keyword, offset, timestamps and signature
        parameters) is entirely hard-coded, so this takes no arguments and
        returns nothing.
        """
        # The URL is written as adjacent literals that join into a single
        # string; a raw line break inside one literal is a syntax error.
        url = (
            "http://ic.snssdk.com/api/search/content/?qc_query=&offset=10&action_type=input_keyword_search&has_count=&is_from_native=1&count=10&format=json&source=input&keyword_type=&search_id=&search_position=search_bar&pd=information&from=news&keyword=%E5%8D%8E%E4%B8%BA&from_search_subtab=3&iid=57820401425&device_id=54550815314&ac=wifi&channel=xiaomi&aid=13&app_name=news_article&version_code=707&version_name=7.0.7&device_platform=android&ab_version="
            "690189%2C691671%2C686376%2C699478%2C677774%2C697104%2C705593%2C700437%2C486951%2C705039%2C662176%2C475404%2C680425%2C684578%2C571130%2C665173%2C674056%2C639003%2C612193%2C691933%2C170988%2C643891%2C374117%2C687462%2C688267%2C655402%2C702095%2C613176%2C550042%2C686297%2C690816%2C687745%2C705603%2C649426%2C614097%2C677129%2C685523%2C522766%2C701302%2C704639%2C416055%2C684977%2C703944%2C689886%2C693247%2C558140%2C586260%2C471406%2C603441%2C700330%2C596392%2C660510%2C598626%2C701730%2C700540%2C686885%2C701724%2C677898%2C603383%2C603401%2C603403%2C603405%2C638928%2C699227%2C696109%2C703265%2C686031%2C661904%2C662644%2C703737%2C668775%2C673945%2C692060%2C693468%2C629151%2C645714%2C607361%2C609338%2C666965%2C698916%2C705212%2C635529%2C669649%2C662099%2C696796%2C701078%2C705329%2C703077%2C697038%2C704409%2C705653%2C703339%2C704524%2C689538%2C697022%2C668774%2C683805%2C698097%2C698380%2C688105%2C554836%2C694759%2C549647%2C699616%2C31240%2C572465%2C656568%2C644058%2C615291%2C606547%2C681183%2C703370%2C673168%2C702884%2C546701%2C702195%2C641190%2C281297%2C678046%2C325620%2C678477%2C665474%2C696624%2C669034%2C625065%2C652953%2C696373%2C696990%2C698915%2C700040%2C703230%2C680284%2C638336%2C467514%2C679100%2C704230%2C702714%2C705221%2C699109%2C702878%2C704145%2C699036%2C595556%2C697759%2C705405%2C670151%2C661453%2C654127%2C698630%2C660830%2C688723%2C662350%2C633486%2C662684%2C661781%2C457480%2C649403%2C648317%2C654049"
            "&ab_client=a1%2Cc4%2Ce1%2Cf1%2Cg2%2Cf7&ab_feature=94563%2C102749&abflag=3&ssmix=a&device_type=MI+8&device_brand=Xiaomi&language=zh&os_api=27&os_version=8.1.0&openudid=1a16ce94f2005274&manifest_version_code=707&resolution=1080*2118&dpi=440&update_version_code=70714&_rticket=1547795488503&plugin=26958&fp=9lT_FSDqFYPZFlwIFrU1FYwIPM4q&tma_jssdk_version=1.10.3.4&rom_version=miui_v10_8.8.31&ts=1547795488&as=a2c5879430624c8cd12044&mas=00f71df35ab69fe5b9d8e4e1ec4ea19fc10f42e68cc0e4e63a"
        )
        # Local copy of the request headers, so this helper does not depend on
        # any module-level state.
        headers = {
            "User-Agent": "Mozilla/5.0 (Linux; U; Android 6.0.1; zh-cn; MI 5s Build/MXB48T) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/8.7.1"
        }
        response = requests.post(url=url, timeout=100, headers=headers)
        text = str(response.content, encoding='utf-8')
        print(text)
    
    def get_detail_url(result_list):
        """Build the article-detail and comment URLs for each search result.

        Args:
            result_list: Iterable of result dicts. Each is read for ``"id"``
                (falling back to ``"group_id"``) and ``"title"``. ``None`` is
                treated as an empty list.

        Returns:
            A list of dicts with keys ``"detailUrl"``, ``"commentUrl"``,
            ``"id"`` and ``"title"`` (``title`` is ``None`` when absent).
            Results that carry neither ``id`` nor ``group_id`` are skipped,
            because a URL cannot be built for them.
        """
        # seconds, milliseconds
        time_second, time_second_min = get_time()
        # The concatenation is wrapped in parentheses so it can legally span
        # several lines.
        # NOTE(review): the device/signature parameters are fixed values,
        # presumably captured from a real app session -- confirm validity.
        detail_url_param = (
            "iid=57820401425&device_id=54550815314&ac=wifi&channel=xiaomi&aid=13&app_name=news_article&version_code=707&version_name=7.0.7&device_platform=android&ab_version="
            + "475404%2C680425%2C687252%2C684578%2C571130%2C665173%2C674056%2C639003%2C612193%2C691933%2C170988%2C643891%2C374117%2C687462%2C688267%2C655402%2C702095%2C613176%2C550042%2C686297%2C690816%2C687745%2C690975%2C649426%2C614097%2C677129%2C685523%2C522766%2C701302%2C416055%2C684977%2C703944%2C689886%2C693247%2C558140%2C586260%2C555254%2C471406%2C603441%2C700492%2C596392%2C660510%2C598626%2C701730%2C700540%2C686885%2C701724%2C677898%2C603383%2C603401%2C603403%2C603405%2C638928%2C699227%2C696109%2C703265%2C686031%2C661904%2C662644%2C703737%2C668775%2C673945%2C692060%2C693468%2C629151%2C645714%2C607361%2C609338%2C666965%2C698916%2C635529%2C669649%2C662099%2C696796%2C701078%2C693364%2C703077%2C697038%2C703339%2C689538%2C697022%2C668774%2C683805%2C698097%2C698380%2C688105%2C554836%2C694759%2C549647%2C699616%2C31240%2C572465%2C656568%2C644058%2C615291%2C606547%2C681183%2C703370%2C673168%2C702884%2C671426%2C546701%2C702195%2C641190%2C281297%2C678046%2C325620%2C678477%2C665474%2C696624%2C669034%2C700459%2C625065%2C652953%2C696373%2C696990%2C698915%2C693900%2C703230%2C680284%2C638336%2C467514%2C679100%2C697663%2C702714%2C702994%2C699109%2C702878%2C699036%2C595556%2C697759%2C702757%2C670151%2C661453%2C654127%2C698630%2C660830%2C688723%2C690189%2C691671%2C686376%2C699478%2C677774%2C697104%2C700437%2C486951%2C701439%2C662176%2C662350%2C633486%2C662684%2C661781%2C457480%2C649403%2C655988%2C648317%2C654049"
            + "&ab_client=a1%2Cc4%2Ce1%2Cf1%2Cg2%2Cf7&ab_feature=94563%2C102749&abflag=3&ssmix=a&device_type=MI+8&device_brand=Xiaomi&language=zh&os_api=27&os_version=8.1.0&openudid=1a16ce94f2005274&manifest_version_code=707&resolution=1080*2118&dpi=440&update_version_code=70714&_rticket="
            + str(time_second_min)
            + "&plugin=26958&fp=9lT_FSDqFYPZFlwIFrU1FYwIPM4q&tma_jssdk_version=1.10.3.4&rom_version=miui_v10_8.8.31&ts="
            + str(time_second)
            + "&as=a2c555b4d565fcd9004533&mas=005bc89b119dd3e1d3f552f76df48fc2a6f6cdc4e4660e08ab"
        )
        detail_url_head = "http://a.pstatp.com/article/full/22/1/"
        detail_url_center = "/0/0/0/0?"
        comment_url_head = "https://www.toutiao.com/api/comment/list/?group_id="
        comment_url_tail = "&offset=0&count=5"
        detail_list = []
        for item in result_list or []:
            # Prefer "id"; fall back to "group_id" when it is missing or empty.
            id_str = item.get("id", None) or item.get("group_id", None)
            if not id_str:
                # Without an identifier the URLs would contain the text "None".
                continue
            id_text = str(id_str)
            detail_list.append({
                "detailUrl": detail_url_head + id_text + "/" + id_text + detail_url_center + detail_url_param,
                "commentUrl": comment_url_head + id_text + "&item_id=" + id_text + comment_url_tail,
                "id": id_str,
                "title": item.get("title", None),
            })
        return detail_list
    
    def load_detail(detail_list):
        """Download each article's detail and comment JSON and save it to disk.

        For every entry that has a non-empty title, the detail URL and then the
        comment URL are requested, each response body is printed, parsed as
        JSON and written to ``detail/<id><title>.txt`` and
        ``comment/<id><title>.txt`` respectively (UTF-8, non-ASCII kept as-is).
        Entries without a title are skipped.

        Args:
            detail_list: List of dicts with keys ``"detailUrl"``,
                ``"commentUrl"``, ``"id"`` and optionally ``"title"``.
                An empty list or ``None`` does nothing.

        Raises:
            KeyError: If an entry lacks ``"detailUrl"``, ``"commentUrl"`` or ``"id"``.
            ValueError: If a response body is not valid JSON.
            requests.RequestException: On network failure or timeout.
        """
        # Function-scope import keeps this block self-contained.
        import os

        if not detail_list:
            return
        # The output folders are not guaranteed to exist; create them once.
        for folder in ("detail", "comment"):
            os.makedirs(folder, exist_ok=True)
        for item in detail_list:
            detailUrl = item["detailUrl"]
            commentUrl = item["commentUrl"]
            id_str = item["id"]
            title = item.get("title", None)
            if not title:
                continue
            # Strip characters that are unsuitable in a file name: slash,
            # line feed, carriage return and space.
            title = title.replace("/", "").replace("\n", "").replace("\r", "").replace(" ", "")
            file_name = str(id_str) + title + ".txt"
            # Same fetch -> print -> parse -> save steps for both resources,
            # detail first and comments second.
            for folder, url in (("detail", detailUrl), ("comment", commentUrl)):
                response = requests.post(url=url, timeout=100, headers=headers)
                response_str = str(response.content, encoding="utf-8")
                print(response_str)
                # json.loads() no longer accepts an `encoding` keyword
                # (removed in Python 3.9); the text is already decoded.
                response_json = json.loads(response_str)
                with open(os.path.join(folder, file_name), mode="w", encoding="utf-8") as file:
                    file.write(json.dumps(response_json, ensure_ascii=False))
    
    
    #获取时间
    def get_time():
        """Return the current Unix time as ``(seconds, milliseconds)``.

        Both values come from a single ``time.time()`` reading: the seconds
        value is truncated to an int, the milliseconds value is rounded to
        the nearest int.
        """
        now = time.time()
        return int(now), int(round(now * 1000))
    
    
    if __name__ == '__main__':
        # Interactive entry point: read the keyword, page number and tab index
        # from the console, run the search, then download every result.
        # (Call test() here instead to send the fixed sample request.)
        keyword_text = input("请输入关键词:")
        page_text = input("请输入页数:")
        tab_text = input("请输入模块:")
        # The numeric inputs are converted only after all three prompts.
        search_items = queryList(keyword_text, int(page_text), int(tab_text))["data"]
        article_links = get_detail_url(search_items)
        print(article_links)
        load_detail(article_links)
  • 相关阅读:
    du
    date
    echo
    redis的多实例
    redis相关配置
    mariadb的安装与主从复制
    11.Flask-钩子函数
    Python之Linux下的virtualenv&&virtualenvwrapper
    Linux的防火墙概念
    安装ipython解释器
  • 原文地址:https://www.cnblogs.com/procedureMonkey/p/10320304.html
Copyright © 2011-2022 走看看