zoukankan      html  css  js  c++  java
  • 贴吧爬取

    # coding=utf-8
    import requests
    import re
    from requests_html import HTMLSession
    import pandas as pd
    import time
    
    # Shared requests_html session; get_single() uses it to fetch thread pages.
    session = HTMLSession()
    
    # Request headers passed to every HTTP call in this script (gen_all_urls and
    # get_single): an iPhone/Safari User-Agent string plus a Baidu cookie string.
    # NOTE(review): the Cookie value appears to embed account session tokens
    # (BDUSS, STOKEN) -- confirm, and consider loading it from the environment
    # rather than keeping it in source.
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
        'Cookie': 'BAIDUID=0AD95F29B28B1C69CF12212918D35FC5:FG=1; BDUSS=xRTTRqU2poYXJxZmx5bTF0dm5iVERtdWRnTC1hbDJIbnltcGlOcmtuejk1VDViQVFBQUFBJCQAAAAAAAAAAAEAAAC4ED841cW4o8H6MjAxM8zs0KsAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP1YF1v9WBdbV0; BIDUPSID=0AD95F29B28B1C69CF12212918D35FC5; PSTM=1528257025; TIEBAUID=eaa5821fe8cd6332e9f74ebe; TIEBA_USERTYPE=4fe0d47f0a8a56b9153531e1; bdshare_firstime=1529484152117; STOKEN=fb86f516529f2e700875d976398014ccffa45fc25536938272acb3cef065221a; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; UM_distinctid=1651dfa3911746-0d266b95e90a93-163f6952-13c680-1651dfa391242f; Hm_lvt_addc40d255fca71b9b06a07c2397b42a=1533006153,1533094604,1533611406,1533637141; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; PSINO=2; H_PS_PSSID=1421_21080_26921_20927; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1533711844,1533711977,1533781285,1533868202; 943657144_FRSVideoUploadTip=1; mo_originid=2; IS_NEW_USER=121622ee0999d777aa2e3fa8; BAIDU_WISE_UID=wapp_1533868860558_698; CLIENTWIDTH=375; CLIENTHEIGHT=667; LASW=375; fixedbarautopop=1; recommend_item_click=0; wise_device=1; pb_prompt=1; SET_PB_IMAGE_WIDTH=355; SEENKW=%E6%89%AB%E7%A0%81%23%C9%A8%C2%EB; CNZZDATA1272960286=201730737-1529483780-null%7C1533869631; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1533870061'
    }
    
    
    # Listing-page URL template; the single ``{}`` placeholder receives the
    # paging offset.
    # NOTE(review): the query string carries both ``pn=0`` and ``pn={}``; which
    # one the server honours is not visible here -- confirm before changing.
    url_first = 'https://tieba.baidu.com/mo/q/m?kw=%E6%89%AB%E7%A0%81&pn=0&lp=5024&forum_recommend=1&lm=0&cid=0&has_url_param=0&pn={}&is_ajax=1'
    # 71 listing URLs with offsets 50, 100, ..., 3550 (offset 0 is not requested).
    all_first_urls = list(map(url_first.format, range(50, 50 * 72, 50)))
    
    # Accumulator of absolute thread URLs; gen_all_urls() extends it in place.
    all_fina_url = []
    
    def gen_all_urls(url):
        """Collect thread links from one listing page into ``all_fina_url``.

        Fetches ``url`` with the module-level ``headers``, reads the string
        stored under ``data`` -> ``content`` in the JSON response, and appends
        every matching thread link (prefixed with the site origin) to the
        module-level ``all_fina_url`` list.

        Args:
            url: Listing-page URL, e.g. one entry of ``all_first_urls`` such as
                ``...&has_url_param=0&pn=50&is_ajax=1``.

        Returns:
            None. Results are accumulated in ``all_fina_url``.
        """
        res = requests.get(url, headers=headers)
        listing_html = res.json()['data']['content']

        # Thread links look like /p/<digits>?lp=5027&mo_device=1&is_jingpost=0.
        # ``\d+`` matches the numeric thread id, and the ``?`` is escaped so it
        # is a literal question mark rather than a lazy quantifier.
        thread_paths = re.findall(
            r'href="(/p/\d+\?lp=5027&mo_device=1&is_jingpost=0)"', listing_html)

        all_fina_url.extend(
            'https://tieba.baidu.com' + path for path in thread_paths)
    
    
    # Accumulator of scraped rows; get_single() appends to it and the main block
    # turns it into a DataFrame.
    all_fina_data = []
    
    
    def get_single(url):
        """Scrape one thread page and append its reply texts to ``all_fina_data``.

        Fetches ``url`` through the module-level ``session`` with ``headers``,
        selects every ``div[lz="0"]`` element, and appends one row per element
        to the module-level ``all_fina_data`` list. Each row is a dict with the
        single key ``'回复'`` ("reply") holding the element's text.

        Args:
            url: Thread URL, e.g.
                ``https://tieba.baidu.com/p/5819837590?lp=5027&mo_device=1&is_jingpost=0``.

        Returns:
            None. Results are accumulated in ``all_fina_data``.
        """
        r = session.get(url, headers=headers)

        # NOTE(review): lz="0" presumably marks posts not written by the thread
        # starter -- confirm against the page markup.
        for reply in r.html.find('div[lz="0"]'):
            # Build a fresh dict per reply; sharing one dict across iterations
            # would leave every row showing the text of the last reply.
            all_fina_data.append({'回复': reply.text})
    
    
    if __name__ == '__main__':
        # Step 1: walk every listing page and gather thread URLs.
        for listing_url in all_first_urls:
            gen_all_urls(listing_url)

        # Step 2: visit each gathered thread and collect its rows.
        for thread_url in all_fina_url:
            get_single(thread_url)

        # Step 3: dump the rows to a timestamped Excel file (minute resolution).
        frame = pd.DataFrame(all_fina_data)
        stamp = time.strftime("%Y%m%d%H%M")
        out_name = '扫码贴吧信息' + stamp + '.xlsx'
        frame.to_excel(out_name, index=False)
        print('done')
  • 相关阅读:
    ES6相关概念及新增语法
    正则表达式
    递归
    高阶函数和闭包
    严格模式
    this指向
    递归
    严格模式
    函数内部的this指向
    函数的定义和调用
  • 原文地址:https://www.cnblogs.com/Erick-L/p/9491866.html
Copyright © 2011-2022 走看看