zoukankan      html  css  js  c++  java
  • 爬取微博文章内容,关键字搜索爬取

    # coding:utf-8
    import requests
    import json
    from bs4 import BeautifulSoup
    
    # 搜索页面
    def get_home_page(page, keyword="台风山竹", timeout=10):
        """Fetch one Weibo article-search result page and return its article links.

        Args:
            page: Page number substituted into the ``page`` query parameter of
                the search URL.
            keyword: Search keyword. It is percent-encoded as UTF-8 before being
                placed in the URL; the default produces the same query string
                that used to be hard-coded here.
            timeout: Seconds to wait for the server before ``requests`` gives up.

        Returns:
            A list holding the ``href`` value of every anchor matched by the
            ``.card-wrap div div h3 a`` selector, in document order. Anchors
            that carry no ``href`` attribute are skipped.

        Raises:
            requests.RequestException: If the HTTP request fails or times out.
        """
        from urllib.parse import quote

        search_url = (
            "https://s.weibo.com/article?q={}&Refer=weibo_article&page={}"
        ).format(quote(keyword, safe=""), page)
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(search_url, timeout=timeout)
        soup = BeautifulSoup(response.text, features="lxml")
        anchors = soup.select(".card-wrap div div h3 a")
        # Guard against anchors without an href instead of raising KeyError.
        return [anchor["href"] for anchor in anchors if anchor.has_attr("href")]
    
    # 爬取文章页面
    def get_content(html_1, timeout=10):
        """Download one Weibo article page and extract its title and body text.

        Args:
            html_1: URL of the article page, e.g.
                ``https://weibo.com/ttarticle/p/show?id=2309404344349319115732``.
            timeout: Seconds to wait for the server before ``requests`` gives up.

        Returns:
            A dict with two keys: ``"article_title"`` (text of the first
            ``div.title`` element) and ``"article_content"`` (text of the first
            ``div.WB_editor_iframe`` element).

        Raises:
            IndexError: If the page contains no ``div.title`` or no
                ``div.WB_editor_iframe`` element.
            requests.RequestException: If the HTTP request fails or times out.
        """
        # Browser-like request headers sent with every article request.
        # NOTE(review): the Cookie value is a hard-coded session string that
        # looks copied from a logged-in browser; it has presumably expired and
        # should be supplied from configuration rather than source - confirm.
        # NOTE(review): "br" is advertised in Accept-Encoding; verify that the
        # environment can decode Brotli responses.
        header = {
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding":"gzip, deflate, br",
            "Accept-Language":"zh-CN,zh;q=0.9",
            "Cache-Control":"no-cache",
            "Connection":"keep-alive",
            "Cookie":"SINAGLOBAL=3045531231804.757.1542339154287; UOR=,,www.baidu.com; wb_view_log=1536*8641.25; un=18722846347; wvr=6; wb_view_log_5816188628=1536*8641.25; wb_timefeed_5816188628=1; Ugrow-G0=169004153682ef91866609488943c77f; ALF=1582963311; SSOLoginState=1551427312; SCF=AqVoDry8DgrNPZLa7pkwesp4oyNNCWdcpgWQj1ZQ-7Z-GLuL8HEFsKKTy9LKsa0aIgLjeETtnBpvLqf1pFX-H9o.; SUB=_2A25xfJagDeRhGeNG6lQQ-CbKyTSIHXVSC49orDV8PUNbmtBeLXmmkW9NS2iTzhh_-emgxKLZvYfKTt4TWxAYQ4t0; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW_VVQdF_QKBI9FlsOrs.5c5JpX5KzhUgL.Fo-ReKqp1hnceon2dJLoIXzLxKnLB--LBo5LxK-LB.qL1hqLxKML1-2L1hBLxK-L1-zLBoBLxKnL1hBL1KqLxK-LBo5L12qLxKMLB-2L1-BR1h-t; SUHB=0A0JvAxWF_4iQM; YF-V5-G0=3717816620d23c89a2402129ebf80935; _s_tentry=login.sina.com.cn; Apache=8257276420249.566.1551427311659; YF-Page-G0=d52660735d1ea4ed313e0beb68c05fc5; ULV=1551427311697:7:2:3:8257276420249.566.1551427311659:1551408473325; webim_unReadCount=%7B%22time%22%3A1551427323547%2C%22dm_pub_total%22%3A0%2C%22chat_group_pc%22%3A0%2C%22allcountNum%22%3A41%2C%22msgbox%22%3A0%7D; WBStorage=f3685954b8436f62|undefined",
            "Host":"weibo.com",
            "Pragma": "no-cache",
            "Upgrade-Insecure-Requests":"1",
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"
        }
        # Without a timeout a stalled connection would block the crawl forever.
        get_text = requests.get(html_1, headers=header, timeout=timeout)
        soup_text = BeautifulSoup(get_text.text, features="lxml")
        # Indexing with [0] deliberately raises IndexError when the expected
        # element is missing, so the caller can skip pages it cannot parse.
        article_title = soup_text.select("div.title")[0].text
        article_content = soup_text.select("div.WB_editor_iframe")[0].text
        print("获取成功")
        return {"article_title": article_title, "article_content": article_content}
    
    def run(n=100):
        """Crawl search-result pages 1 .. n-1 and dump all articles to a JSON file.

        For every search page the article links are collected with
        ``get_home_page`` and each linked article is fetched with
        ``get_content``. Articles that cannot be fetched or parsed are skipped.
        The collected list of dicts is written to
        ``./article_file/all_article.json``.

        Args:
            n: Exclusive upper bound of the page range; pages ``1`` to ``n - 1``
                are requested (99 pages with the default).
        """
        import os

        output_path = "./article_file/all_article.json"
        articles = []
        for page in range(1, n):
            try:
                urls = get_home_page(page)
            except requests.RequestException as exc:
                # One unreachable search page must not discard everything
                # collected so far; report it and move on to the next page.
                print("搜索页获取失败: page={} ({})".format(page, exc))
                continue
            for url in urls:
                try:
                    articles.append(get_content(url))
                except Exception:
                    # Best-effort crawl: skip articles that fail to download or
                    # parse, but let KeyboardInterrupt/SystemExit propagate.
                    print("内容有问题")
        # Create the target directory up front so the crawl result is not lost
        # to a FileNotFoundError at the very end.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # json.dump escapes non-ASCII characters by default, so the file content
        # is plain ASCII regardless of the gbk codec chosen here.
        with open(output_path, "w", encoding="gbk") as f:
            json.dump(articles, f)
    
    # Start the crawl only when this file is executed as a script, not on import.
    if __name__ == "__main__":
        run()
    

      

  • 相关阅读:
    Windows Server 2008 IIS7.0 发布html和Asp.net网站
    GS+9.0地统计软件学习
    将Mxd文件压缩并上传到Geodatabase!
    Git代码版本管理
    一台机器,两个Oracle数据库,两个SDE服务!
    翻译:A Picturebox Control to Display Both Remote Sensing and Regular Digital Images
    ENVI处理Modis数据学习
    C#开源资源
    winform中ToString()、DateTime.ToString()、DateTime
    加速Vs2008
  • 原文地址:https://www.cnblogs.com/wuzaipei/p/10458613.html
Copyright © 2011-2022 走看看