zoukankan      html  css  js  c++  java
  • 4-15 爬取新浪网

    import requests
     3 from bs4 import BeautifulSoup
     4 from datetime import datetime
     5 import re
     6 import json
     7 import pandas
     8 
     9 def getNewsdetial(newsurl):
    10     res = requests.get(newsurl)
    11     res.encoding = 'utf-8'
    12     soup = BeautifulSoup(res.text,'html.parser')
    13     newsTitle = soup.select('.page-header h1')[0].text.strip()
    14     nt = datetime.strptime(soup.select('.time-source')[0].contents[0].strip(),'%Y年%m月%d日%H:%M')
    15     newsTime = datetime.strftime(nt,'%Y-%m-%d %H:%M')
    16     newsArticle = getnewsArticle(soup.select('.article p'))
    17     newsAuthor = newsArticle[-1]
    18     return newsTitle,newsTime,newsArticle,newsAuthor
    19 def getnewsArticle(news):
    20     newsArticle = []
    21     for p in news:
    22          newsArticle.append(p.text.strip())
    23     return newsArticle
    24 
# Get the number of comments
    26 
    27 def getCommentCount(newsurl):
    28     m = re.search('doc-i(.+).shtml',newsurl)
    29     newsid = m.group(1)
    30     commenturl = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
    31     comment = requests.get(commenturl.format(newsid))   #将要修改的地方换成大括号,并用format将newsid放入大括号的位置
    32     jd = json.loads(comment.text.lstrip('var data='))
    33     return jd['result']['count']['total']
    34 
    35 
    36 def getNewsLinkUrl():
    37 #     得到异步载入的新闻地址(即获得所有分页新闻地址)
    38     urlFormat = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1501000415111'
    39     url = []
    40     for i in range(1,10):
    41         res = requests.get(urlFormat.format(i))
    42         jd = json.loads(res.text.lstrip('  newsloadercallback(').rstrip(');'))
    43         url.extend(getUrl(jd))     #entend和append的区别
    44     return url
    45 
    46 def getUrl(jd):
    47 #     获取每一分页的新闻地址
    48     url = []
    49     for i in jd['result']['data']:
    50         url.append(i['url'])
    51     return url
    52 
# Get each article's time, editor, content, title and comment count, and combine them in total_2
    54 def getNewsDetial():
    55     title_all = []
    56     author_all = []
    57     commentCount_all = []
    58     article_all = []
    59     time_all = []
    60     url_all = getNewsLinkUrl()
    61     for url in url_all:
    62         title_all.append(getNewsdetial(url)[0])
    63         time_all.append(getNewsdetial(url)[1])
    64         article_all.append(getNewsdetial(url)[2])
    65         author_all.append(getNewsdetial(url)[3])
    66         commentCount_all.append(getCommentCount(url))
    67     total_2 = {'a_title':title_all,'b_article':article_all,'c_commentCount':commentCount_all,'d_time':time_all,'e_editor':author_all}
    68     return total_2
    69 
    70 # ( 运行起始点 )用pandas模块处理数据并转化为excel文档
    71 
    72 df = pandas.DataFrame(getNewsDetial())
    73 df.to_excel('news2.xlsx')
  • 相关阅读:
    maven中没找到settings.xml文件怎么办,简单粗暴
    如何修改新建后的maven的jdk版本号,简单粗暴
    如何修改maven下载的jar包存放位置,简单粗暴方法
    Kafka 温故(一):Kafka背景及架构介绍
    八、Kafka总结
    七、Kafka 用户日志上报实时统计之编码实践
    六、Kafka 用户日志上报实时统计之分析与设计
    五、Kafka 用户日志上报实时统计之 应用概述
    四、Kafka 核心源码剖析
    三、消息处理过程与集群维护
  • 原文地址:https://www.cnblogs.com/coder-2017/p/8848951.html
Copyright © 2011-2022 走看看