  • [Big Data] Scraping All the Campus News

    1. Get the news details from an article URL, returned as a dict: anews

    def anews(url):
        # Fetch one article page and collect its details into a dict
        newsDetail={}
        res=requests.get(url)
        res.encoding='utf-8'
        soup=BeautifulSoup(res.text,'html.parser')
        newsDetail['newsTitle']=soup.select('.show-title')[0].text
        showinfo=soup.select('.show-info')[0].text
        newsDetail['newsDT']=newsdt(showinfo)    # publication time, parsed by newsdt() (defined below)
        newsDetail['newsClick']=click(url)       # click count, fetched by click() (defined below)
        return newsDetail
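    As a quick check, here is a minimal sketch of calling anews on a single article (the URL is the commented-out example from the full code below; it assumes that page still exists and uses the .show-title/.show-info markup):

    # Sketch: fetch one article; URL taken from the example in the full code below
    url = 'http://news.gzcc.cn/html/2019/xiaoyuanxinwen_0404/11155.html'
    detail = anews(url)
    print(detail['newsTitle'], detail['newsDT'], detail['newsClick'])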
    

    2. Get the news URLs from a list-page URL: list append(dict), alist

    def alist(url):
        # Collect every news item on one list page
        res=requests.get(url)
        res.encoding='utf-8'
        soup=BeautifulSoup(res.text,'html.parser')
        newsList=[]
        for news in soup.select('li'):
            if len(news.select('.news-list-title'))>0:
                newsUrl=news.select('a')[0]['href']
                newsDesc=news.select('.news-list-description')[0].text
                newsDict=anews(newsUrl)
                newsDict['description']=newsDesc
                newsList.append(newsDict)
        return newsList

    listUrl='http://news.gzcc.cn/html/xiaoyuanxinwen/'
    alist(listUrl)
    

      

    3. Generate the URLs of all the list pages and fetch all the news: list extend(list), allnews

    allnews = []
    for i in range(2,12):    # list pages 2 through 11
        listUrl='http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        allnews.extend(alist(listUrl))

    allnews

    * Each student should crawl the 10 list pages starting from the last digit of their own student ID; see the sketch below.
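    A minimal sketch of that per-student variant, assuming a hypothetical tail variable holding the student ID's last digit (list pages follow the same {}.html pattern as above):

    # Hypothetical: tail is the last digit of your student ID, e.g. 7
    tail = 7
    allnews = []
    for i in range(tail, tail+10):    # 10 consecutive list pages starting at tail
        listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        allnews.extend(alist(listUrl))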

    4. Set a reasonable crawl interval

    import time
    import random
    time.sleep(random.random()*3)    # pause for a random 0-3 seconds
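    A sketch of wiring that pause into the step-3 loop (reusing alist and allnews from above), so consecutive list-page requests are spaced out:

    # Sleep 0-3 seconds between list-page requests to avoid hammering the server
    for i in range(2,12):
        listUrl='http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        allnews.extend(alist(listUrl))
        time.sleep(random.random()*3)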
    

      

    5. Do simple data processing with pandas and save the results

    Save to a CSV or Excel file

    Example save call: newsdf.to_csv(r'F:\duym\爬虫\gzccnews.csv')

    newsdf=pd.DataFrame(allnews)
    newsdf

    # Sort by click count, descending
    newsdf.sort_values(by=['newsClick'],ascending=False)

    # Save the data to a CSV file
    newsdf.to_csv(r'F:\hhaa.csv')
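    The step title also mentions Excel; a minimal sketch using DataFrame.to_excel (the path is hypothetical, and an Excel writer engine such as openpyxl must be installed):

    # Hypothetical path; requires an Excel writer engine such as openpyxl
    newsdf.to_excel(r'F:\gzccnews.xlsx')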

     

    Save to a database

    import sqlite3
    import pandas
    with sqlite3.connect('gzccnewsdb.sqlite') as db:
        df2=pandas.read_sql_query('SELECT * FROM gzccnewsdb',con=db)

    # Show the news whose click count exceeds 385: df2[df2['newsClick']>385]
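    Note that the snippet above only reads the table back; it assumes the scraped data was written first. A sketch of that write step with DataFrame.to_sql:

    # Sketch: write the scraped frame into the gzccnewsdb table before querying it
    with sqlite3.connect('gzccnewsdb.sqlite') as db:
        newsdf.to_sql('gzccnewsdb',con=db,if_exists='replace')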

    Full code:

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re
    import sqlite3
    import pandas as pd
    import time
    import pandas
    import random
    
    # Get the click count of one article
    def click(url):
        id=re.findall(r'\d{1,5}',url)[-1]    # the article id is the last run of digits in the URL
        clickUrl='http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(id)
        resClick=requests.get(clickUrl)
        newsClick=int(resClick.text.split('.html')[-1].lstrip("('").rstrip("');"))
        return newsClick
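    For intuition, the count endpoint returns a short JavaScript snippet rather than a bare number; an illustrative example of what the stripping does (the response text below is made up, not captured from the live API):

    # Illustrative only: a response of the shape the parser expects
    sample = "$('#hits').html('1234');"
    int(sample.split('.html')[-1].lstrip("('").rstrip("');"))    # -> 1234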

    # Get the publication time of one article
    def newsdt(showinfo):
        newsDate=showinfo.split()[0].split(':')[1]    # e.g. '发布时间:2019-04-04' -> '2019-04-04'
        newsTime=showinfo.split()[1]                  # e.g. '11:12:13'
        newsDT=newsDate+' '+newsTime
        dt=datetime.strptime(newsDT,'%Y-%m-%d %H:%M:%S')
        return dt
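    A hedged illustration of the show-info string shape newsdt expects (the input below is made up, not captured from the site):

    # Illustrative input only; real show-info lines carry more fields after the time
    showinfo='发布时间:2019-04-04 11:12:13 作者:test'
    newsdt(showinfo)    # -> datetime.datetime(2019, 4, 4, 11, 12, 13)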
    # Get the details of one article
    def anews(url):
        newsDetail={}
        res=requests.get(url)
        res.encoding='utf-8'
        soup=BeautifulSoup(res.text,'html.parser')
        newsDetail['newsTitle']=soup.select('.show-title')[0].text
        showinfo=soup.select('.show-info')[0].text
        newsDetail['newsDT']=newsdt(showinfo)
        newsDetail['newsClick']=click(url)
        return newsDetail

    # Get all the news items on one list page
    def alist(url):
        res=requests.get(url)
        res.encoding='utf-8'
        soup=BeautifulSoup(res.text,'html.parser')
        newsList=[]
        for news in soup.select('li'):
            if len(news.select('.news-list-title'))>0:
                newsUrl=news.select('a')[0]['href']
                newsDest=news.select('.news-list-description')[0].text
                newsDict=anews(newsUrl)
                newsDict['description']=newsDest
                newsList.append(newsDict)
        return newsList

    #url = 'http://news.gzcc.cn/html/2019/xiaoyuanxinwen_0404/11155.html'
    #anews(url)

    url = 'http://news.gzcc.cn/html/xiaoyuanxinwen'
    res=requests.get(url)
    res.encoding='utf-8'
    soup=BeautifulSoup(res.text,'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title'))>0:
            newsUrl=news.select('a')[0]['href']

    allnews = []

    # Fetch all the news on list pages 2 through 11
    for i in range(2,12):
        listUrl='http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        allnews.extend(alist(listUrl))
    pd.Series(allnews)

    # Save to file
    newsdf=pd.DataFrame(allnews)
    newsdf.sort_values(by=['newsClick'],ascending=False)
    newsdf.to_csv(r'F:\hhb.csv')

    # Show the news whose click count exceeds 385
    with sqlite3.connect('gzccnewsdb.sqlite') as db:
        df2=pandas.read_sql_query('SELECT * FROM gzccnewsdb',con=db)
    df2[df2['newsClick']>385]

    The results are shown below:

    [Figure: the data written to the CSV file]

    [Figure: the news whose click count exceeds 385]

  • Original post: https://www.cnblogs.com/hongna/p/10671971.html