zoukankan      html  css  js  c++  java
  • 获取全部校园新闻

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re

    # Fetch the click count of a news article.
    def getClickCount(newsUrl):
        """Return the hit count that the counter API reports for a news page.

        The article id is extracted from ``newsUrl`` (the path component that
        follows the first ``_.../`` segment, without the ``.html`` suffix) and
        sent to the ``oa.gzcc.cn`` counter endpoint. The count is then read
        from the ``hits`` assignment in the response text.

        Args:
            newsUrl: URL of the article page, e.g. one ending in
                ``xiaoyuanxinwen_0404/9183.html``.

        Returns:
            The hit count as a string, exactly as it appears in the response.

        Raises:
            AttributeError: if ``newsUrl`` or the response text does not match
                the expected pattern (``re.search`` returned ``None``).
            IndexError: if the matched part of ``newsUrl`` contains no ``/``.
        """
        # Raw string: '\_' is an invalid escape in a normal literal, and the
        # dot before "html" must be escaped to match a literal '.'.
        newId = re.search(r'_(.*)\.html', newsUrl).group(1).split('/')[1]
        clickUrl = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(newId)
        clickStr = requests.get(clickUrl).text
        # The literal parentheses of "$('#hits').html('N');" must be escaped;
        # unescaped they make the pattern invalid (unbalanced parenthesis).
        # The non-greedy group stops at the first closing "');".
        count = re.search(r"hits'\)\.html\('(.*?)'\);", clickStr).group(1)
        return count

    # Fetch and print the details of a single news article.
    def _infoField(info, label):
        """Return the whitespace-delimited value that follows ``label`` in ``info``.

        Returns the string 'none' when ``label`` does not occur in ``info``.
        """
        start = info.find(label)
        if start < 0:
            return 'none'
        # info[start:] begins with the label, so the first token is
        # "<label><value>"; drop the label by length instead of str.lstrip(),
        # which strips a *set of characters* and could eat into the value.
        token = info[start:].split()[0]
        return token[len(label):]


    def getNewsDetail(newsurl):
        """Download one news page and print its metadata.

        Prints, in order: publication datetime, click count, author, URL,
        title and source. Author and source are printed as 'none' when the
        corresponding label is absent from the '.show-info' text.

        Args:
            newsurl: URL of the article page.

        Raises:
            IndexError: if the page has no '.show-title' or '.show-info' node.
            ValueError: if no 'YYYY-MM-DD HH:MM:SS' timestamp is found in the
                '.show-info' text.
        """
        resd = requests.get(newsurl)
        resd.encoding = 'utf-8'
        soupd = BeautifulSoup(resd.text, 'html.parser')
        title = soupd.select('.show-title')[0].text
        info = soupd.select('.show-info')[0].text

        # Locate the timestamp directly rather than stripping the label
        # characters and slicing a fixed width from the start of the text.
        stamp = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', info)
        if stamp is None:
            raise ValueError('no publication time found in: {!r}'.format(info))
        dt = datetime.strptime(stamp.group(0), '%Y-%m-%d %H:%M:%S')

        source = _infoField(info, '来源:')
        author = _infoField(info, '作者:')
        click = getClickCount(newsurl)
        print(dt, click, author, newsurl, title, source)


    def getListPage(listPageUrl):
        """Process every news entry found on one list page.

        Downloads ``listPageUrl``, walks all ``<li>`` elements and, for each
        one that contains a '.news-list-title' node, passes the ``href`` of
        its first ``<a>`` to ``getNewsDetail``.

        Args:
            listPageUrl: URL of the list page to scan.
        """
        response = requests.get(listPageUrl)
        response.encoding = 'utf-8'
        page = BeautifulSoup(response.text, 'html.parser')
        for item in page.select('li'):
            # Skip <li> elements that have no '.news-list-title' descendant.
            if not item.select('.news-list-title'):
                continue
            link = item.select('a')[0].attrs['href']
            getNewsDetail(link)

    # Driver: read the total article count from the index page, derive the
    # number of list pages, then process every list page.
    ListPageUrl = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
    res = requests.get(ListPageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # The first '.a1' node's text is the total count followed by '条'.
    total = int(soup.select('.a1')[0].text.rstrip('条'))
    # Ceiling division by the page size of 10 that the original "//10" implies;
    # "total//10 + 1" yields one page too many when total is a multiple of 10.
    n = (total + 9) // 10

    # The index URL serves as the first list page.
    getListPage(ListPageUrl)
    # Visit every remaining numbered page; range(n, n+1) only reached the last
    # one. NOTE(review): assumes numbered pages start at 2.html — confirm.
    for i in range(2, n + 1):
        listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        getListPage(listUrl)

  • 相关阅读:
    纯CSS星级评价
    Enterprise Library启用签名后发生 PublicKeyToken错误,HRESULT:0x80131040解决
    SQL Server
    该如何选择国外VPS
    网站的伪静态化
    kernel FIELD_SIZEOF宏 NULL地址不访问不出错
    Activity的四种加载模式
    Git magic 简短git使用命令集
    为什么包含多句代码的宏要用do while包括起来?
    使用lsof来查看FD和进程的关系
  • 原文地址:https://www.cnblogs.com/dengjinxiu/p/8798993.html
Copyright © 2011-2022 走看看