zoukankan      html  css  js  c++  java
  • 爬取校园新闻首页的新闻

    
    

    1. 用requests库和BeautifulSoup库,爬取校园新闻首页新闻的标题、链接、正文、show-info。

    2. 分析info字符串,获取每篇新闻的发布时间,作者,来源,摄影等信息。

    import re
    import string
    from datetime import datetime

    import requests
    from bs4 import BeautifulSoup

    # Campus-news listing page that the rest of the script crawls.
    newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'

    # Download the listing page; requests.get returns a Response object.
    res = requests.get(url=newsurl)
    # Decode the body as UTF-8 instead of relying on the detected encoding.
    res.encoding = 'utf-8'

    # Parse the listing HTML with the 'html.parser' backend.
    soup = BeautifulSoup(markup=res.text, features='html.parser')

    def getNewsId(url):
        """Return the click (hit) count of the news article at ``url``.

        Despite its name, this function does not return the news id itself: it
        extracts the numeric id from the article URL, queries the counter API
        with that id and returns the hit count found in the reply.

        Args:
            url: Article URL containing the numeric news id directly before
                ``.html``.

        Returns:
            int: The click count parsed from the counter API reply.

        Raises:
            ValueError: If ``url`` has no ``<digits>.html`` part, or the counter
                reply does not contain a hit count.
        """
        # Take the whole run of digits before ".html" rather than the last four
        # characters of a greedy capture, so ids of any length survive intact.
        idMatch = re.search(r'(\d+)\.html', url)
        if idMatch is None:
            raise ValueError('no news id found in URL: {}'.format(url))
        newsId = idMatch.group(1)

        # Build the request URL of the click counter for this article.
        clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
        clickRes = requests.get(clickUrl)

        # The reply is expected to contain  hits').html('<count>');  -- the
        # parentheses and the dot are literal text and therefore escaped.
        hitsMatch = re.search(r"hits'\)\.html\('(\d+)'\);", clickRes.text)
        if hitsMatch is None:
            raise ValueError('no click count found in reply from {}'.format(clickUrl))
        return int(hitsMatch.group(1))

    
    
    
    
    

    def getNewDetail(newsUrl, title='', description=''):
        """Fetch one news article and print its details to stdout.

        Downloads the detail page, reads the body text from ``.show-content``
        and the metadata line from ``.show-info``, splits the metadata into its
        labelled fields, looks up the click count via ``getNewsId`` and prints
        everything.

        Args:
            newsUrl: URL of the article detail page; also printed as the link.
            title: Optional headline to print (the detail page is not parsed
                for it). Defaults to an empty string.
            description: Optional summary to print. Defaults to an empty string.

        Returns:
            None.

        Raises:
            IndexError: If the page has no ``.show-content`` or ``.show-info``
                element.
            ValueError: If a publish time is present but is not formatted as
                ``%Y-%m-%d %H:%M:%S``.
        """
        # Download and parse the article detail page.
        resDescript = requests.get(newsUrl)
        resDescript.encoding = "utf-8"
        soupDescript = BeautifulSoup(resDescript.text, 'html.parser')

        content = soupDescript.select(".show-content")[0].text  # article body
        info = soupDescript.select(".show-info")[0].text  # metadata line

        # Labels that may appear in the metadata line. A field's value runs from
        # its own label up to whichever label comes next (or the end of the
        # text), so a missing field does not break the extraction of the others.
        labels = ('发布时间', '作者', '审核', '来源', '摄影', '点击')
        anyLabel = '|'.join(labels)

        def infoField(label):
            # \s also matches the non-breaking spaces (\xa0) that separate the
            # fields; both the ASCII and the full-width colon are accepted.
            found = re.search(
                r'{}[::]\s*(.*?)\s*(?=(?:{})[::]|$)'.format(label, anyLabel),
                info,
                re.S,
            )
            return found.group(1) if found else ''

        publishTime = infoField('发布时间')
        author = infoField('作者')
        reviewer = infoField('审核')
        source = infoField('来源')
        photographer = infoField('摄影')

        # getNewsId() returns the click count for this article.
        count = getNewsId(newsUrl)

        # Convert the publish-time string into a datetime object when present.
        dateTime = None
        if publishTime:
            dateTime = datetime.strptime(publishTime, '%Y-%m-%d %H:%M:%S')

        print('标题' + ': ' + title)
        print('概要' + ': ' + description)
        print('链接' + ': ' + newsUrl)
        print('正文' + ' :' + content)
        print('发布时间:{0} 作者:{1} 审核:{2} 来源:{3} 摄影:{4} 点击次数:{5}'.format(
            dateTime, author, reviewer, source, photographer, count))
        print(" ")

    
    
    # print(soup)
    # for i in soup.select('.news-list-title'):
    #     print(i)
    # for k in soup.select('.news-list-description'):
    #     print(k)
    # for j in soup.find_all('a'):
    #  print(j['href'])
    # print(soup.select('li')[10].a.attrs['href'])
    # Walk the <li> elements of the listing page and print the publish time,
    # title, link and author of the first one that is a news entry.
    for news in soup.select('li'):
        # Only <li> items that contain a .news-list-title element are news.
        if not news.select('.news-list-title'):
            continue

        t = news.select('.news-list-title')[0].text  # headline
        d = news.select('.news-list-description')[0].text  # summary
        a = news.a.attrs['href']  # link to the detail page

        # Download and parse the detail page of this news entry.
        res = requests.get(a)
        res.encoding = 'utf-8'
        soupd = BeautifulSoup(res.text, 'html.parser')
        c = soupd.select('#content')[0].text  # article body
        s = soupd.select('.show-info')[0].text  # metadata line

        # str.lstrip() removes a *set of characters*, not a prefix, so it can
        # also eat the first characters of the value. Match the labelled
        # fields explicitly instead; a missing field yields an empty string.
        dateMatch = re.search(
            r'发布时间[::]\s*(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', s)
        date = dateMatch.group(1) if dateMatch else ''

        # The author is the whitespace-delimited token right after its label.
        authorMatch = re.search(r'作者[::](\S*)', s)
        au = authorMatch.group(1) if authorMatch else ''

        print(date, t, a, au)
        # Stop after the first news entry, as the original script does.
        break
  • 相关阅读:
    转载Typora学习笔记
    SpringMVC整体接受前台参数
    基于SSM框架实现oa项目
    Don't know how to iterate over supplied "items" in <forEach>
    springMVC自定义类型转换器(date类型转换)
    springMVC异常处理
    linux服务器基础运维
    什么是服务器
    mysql binlog日志 恢复数据,mysqldump备份
    nginx LVS Haproxy
  • 原文地址:https://www.cnblogs.com/swxvico/p/8707519.html
Copyright © 2011-2022 走看看