zoukankan      html  css  js  c++  java
  • 爬取校园新闻首页的新闻的详情,使用正则表达式,函数抽离

    import requests
    import re
    from bs4 import BeautifulSoup
    from datetime import datetime
    
    # URL of the campus-news list page.
    newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    # Download the page; requests.get returns a Response object.
    res = requests.get(url=newsurl)
    # Force UTF-8 so that res.text is decoded with that codec.
    res.encoding = 'utf-8'
    # Parse the downloaded HTML with the 'html.parser' backend.
    soup = BeautifulSoup(res.text, features='html.parser')
    
    def getClickCount(newsUrl):
        """Print the click count and the news number of one article.

        The numeric article id is taken from ``newsUrl`` and used to query the
        click-count endpoint, so the count belongs to the article being shown.

        Args:
            newsUrl: URL of a news detail page ending in ``<digits>.html``.

        Raises:
            ValueError: if ``newsUrl`` contains no ``<digits>.html`` component.
        """
        idMatch = re.search(r'(\d+)\.html', newsUrl)
        if idMatch is None:
            raise ValueError('no article id found in URL: {!r}'.format(newsUrl))
        newsId = idMatch.group(1)

        # Ask the counter about this article instead of a hard-coded id.
        clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
        # Keep whatever follows the last '.html' in the response and trim the
        # surrounding "('" and "');" characters, leaving the bare count.
        count = requests.get(clickUrl).text.split('.html')[-1].lstrip("('").rstrip("');")
        print("点击次数:", count)

        # The news number is everything between the first '_' and '.html';
        # fall back to the bare id when the URL has no '_' part.
        numberMatch = re.search(r'_(.*)\.html', newsUrl)
        print('新闻编号:', numberMatch.group(1) if numberMatch else newsId)


    def _extractField(info, label):
        """Return the text that directly follows *label* in *info*, or ''.

        The value ends at the next whitespace. The label is removed as an exact
        prefix (``str.lstrip`` would strip a *set* of characters and could eat
        the beginning of the value).
        """
        start = info.find(label)
        if start == -1:
            return ''
        # info[start:] begins with the label, so the first token always exists.
        token = info[start:].split()[0]
        return token[len(label):]


    def _parsePublishTime(info):
        """Return the first 'YYYY-MM-DD HH:MM:SS' timestamp in *info* as a datetime.

        Raises:
            ValueError: if *info* contains no such timestamp.
        """
        stamp = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', info)
        if stamp is None:
            raise ValueError('no publish time found in: {!r}'.format(info))
        return datetime.strptime(stamp.group(0), '%Y-%m-%d %H:%M:%S')


    def getNewDetail(newsUrl):
        """Print the details of the first news entry listed on a news list page.

        Downloads ``newsUrl``, finds the first ``<li>`` that contains a
        ``.news-list-title`` element, downloads the article it links to and
        prints title, link, publish time, author, source, photographer,
        description, click count, news number and body text.

        Args:
            newsUrl: URL of the news list page to read.

        Raises:
            IndexError: if an expected element (link, description, ``#content``
                or ``.show-info``) is missing from the pages.
            ValueError: if the publish time or the article id cannot be found.
        """
        listRes = requests.get(newsUrl)
        listRes.encoding = 'utf-8'
        listSoup = BeautifulSoup(listRes.text, 'html.parser')

        for news in listSoup.select('li'):
            titles = news.select('.news-list-title')
            if not titles:
                # Skip <li> elements that carry no news title.
                continue

            t = titles[0].text  # title
            a = news.select('a')[0].attrs['href']  # link to the detail page
            description = news.select('.news-list-description')[0].text

            # Fetch and parse the detail page once; both the body text and
            # the info line are read from the same document.
            detailRes = requests.get(a)
            detailRes.encoding = 'utf-8'
            detailSoup = BeautifulSoup(detailRes.text, 'html.parser')
            content = detailSoup.select('#content')[0].text
            info = detailSoup.select('.show-info')[0].text

            dt = _parsePublishTime(info)
            author = _extractField(info, '作者:')
            source = _extractField(info, '来源:')
            photo = _extractField(info, '摄影:')

            print("新闻标题:", t)
            print("链接:", a)
            print("发布时间:", dt)
            print("作者:", author)
            print("来源:", source)
            print("摄影:", photo)
            print("描述:", description)
            getClickCount(a)
            print("正文:", content)
            # Only the first news entry is reported.
            break
    
    # Run the scraper with the campus-news list page URL stored in `newsurl`.
    getNewDetail(newsurl)
    

      

  • 相关阅读:
    ssh图示+hibernate图示
    spring Transactional
    Spring datasource
    sqlloader导入数据
    Spring Aop Annotation(@Pointcut)
    ajax传输文件+检验
    Spring Aop Annotation
    JDK的动态代理
    nginx代理gitlab
    python相关
  • 原文地址:https://www.cnblogs.com/1996-yxl/p/8747213.html
Copyright © 2011-2022 走看看