zoukankan      html  css  js  c++  java
  • 爬取校园新闻首页的新闻的详情,使用正则表达式,函数抽离

    import requests
    import re
    from bs4 import BeautifulSoup
    from datetime import datetime
    
    newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    # Fetch the news index page. The timeout keeps the script from hanging
    # forever if the server accepts the connection but never answers.
    res = requests.get(newsurl, timeout=10)  # returns a Response object
    res.encoding = 'utf-8'  # decode the body as UTF-8
    soup = BeautifulSoup(res.text, 'html.parser')
    
    def _fetch_soup(url):
        """Download ``url`` and return it parsed as a BeautifulSoup document.

        The response body is decoded as UTF-8, and the request is given a
        timeout so an unresponsive server cannot block the script forever.
        """
        response = requests.get(url, timeout=10)
        response.encoding = 'utf-8'
        return BeautifulSoup(response.text, 'html.parser')


    def _info_field(info, label):
        """Return the value that follows ``label`` in the info line.

        The value is the rest of the whitespace-delimited token that starts
        with ``label``.  An empty string is returned when the label is absent
        or has no value attached.
        """
        start = info.find(label)
        if start == -1:
            return ''
        # The token begins with the label itself; slice the label off as a
        # prefix.  (str.lstrip(label) would strip a *set of characters* and
        # could eat the beginning of the value.)
        token = info[start:].split()[0]
        return token[len(label):]


    def _parse_publish_time(info):
        """Parse the 19-character timestamp that follows '发布时间:' in ``info``.

        Raises ValueError (from ``datetime.strptime``) if the text is not in
        ``YYYY-MM-DD HH:MM:SS`` form.
        """
        label = '发布时间:'
        start = info.find(label)
        text = info[start + len(label):] if start != -1 else info
        return datetime.strptime(text.strip()[:19], '%Y-%m-%d %H:%M:%S')


    def getClickCount(newsUrl):
        """Print the click count and the news number of one article.

        The article id is taken from ``newsUrl`` (the last path component
        before ``.html``) and used to query the click-count API, instead of
        always querying one hard-coded id.

        Raises ValueError if ``newsUrl`` does not contain ``_<...>.html``.
        """
        match = re.search(r'_(.*)\.html', newsUrl)
        if match is None:
            raise ValueError('cannot extract news number from %r' % (newsUrl,))
        news_number = match.group(1)                # e.g. '0404/9183'
        news_id = news_number.rsplit('/', 1)[-1]    # e.g. '9183'

        clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(news_id)
        reply = requests.get(clickUrl, timeout=10).text
        # The count is taken from the argument of the last ``.html('...')``
        # call in the reply; the strips remove the surrounding quotes,
        # parentheses and semicolon.
        count = reply.split('.html')[-1].lstrip("('").rstrip("');")
        print("点击次数:", count)
        print('新闻编号:', news_number)


    def getNewDetail(newsUrl, limit=1):
        """Print the details of the news items listed on the page ``newsUrl``.

        For each ``<li>`` that contains a ``.news-list-title`` element, the
        linked article is downloaded once and its title, link, publish time,
        author, source, photographer, description, click count, news number
        and body text are printed.

        Args:
            newsUrl: URL of the news listing page to scrape.
            limit: Maximum number of news items to process.  Defaults to 1
                (only the first item); pass ``None`` to process every item.

        Raises:
            IndexError: if an article page lacks ``#content`` or ``.show-info``,
                or a list item lacks a link or ``.news-list-description``.
            ValueError: if the publish time or the article URL cannot be parsed.
        """
        listing = _fetch_soup(newsUrl)
        processed = 0
        for news in listing.select('li'):
            titles = news.select('.news-list-title')
            if not titles:
                continue  # not a news entry (e.g. a navigation item)
            if limit is not None and processed >= limit:
                break

            t = titles[0].text                          # title
            a = news.select('a')[0].attrs['href']       # link to the article
            description = news.select('.news-list-description')[0].text

            # Download the article page once and read everything from it.
            detail = _fetch_soup(a)
            content = detail.select('#content')[0].text
            info = detail.select('.show-info')[0].text

            dt = _parse_publish_time(info)
            author = _info_field(info, '作者:')
            source = _info_field(info, '来源:')
            photo = _info_field(info, '摄影:')

            print("新闻标题:", t)
            print("链接:", a)
            print("发布时间:", dt)
            print("作者:", author)
            print("来源:", source)
            print("摄影:", photo)
            print("描述:", description)
            getClickCount(a)
            print("正文:", content)
            processed += 1
    
    # Script entry point: scrape the campus-news index page at `newsurl`.
    getNewDetail(newsurl)
    

      

  • 相关阅读:
    vue @click.native和@click.stop和@click.self
    CSS改变图片颜色的filter(滤镜)属性
    iframe自适应内容高度
    python将两个列表对应成为字典
    Scrapy307重定向
    scrapy- invalid hostname: 'http'
    tensorflow2.0常用操作记录
    深度学习之Xavier初始化
    win10上tensorflow-gpu2.0安装完全指南
    如何使用Ubuntu/Linux系统远程连接Windows桌面
  • 原文地址:https://www.cnblogs.com/1996-yxl/p/8747213.html
Copyright © 2011-2022 走看看