  • Fetch all campus news

    1. Wrap the extraction of all news items from a single list page into a function.

    2. Get the total number of news articles and compute the total number of list pages.

    3. Fetch the details of every news item on every list page.

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re
    url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    
    
    # Pull the click count for one news item from the hit-count API.
    def getClick(newsUrl):
        # The news id is the numeric part of the URL, e.g. ..._0404/9183.html -> 9183.
        newId = re.search(r'_(.*)\.html', newsUrl).group(1).split('/')[-1]
        click = requests.get('http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newId))
        # The API answers with a JavaScript snippet, so strip the wrapper off the number.
        return click.text.split('.html')[-1].lstrip("('").rstrip("');")
    
    
    # Fetch the details of a single news item.
    def getNews(newsUrl):
        resd = requests.get(newsUrl)
        resd.encoding = 'utf-8'
        soupd = BeautifulSoup(resd.text, 'html.parser')
        title = soupd.select('.show-title')[0].text
        info = soupd.select('.show-info')[0].text
        # The info line starts with '发布时间:' (publish time) followed by a 19-char timestamp.
        dt = datetime.strptime(info.lstrip('发布时间:')[0:19], '%Y-%m-%d %H:%M:%S')
        author = check = sources = None  # defaults, since any field may be absent
        if '作者:' in info:  # author
            author = re.search(r'作者:((.{2,4}\s|.{2,4}、){1,3})', info).group(1)
        if '审核:' in info:  # reviewer
            check = re.search(r'审核:((.{2,4}\s){1,3})', info).group(1)
        if '来源:' in info:  # source
            sources = re.search(r'来源:((.{2,50}\s|.{2,50}、|.{2,50},){1,5})', info).group(1)
        content = soupd.select('.show-content')[0].text.strip()
        click = getClick(newsUrl)
        print(click, title, newsUrl, author, check, sources, dt)
    
    
    def getListPage(listPageUrl):
        res=requests.get(listPageUrl)
        res.encoding='utf-8'
        soup=BeautifulSoup(res.text,'html.parser')
        for news in soup.select('li'):
            if len(news.select('.news-list-title')) > 0:
                a = news.select('a')[0].attrs['href']
                getNews(a)
    
    # Compute the total page count from the article total, then walk every list page.
    resn = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
    resn.encoding = 'utf-8'
    soupn = BeautifulSoup(resn.text, 'html.parser')
    # '.a1' holds the article total (e.g. '共xxxx条'); there are 10 items per page.
    n = int(soupn.select('.a1')[0].text.rstrip('条')) // 10 + 1
    getListPage('http://news.gzcc.cn/html/xiaoyuanxinwen/')  # the first page has no number
    for i in range(2, n + 1):  # the remaining pages are numbered 2..n
        listPageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        getListPage(listPageUrl)
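
    The hit-count endpoint answers with a small JavaScript snippet rather than JSON, which is why getClick peels the number out with string operations instead of a JSON parser. A minimal sketch of that parsing step, run against a made-up sample response (the exact wrapper text is an assumption for illustration):

    # Hypothetical sample of the count API's JSONP-style reply; the real
    # wrapper text may differ, but the trailing .html('NNN'); shape is what
    # the split/strip chain in getClick relies on.
    sample = "$('#todaydowns').html('12');$('#hits').html('3456');"
    clicks = sample.split('.html')[-1].lstrip("('").rstrip("');")
    print(clicks)  # -> 3456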

    4. Pick a topic you are interested in, crawl its data, and run word-segmentation analysis on it. It must not duplicate another student's work.

    I chose the official announcements page of a mobile game I play. The announcements contain images but I only crawl the text, so the output has large blank stretches.

    import requests, re, jieba
    from bs4 import BeautifulSoup
    from datetime import datetime
    
    # Fetch the details of one announcement.
    def getNewsDetail(newsUrl):
        resd = requests.get(newsUrl)
        resd.encoding = 'gb2312'
        soupd = BeautifulSoup(resd.text, 'html.parser')
        # print(soupd.text)  # debug only: dumps the whole page
        content = soupd.select('.artText')[0].text
        info = soupd.select('.artDate')[0].text
        title = soupd.select('h1')[0].text
        date = re.search(r'(\d{4}-\d{2}-\d{2})', info).group(1)  # pull the date string out of the info line
        dateTime = datetime.strptime(date, '%Y-%m-%d')  # convert the date string to a datetime
        keyWords = getKeyWords(content)
        print('Title: {}'.format(title))
        print('Published: {}'.format(dateTime))
        print('Keywords: {}, {}, {}'.format(keyWords[0], keyWords[1], keyWords[2]))
        print(content)
    
    # Segment the text with jieba and pick the top keywords by frequency.
    def getKeyWords(content):
        # Keep only CJK characters, joined into punctuation-free text.
        content = ''.join(re.findall(r'[\u4e00-\u9fa5]', content))
        wordSet = set(jieba.lcut(content))
        wordDict = {}
        for i in wordSet:
            wordDict[i] = content.count(i)
        deleteList, keyWords = [], []
        for i in wordDict.keys():
            if len(i) < 2:
                deleteList.append(i)  # drop meaningless single-character tokens
        for i in deleteList:
            del wordDict[i]
        dictList = list(wordDict.items())
        dictList.sort(key=lambda item: item[1], reverse=True)  # sort by frequency, descending
        for i in range(3):  # keep the three most frequent words
            keyWords.append(dictList[i][0])
        return keyWords
    
    # Fetch one list page of announcements.
    def getListPage(listUrl):
        res = requests.get(listUrl)
        res.encoding = 'gbk'
        soup = BeautifulSoup(res.text, 'html.parser')
        for new in soup.select('.txt-list'):
            newsUrl = new.select('a')[0]['href']
            t = 'http:' + newsUrl  # the links are protocol-relative
            print(t)
            getNewsDetail(t)  # fetch the announcement details
            break  # only fetch one item per page; remove the break to process the whole page
    
    for i in range(1, 31):
        listUrl = 'http://stzb.163.com/news/index_{}.html'.format(i)
        getListPage(listUrl)
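
    The frequency counting above (a set of tokens, content.count, then a manual sort) can be collapsed into one call with jieba's built-in TF-IDF extractor. A minimal alternative sketch, assuming the same top-3 behavior is wanted; extract_tags also tends to filter single-character noise on its own:

    import jieba.analyse

    def getKeyWordsTfidf(content):
        # extract_tags returns the topK tokens ranked by TF-IDF weight,
        # replacing the manual set/count/sort pipeline above.
        return jieba.analyse.extract_tags(content, topK=3)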
