  • Fetch all campus news

    1. Extract every news item from a single news-list page, wrapped as a function.

    2. Get the total number of news articles and work out the total number of list pages (a worked example of this calculation appears right after this list).

    3. Fetch the full details of every news item on every list page.
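
    For step 2, the number of list pages follows directly from the article count shown on the list page. A minimal sketch of that arithmetic, assuming 10 news items per list page as the full script below does (the total of 992 is only illustrative; the real value is read from the '.a1' element):

    # worked example for step 2: total pages from the article count
    listcount = 992                # illustrative total, e.g. parsed from "992条"
    pages = listcount // 10 + 1    # 992 // 10 + 1 = 100 list pages
    print(pages)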

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re
    # res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
    # res.encoding = 'utf-8'
    # soup = BeautifulSoup(res.text, 'html.parser')


    # get the click count of a news item
    def getNewsId(url):
        # extract the news id from the URL with a regular expression
        newsId = re.findall(r'\_(.*).html', url)[0][-4:]
        # build the Request URL of the click-count API
        clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
        clickRes = requests.get(clickUrl)
        # pull the click count out of the returned JavaScript snippet
        clickCount = int(re.search(r"hits'\).html\('(.*)'\);", clickRes.text).group(1))
        return clickCount


    # read the details of a single news item
    def getNewDetail(newsurl):
        resDescript = requests.get(newsurl)
        resDescript.encoding = "utf-8"
        soupDescript = BeautifulSoup(resDescript.text, 'html.parser')
        title = soupDescript.select('.show-title')[0].text
        info = soupDescript.select('.show-info')[0].text
        if info.find('作者') > 0:
            author = re.search(r'作者:((.{2,20}\s|.{2,20}、|.{2,20},){1,5})', info).group(1)
        else:
            author = 'none'
        if info.find('审核') > 0:
            right = re.search(r'审核:((.{2,20}\s|.{2,20}、|.{2,20},){1,5})', info).group(1)
        else:
            right = 'none'
        if info.find('来源') > 0:
            source = re.search(r'来源:((.{2,50}\s|.{2,50}、|.{2,50},){1,5})', info).group(1)
        else:
            source = 'none'
        if info.find('摄影') > 0:
            video = re.search(r'摄影:((.{2,50}\s|.{2,50}、|.{2,50},){1,5})', info).group(1)
        else:
            video = 'none'
        # author = re.search(r'作者:((.{2,20}\s|.{2,20}、|.{2,20},){1,5})', info).group(1)
        # right = re.search(r'审核:(.*)\xa0\xa0来源:', info).group(1)
        # source = re.search(r'来源:(.*)\xa0\xa0\xa0\xa0摄影:', info).group(1)
        # video = re.search(r'摄影:(.*)\xa0\xa0\xa0\xa0点击:', info).group(1)
        dt = datetime.strptime(info.lstrip('发布时间:')[0:19], '%Y-%m-%d %H:%M:%S')
        content = soupDescript.select('.show-content')[0].text.strip()
        click = getNewsId(newsurl)
        # print(click, title, newsurl, source, dt)
        print('Published: {0}\nAuthor: {1}\nReviewer: {2}\nSource: {3}\nPhoto: {4}\nClicks: {5}'
              .format(dt, author, right, source, video, click))


    # extract every news item from one list page
    def getListPage(listPageUrl):
        res1 = requests.get(listPageUrl)
        res1.encoding = 'utf-8'
        soup = BeautifulSoup(res1.text, 'html.parser')
        for news in soup.select('li'):
            if len(news.select('.news-list-title')) > 0:
                a = news.select('a')[0].attrs['href']
                getNewDetail(a)


    resn = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
    resn.encoding = 'utf-8'
    soupn = BeautifulSoup(resn.text, 'html.parser')
    # total number of news articles (strip the trailing '条' from the '.a1' text)
    listcount = int(soupn.select('.a1')[0].text.rstrip('条'))
    print(listcount)
    # total number of list pages
    n = int(soupn.select('.a1')[0].text.rstrip('条')) // 10 + 1

    # first page
    # getListPage('http://news.gzcc.cn/html/xiaoyuanxinwen/')

    # last page
    for i in range(n, n + 1):
        pageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        getListPage(pageUrl)
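
    The least obvious parse in the script above is the click count: judging from the regex in getNewsId, the op=count endpoint returns a small jQuery snippet such as $('#hits').html('5423'); rather than JSON. A minimal standalone sketch of that parse, with an illustrative response body (the exact shape of the real response is an assumption):

    import re

    # illustrative response from the click-count API (assumed shape)
    sample = "$('#todaydowns').html('12');$('#hits').html('5423');"
    hits = int(re.search(r"hits'\).html\('(.*)'\);", sample).group(1))
    print(hits)  # 5423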

    4. Pick a topic you are interested in, crawl its data, and run word-segmentation analysis on it. (Here: Tencent Sports - NBA)

    import requests
    from bs4 import BeautifulSoup
    import re
    import jieba


    # fetch one article, segment it with jieba and print the 20 most frequent words
    def getnewsdetail(newsurl):
        resDescript = requests.get(newsurl)
        resDescript.encoding = "utf-8"
        soupDescript = BeautifulSoup(resDescript.text, 'html.parser')
        content = soupDescript.select('.text')[0].text.strip()
        words = jieba.lcut(content)
        wcdict = {}
        for i in set(words):
            wcdict[i] = words.count(i)
        # stopwords and punctuation to drop from the counts (extend as needed)
        delete = {'已经', '没有', '他们', '什么', '一个', ' ', '-', '\n', '.'}
        for i in delete:
            if i in wcdict:
                del wcdict[i]
        sort_word = sorted(wcdict.items(), key=lambda d: d[1], reverse=True)  # sort by frequency
        for i in range(min(20, len(sort_word))):  # print the top 20 words
            print(sort_word[i])


    # extract the title and link of every news item on one list page
    def getnewslist(newsurl):
        res = requests.get(newsurl)
        res.encoding = 'gbk'
        soup = BeautifulSoup(res.text, 'html.parser')
        for newsList in soup.select('.list01')[0].select('li'):
            title = newsList.select('a')[0].text
            newsurl = newsList.select('a')[0]['href']
            print('\nTitle: {0}\nURL: {1}\n'.format(title, newsurl))
            getnewsdetail(newsurl)


    url = "http://sports.qq.com/l/basket/original/qqinterview/list20150821155646.htm"

    # page 1 uses the base URL; later pages append _<page number>
    for i in range(1, 30):
        if i == 1:
            getnewslist(url)
        else:
            newsurl = "http://sports.qq.com/l/basket/original/qqinterview/list20150821155646_{}.htm".format(i)
            getnewslist(newsurl)
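
    The word count above builds a dict and calls words.count(i) once per distinct word, which is quadratic in the article length. collections.Counter does the same frequency count in a single pass; the helper below is a sketch of an alternative (top_words and STOPWORDS are names introduced here, not part of the original script):

    from collections import Counter

    import jieba

    # stopwords and punctuation to skip, mirroring the delete set above
    STOPWORDS = {'已经', '没有', '他们', '什么', '一个', ' ', '-', '\n', '.'}

    def top_words(content, k=20):
        # segment with jieba, drop stopwords, and count the rest in one pass
        counts = Counter(w for w in jieba.lcut(content) if w not in STOPWORDS)
        return counts.most_common(k)

    Counter.most_common(k) replaces the manual sorted(...) step and also copes with articles that yield fewer than 20 distinct words.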