zoukankan      html  css  js  c++  java
  • 爬取新闻列表

    • 获取单条新闻的#标题#链接#时间#来源#内容 #点击次数,并包装成一个函数。
      import requests
      from bs4 import BeautifulSoup
      import re
      
      # Landing page of the campus-news list; every article link is scraped
      # starting from this index page.
      url_main="http://news.gzcc.cn/html/xiaoyuanxinwen/"
      res = requests.get(url_main)
      res.encoding = 'utf-8'  # site serves UTF-8; set before accessing res.text
      
      soup = BeautifulSoup(res.text,'html.parser')
      # Every news entry sits inside an <li>; non-news <li> elements are
      # filtered out later by checking for a .news-list-title child.
      li = soup.select('li')
      
      def gethits(url_1):
          """Return the click count (as a string) for a news article URL.

          The article id is the trailing ``<id>.html`` segment of *url_1*;
          the OA click-count API answers with a JSONP payload ending in
          ``...html('<hits>');`` from which the number is peeled out.
          """
          li_id = re.search(r'_.*/(.*).html', url_1).group(1)
          # Readability rewrite of the original's obfuscated triple-quoted
          # literals: rstrip("');") / lstrip("'html(") are the identical
          # character sets, so stripping behavior is unchanged.
          hits = requests.get('http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'
                              .format(li_id)).text.split('.')[-1].rstrip("');").lstrip("'html(")
          return hits
      
      def getpageinfo(label):
          """Print the details of the FIRST news item found in *label*.

          *label* is a list of <li> tags; only items containing a
          .news-list-title child are real news entries, the rest are
          navigation and get skipped.  Prints time, title, link, source,
          click count and body, then stops (demo: one item only).
          """
          for title_list in label:
              if len(title_list.select('.news-list-title')) > 0:
                  href = title_list.select('a')[0]['href']
                  title = title_list.select('.news-list-title')[0].text
                  time = title_list.select('span')[0].text
                  info = title_list.select('span')[1].text

                  # Fetch the article page itself for the full body text.
                  res_list = requests.get(href)
                  res_list.encoding = 'utf-8'
                  soup_list = BeautifulSoup(res_list.text, 'html.parser')
                  text_list = soup_list.select('.show-content')[0].text

                  hits_list = gethits(href)

                  # BUG FIX: the scraped original had literal line breaks inside
                  # single-quoted strings (a SyntaxError); restored as '\n' escapes.
                  print('时间:', time, '\n标题:', title, '\n链接:', href,
                        '\n来源:', info, '\n点击次数:', hits_list, '\n')
                  print('正文:', text_list)
                  break  # only the first news item in this demo step
      
      getpageinfo(li)  # run the scraper on the landing page's list items

    • 获取一个新闻列表页的所有新闻的上述详情,并包装成一个函数。
    import requests
    from bs4 import BeautifulSoup
    import re
    
    # Landing page of the campus-news list; every article link is scraped
    # starting from this index page.
    url_main="http://news.gzcc.cn/html/xiaoyuanxinwen/"
    res = requests.get(url_main)
    res.encoding = 'utf-8'  # site serves UTF-8; set before accessing res.text
    
    soup = BeautifulSoup(res.text,'html.parser')
    # Every news entry sits inside an <li>; non-news <li> elements are
    # filtered out later by checking for a .news-list-title child.
    li = soup.select('li')
    
    def gethits(url_1):
        """Return the click count (as a string) for a news article URL.

        The article id is the trailing ``<id>.html`` segment of *url_1*;
        the OA click-count API answers with a JSONP payload ending in
        ``...html('<hits>');`` from which the number is peeled out.
        """
        li_id = re.search(r'_.*/(.*).html', url_1).group(1)
        # Readability rewrite of the original's obfuscated triple-quoted
        # literals: rstrip("');") / lstrip("'html(") are the identical
        # character sets, so stripping behavior is unchanged.
        hits = requests.get('http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'
                            .format(li_id)).text.split('.')[-1].rstrip("');").lstrip("'html(")
        return hits
    
    def getpageinfo(label):
        """Print the details of EVERY news item found in *label*.

        *label* is a list of <li> tags; only items containing a
        .news-list-title child are real news entries, the rest are
        navigation and get skipped.  For each entry prints time, title,
        link, source, click count and body.
        """
        for title_list in label:
            if len(title_list.select('.news-list-title')) > 0:
                href = title_list.select('a')[0]['href']
                title = title_list.select('.news-list-title')[0].text
                time = title_list.select('span')[0].text
                info = title_list.select('span')[1].text

                # Fetch the article page itself for the full body text.
                res_list = requests.get(href)
                res_list.encoding = 'utf-8'
                soup_list = BeautifulSoup(res_list.text, 'html.parser')
                text_list = soup_list.select('.show-content')[0].text

                hits_list = gethits(href)

                # BUG FIX: the scraped original had literal line breaks inside
                # single-quoted strings (a SyntaxError); restored as '\n' escapes.
                print('时间:', time, '\n标题:', title, '\n链接:', href,
                      '\n来源:', info, '\n点击次数:', hits_list, '\n')
                print('正文:', text_list)
                
    getpageinfo(li)  # scrape every news item on the landing page

     

    • 获取所有新闻列表页的网址,调用上述函数
    import requests
    from bs4 import BeautifulSoup
    import re
    
    # Landing page of the campus-news list; every article link is scraped
    # starting from this index page.
    url_main="http://news.gzcc.cn/html/xiaoyuanxinwen/"
    res = requests.get(url_main)
    res.encoding = 'utf-8'  # site serves UTF-8; set before accessing res.text
    
    soup = BeautifulSoup(res.text,'html.parser')
    # Every news entry sits inside an <li>; non-news <li> elements are
    # filtered out later by checking for a .news-list-title child.
    li = soup.select('li')
    
    def gethits(url_1):
        """Return the click count (as a string) for a news article URL.

        The article id is the trailing ``<id>.html`` segment of *url_1*;
        the OA click-count API answers with a JSONP payload ending in
        ``...html('<hits>');`` from which the number is peeled out.
        """
        li_id = re.search(r'_.*/(.*).html', url_1).group(1)
        # Readability rewrite of the original's obfuscated triple-quoted
        # literals: rstrip("');") / lstrip("'html(") are the identical
        # character sets, so stripping behavior is unchanged.
        hits = requests.get('http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'
                            .format(li_id)).text.split('.')[-1].rstrip("');").lstrip("'html(")
        return hits
    
    def getpageinfo(label):
        """Fetch details for every news item in *label* (list of <li> tags).

        NOTE(review): this variant collects href/title/time/source, downloads
        each article page and queries the click count, but then discards all
        of it -- the print statements present in the other snippets are
        missing here, presumably stripped when the blog post was captured.
        Confirm against the original post before relying on its output.
        """
        for title_list in label:
            # Only real news entries carry a .news-list-title child.
            if len(title_list.select('.news-list-title'))>0:
                href = title_list.select('a')[0]['href']
                title = title_list.select('.news-list-title')[0].text
                time = title_list.select('span')[0].text
                info = title_list.select('span')[1].text
    
                # Fetch the article page itself for the full body text.
                res_list = requests.get(href)
                res_list.encoding = 'utf-8'
                soup_list = BeautifulSoup(res_list.text,'html.parser')
                text_list = soup_list.select('.show-content')[0].text
    
                hits_list = gethits(href)
    
    
    getpageinfo(li)  # crawl the first (index) page
    
    # .a1 holds the total article count, e.g. "1234条"; 10 items per page.
    # NOTE(review): the scraped source showed rstrip('') (a no-op) -- the
    # intended suffix to strip is almost certainly '条'; harmless if absent.
    pages = int(soup.select('.a1')[0].text.rstrip('条'))//10+1
    
    for i in range(2, pages+1):
        url_page = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(i)
    
        res_page = requests.get(url_page)
        res_page.encoding = 'utf-8'
    
        soup_page = BeautifulSoup(res_page.text, 'html.parser')
        # BUG FIX: the original selected from `soup` (page 1) instead of
        # `soup_page`, so every iteration re-processed the first page's list.
        list_page = soup_page.select('li')
    
        getpageinfo(list_page)
        print(url_page)

    • 完成所有校园新闻的爬取工作。
    import requests
    from bs4 import BeautifulSoup
    import re
    
    # Landing page of the campus-news list; every article link is scraped
    # starting from this index page.
    url_main="http://news.gzcc.cn/html/xiaoyuanxinwen/"
    res = requests.get(url_main)
    res.encoding = 'utf-8'  # site serves UTF-8; set before accessing res.text
    
    soup = BeautifulSoup(res.text,'html.parser')
    # Every news entry sits inside an <li>; non-news <li> elements are
    # filtered out later by checking for a .news-list-title child.
    li = soup.select('li')
    
    def gethits(url_1):
        """Return the click count (as a string) for a news article URL.

        The article id is the trailing ``<id>.html`` segment of *url_1*;
        the OA click-count API answers with a JSONP payload ending in
        ``...html('<hits>');`` from which the number is peeled out.
        """
        li_id = re.search(r'_.*/(.*).html', url_1).group(1)
        # Readability rewrite of the original's obfuscated triple-quoted
        # literals: rstrip("');") / lstrip("'html(") are the identical
        # character sets, so stripping behavior is unchanged.
        hits = requests.get('http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'
                            .format(li_id)).text.split('.')[-1].rstrip("');").lstrip("'html(")
        return hits
    
    def getpageinfo(label):
        """Print the details of EVERY news item found in *label*.

        *label* is a list of <li> tags; only items containing a
        .news-list-title child are real news entries, the rest are
        navigation and get skipped.  For each entry prints time, title,
        link, source, click count and body.
        """
        for title_list in label:
            if len(title_list.select('.news-list-title')) > 0:
                href = title_list.select('a')[0]['href']
                title = title_list.select('.news-list-title')[0].text
                time = title_list.select('span')[0].text
                info = title_list.select('span')[1].text

                # Fetch the article page itself for the full body text.
                res_list = requests.get(href)
                res_list.encoding = 'utf-8'
                soup_list = BeautifulSoup(res_list.text, 'html.parser')
                text_list = soup_list.select('.show-content')[0].text

                hits_list = gethits(href)

                # BUG FIX: the scraped original had literal line breaks inside
                # single-quoted strings (a SyntaxError); restored as '\n' escapes.
                print('时间:', time, '\n标题:', title, '\n链接:', href,
                      '\n来源:', info, '\n点击次数:', hits_list, '\n')
                print('正文:', text_list)
                #break
    
    getpageinfo(li)  # crawl the first (index) page
    
    # .a1 holds the total article count, e.g. "1234条"; 10 items per page.
    # NOTE(review): the scraped source showed rstrip('') (a no-op) -- the
    # intended suffix to strip is almost certainly '条'; harmless if absent.
    pages = int(soup.select('.a1')[0].text.rstrip('条'))//10+1
    
    for i in range(2, pages+1):
        url_page = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(i)
    
        res_page = requests.get(url_page)
        res_page.encoding = 'utf-8'
    
        soup_page = BeautifulSoup(res_page.text, 'html.parser')
        # BUG FIX: the original selected from `soup` (page 1) instead of
        # `soup_page`, so every iteration re-processed the first page's list.
        list_page = soup_page.select('li')
    
        getpageinfo(list_page)
        #break

  • 相关阅读:
    Codeforces Round #646 (Div. 2) B. Subsequence Hate(前缀和/思维)
    Codeforces Round #646 (Div. 2) A. Odd Selection(思维/分类讨论)
    “科林明伦杯”哈尔滨理工大学第十届程序设计竞赛 A.点对最大值(树的直径/树形DP)
    “科林明伦杯”哈尔滨理工大学第十届程序设计竞赛(同步赛)(ABCEFHJ)
    Codeforces Round #645 (Div. 2) C. Celex Update(思维)
    Codeforces Round #645 (Div. 2) D. The Best Vacation(二分+前缀和)
    Codeforces Round #645 (Div. 2) B. Maria Breaks the Self-isolation(贪心)
    Codeforces Round #645 (Div. 2) A. Park Lighting
    ORM之SQLALchemy
    python--10--mysql(2)
  • 原文地址:https://www.cnblogs.com/zeson/p/7652866.html
Copyright © 2011-2022 走看看