zoukankan      html  css  js  c++  java
  • 获取全部校园新闻

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re
    def get_soup(url, timeout=10):
        """Download ``url`` and return it parsed as a BeautifulSoup document.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up. Without
                an explicit timeout ``requests.get`` may wait indefinitely on
                a stalled connection.

        Returns:
            A ``BeautifulSoup`` tree built with the ``html.parser`` backend
            from the response body decoded as UTF-8.

        Raises:
            requests.exceptions.RequestException: if the request fails or
                times out.
        """
        response = requests.get(url, timeout=timeout)
        # Force UTF-8 rather than relying on the encoding requests guesses.
        response.encoding = 'utf-8'
        return BeautifulSoup(response.text, 'html.parser')
    
    def getDownNum(urls):
        """Return the click count of one news article, as a string.

        The article id is taken from the file name at the end of the article
        URL and sent to the counter endpoint on ``oa.gzcc.cn``. The count is
        then read out of the ``('#hits').html('<count>');`` fragment of the
        reply.

        Args:
            urls: URL of a single article, e.g.
                ``http://news.gzcc.cn/html/2018/xiaoyuanxinwen_0404/9183.html``.

        Returns:
            The text found between the quotes of the ``#hits`` fragment.

        Raises:
            ValueError: if the reply contains no ``#hits`` fragment.
            requests.exceptions.RequestException: if the request fails or
                times out.
        """
        # Use the last path segment without its extension ('9183.html' ->
        # '9183'). The previous pattern hard-coded the year 2018 and therefore
        # failed for articles published in any other year.
        html_id = urls.rstrip('/').split('/')[-1]
        if html_id.endswith('.html'):
            html_id = html_id[:-len('.html')]

        down_url = 'http://oa.gzcc.cn/api.php?op=count&id=' + html_id + '&modelid=80'
        reqd = requests.get(down_url, timeout=10)

        # Parentheses and the dot must be escaped to match the literal text
        # ('#hits').html('...'); unescaped they act as regex groups and the
        # pattern never matches. Non-greedy so only this one value is captured.
        match = re.search(r"\('#hits'\)\.html\('(.*?)'\);", reqd.text)
        if match is None:
            raise ValueError('no #hits counter found in reply from ' + down_url)
        return match.group(1)
    
    def getNewInfo(pageurl):
        """Print every news item linked from one list page.

        For each ``<li>`` on ``pageurl`` that contains a ``.news-list-text``
        element, the linked article is downloaded and these are printed to
        stdout, in order: the title, the link, each non-blank piece of the
        article's ``.show-info`` line (the last piece followed by the click
        count), and the text of the ``#content`` element.

        Args:
            pageurl: URL of a news list page.

        Returns:
            None.
        """
        soup = get_soup(pageurl)
        for item in soup.select('li'):
            text_nodes = item.select('.news-list-text')
            if not text_nodes:
                # This <li> carries no news text block; nothing to print.
                continue

            title = text_nodes[0].select('.news-list-title')[0].text
            link = item.a.attrs['href']

            article = get_soup(link)
            content = article.select('#content')[0].text
            # Split on two non-breaking spaces. The separator had lost its
            # backslashes ("xa0xa0"), which matched the literal letters
            # instead and left the info line unsplit.
            info = article.select('.show-info')[0].text.split("\xa0\xa0")
            down_num = getDownNum(link)

            print('标题:' + title)
            print('链接:' + link)
            last = len(info) - 1
            for j, piece in enumerate(info):
                if len(piece) > 0 and piece != ' ':
                    if j != last:
                        print(piece)
                    else:
                        # NOTE(review): the original called rstrip('') here and
                        # printed a trailing '' - both no-ops; presumably a unit
                        # suffix was lost from the source. Output kept as-is.
                        print(piece, down_num, '')
            print(content)
    
    def getPageNum(url):
        """Return the number shown in the page's first ``.a1`` element, divided by 10.

        Args:
            url: URL of a news list page.

        Returns:
            The first run of digits in the ``.a1`` text, integer-divided by 10.
            Presumably the element holds the total item count and a page lists
            10 items - TODO confirm against the site.

        Raises:
            IndexError: if the page has no ``.a1`` element.
            ValueError: if the ``.a1`` text contains no digits.
        """
        newsoup = get_soup(url)
        label = newsoup.select('.a1')[0].text
        # Pull the digits out explicitly: the previous rstrip('') removed
        # nothing, so int() failed on any text that was not purely numeric.
        match = re.search(r'\d+', label)
        if match is None:
            raise ValueError('no number found in .a1 text: %r' % label)
        # Floor division keeps the arithmetic in integers.
        return int(match.group()) // 10
    
    # Script driver: read the page count from the section index, then print
    # the news of every list page.
    n = getPageNum('http://news.gzcc.cn/html/xiaoyuanxinwen/')


    # Index 0 is the section's index page itself; every other index i maps to
    # '<i>.html' under the same directory.
    for i in range(n + 2):
        suffix = '' if i == 0 else str(i) + '.html'
        getNewInfo('http://news.gzcc.cn/html/xiaoyuanxinwen/' + suffix)
  • 相关阅读:
    Impala服务JVM崩溃问题
    Impala编译部署-6集群部署
    Impala编译部署-5单机部署-2
    Impala编译部署-5单机部署-1
    Impala编译部署-4
    Impala编译部署-3
    Impala编译部署-2
    Impala编译部署-1
    工作转向Kudu
    python 屏幕录制改进版,无opencv黑框显示
  • 原文地址:https://www.cnblogs.com/RE148/p/8795775.html
Copyright © 2011-2022 走看看