  • Crawl all campus news

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import locale
    import re
    locale.setlocale(locale.LC_CTYPE, 'chinese')  # Windows-only; safe to drop on other platforms

    def getClickCount(newsUrl):
        # extract the news ID from the detail-page URL with a regular expression
        newsId = re.findall(r'_(.*)\.html', newsUrl)[0].split('/')[1]
        clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
        clickStr = requests.get(clickUrl).text
        # the endpoint returns a JS snippet like $('#hits').html('...'); grab the quoted number
        return re.search(r"hits'\)\.html\('(.*)'\);", clickStr).group(1)
    
    def getNewDetail(newsUrl):
        resd = requests.get(newsUrl)  # fetch the detail page
        resd.encoding = 'utf-8'
        soupd = BeautifulSoup(resd.text, 'html.parser')
        print('Title: ' + soupd.select('.show-title')[0].text)
        # print('Description: ' + soupd.select('.show-description')[0].text)
        print('Link: ' + newsUrl)
        info = soupd.select('.show-info')[0].text
        # the patterns below match the Chinese field labels in the page's info line
        time = re.search('发布时间:(.*) \xa0\xa0 \xa0\xa0作者:', info).group(1)
        dtime = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
        print('Published: {}'.format(dtime))
        print('Author: ' + re.search('作者:(.*)审核:', info).group(1))
        # print('Reviewer: ' + re.search('审核:(.*)来源:', info).group(1))
        # print('Source: ' + re.search('来源:(.*)摄影:', info).group(1))
        # print('Photographer: ' + re.search('摄影:(.*)点击', info).group(1))
        clickCount = getClickCount(newsUrl)
        print('Hits: ' + clickCount)
        # print(soupd.select('.show-content')[0].text)  # article body
    
    def getListPage(ListPageUrl):
        res = requests.get(ListPageUrl)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        # print(soup.select('li'))
        # each news item is an <li> whose headline carries the .news-list-title class
        for news in soup.select('li'):
            if len(news.select('.news-list-title')) > 0:
                a = news.a.attrs['href']
                getNewDetail(a)
    
    firstUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    print('Page 1:')
    getListPage(firstUrl)
    res = requests.get(firstUrl)
    res.encoding = 'utf-8'
    soupn = BeautifulSoup(res.text, 'html.parser')
    # the .a1 element holds the total article count, e.g. '9183条'; ten articles per page
    n = int(soupn.select('.a1')[0].text.rstrip('条')) // 10 + 1

    for i in range(2, n):
        pageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        print('Page {}:'.format(i))
        getListPage(pageUrl)
        break  # remove this break to crawl every remaining page
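
The click count never appears in the article HTML itself; it comes from a separate counter endpoint, which is why getClickCount requests api.php and regex-matches the .html('...') call in the returned JavaScript. A minimal sketch of both extractions on sample inputs (the URL and response string below are illustrative assumptions, not captured output):

    import re

    # hypothetical detail-page URL in the usual gzcc.cn form
    sampleUrl = 'http://news.gzcc.cn/html/2018/xiaoyuanxinwen_0404/9183.html'
    newsId = re.findall(r'_(.*)\.html', sampleUrl)[0].split('/')[1]
    print(newsId)   # 9183

    # hypothetical counter response; only the quoted number matters
    sampleResp = "$('#hits').html('271');"
    print(re.search(r"hits'\)\.html\('(.*)'\);", sampleResp).group(1))   # 271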


    import requests
    from bs4 import BeautifulSoup

    def getHTMLText(url):
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()  # raise for 4xx/5xx responses
            return r.text
        except requests.RequestException:
            return ""
    def getContent(url):
        html = getHTMLText(url)
        soup = BeautifulSoup(html, "html.parser")
        title = soup.select("div.hd > h1")  # headline
        print(title[0].get_text())
        time = soup.select("div.a_Info > span.a_time")  # publish time
        print(time[0].string)
        author = soup.select("div.qq_articleFt > div.qq_toolWrap > div.qq_editor")  # editor line
        print(author[0].get_text())
        paras = soup.select("div.Cnt-Main-Article-QQ > p.text")  # body paragraphs
        for para in paras:
            if len(para) > 0:
                print(para.get_text())
                print()
        # write the article out to a text file
        fo = open("text.txt", "w+", encoding="utf-8")
        fo.writelines(title[0].get_text() + "\n")
        fo.writelines(time[0].get_text() + "\n")
        for para in paras:
            if len(para) > 0:
                fo.writelines(para.get_text() + "\n\n")
        fo.writelines(author[0].get_text() + '\n')
        fo.close()
        article = {
            'Title' : title[0].get_text(),
            'Time' : time[0].get_text(),
            'Paragraph' : paras,
            'Author' : author[0].get_text()
        }
        print(article)
    def main():
        url = "http://news.qq.com/a/20170504/012032.htm"
        getContent(url)

    main()
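
The write-out above leaks the file handle if any selector comes back empty and an exception fires mid-write. A context-manager variant of the same output logic, as a sketch (the saveArticle name and plain-string parameters are assumptions, not part of the original script):

    # Sketch: same "text.txt" output, but the with block guarantees the file
    # is closed even if an exception interrupts the loop.
    def saveArticle(title, time, paras, author, path="text.txt"):
        with open(path, "w", encoding="utf-8") as fo:
            fo.write(title + "\n")
            fo.write(time + "\n")
            for para in paras:
                fo.write(para + "\n\n")
            fo.write(author + "\n")

    # usage with the values getContent extracts:
    # saveArticle(title[0].get_text(), time[0].get_text(),
    #             [p.get_text() for p in paras], author[0].get_text())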
  • Original post: https://www.cnblogs.com/lk666/p/8798655.html