zoukankan      html  css  js  c++  java
  • 爬取校园新闻首页的新闻

    from datetime import datetime

    import requests
    from bs4 import BeautifulSoup
    
    # Download the campus-news index page and parse it into `soup`, which the
    # loops further down in this script reuse.
    url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    res = requests.get(url)
    # Set the encoding explicitly so res.text is decoded as UTF-8.
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    # First pass: for every <li> that contains a '.news-list-title' node,
    # print the raw match list, then the info text, title and link.
    for news in soup.select('li'):
        titles = news.select('.news-list-title')
        if not titles:
            continue  # this <li> is not a news entry
        print(titles)
        t = titles[0].text
        # First child of the info node; presumably the publication date.
        dc = news.select('.news-list-info')[0].contents[0].text
        a = news.select('a')[0].attrs['href']
        # The original printed the undefined name `dt` here (NameError);
        # the value extracted above is `dc`.
        print(dc, t, a)
    def _labelled_value(text, label):
        """Return the value that directly follows *label* in *text*.

        The value is the remainder of the whitespace-delimited token that
        starts with *label*. Returns None when *label* does not occur.

        The label is removed by slicing rather than ``str.lstrip(label)``:
        ``lstrip`` treats its argument as a set of characters and would also
        eat leading characters of the value that happen to be in the label.
        """
        pos = text.find(label)
        if pos == -1:
            return None
        # The label contains no whitespace, so the first token begins with it.
        token = text[pos:].split()[0]
        return token[len(label):]

    # Second pass: open the detail page of the first news entry only (note the
    # `break` at the end) and print its body text and metadata fields.
    for news in soup.select('li'):
        titles = news.select('.news-list-title')
        # The original called the undefined name `c(...)` here; `len` was meant.
        if len(titles) > 0:
            t = titles[0].text
            dc = news.select('.news-list-info')[0].contents[0].text
            a = news.select('a')[0].attrs['href']
            print(dc, t, a)

            # Fetch and parse the article page the entry links to.
            res2 = requests.get(a)
            res2.encoding = 'utf-8'
            soup2 = BeautifulSoup(res2.text, 'html.parser')
            t1 = soup2.select('#content')[0].text
            print(t1)

            # '.show-info' holds the metadata line that the fields below are
            # extracted from.
            ifd = soup2.select('.show-info')[0].text

            # The timestamp is the 19 characters ('YYYY-MM-DD HH:MM:SS') after
            # the label; fall back to the start of the text if the label is
            # absent, as the original did.
            time_label = '发布时间:'
            time_pos = ifd.find(time_label)
            if time_pos != -1:
                dt2 = ifd[time_pos + len(time_label):][:19]
            else:
                dt2 = ifd[:19]
            print(dt2)

            # Optional fields: printed only when their label is present.
            # `find` returns -1 when missing, so a match at index 0 is valid.
            for field_label in ('作者:', '来源:', '摄影:'):
                value = _labelled_value(ifd, field_label)
                if value is not None:
                    print(value)

            dtn = datetime.strptime(dt2, '%Y-%m-%d %H:%M:%S')
            print(dtn)
            break
    # Third pass: for every news entry on the index page print its date,
    # title, link and source, each followed by a newline argument.
    for news in soup.select('li'):
        titles = news.select('.news-list-title')
        if not titles:
            continue  # this <li> is not a news entry
        info = news.select('.news-list-info')[0]
        title = titles[0].text
        # Named `link` so the index-page `url` is not overwritten.
        link = news.select('a')[0]['href']
        # First child of the info node is parsed as a 'YYYY-MM-DD' date.
        dt = datetime.strptime(info.contents[0].text, '%Y-%m-%d')
        # Second child of the info node; presumably the news source.
        source = info.contents[1].text
        # The original string literals were split across physical lines
        # (a SyntaxError); they are restored as '\n' escapes.
        print(dt, '\n', title, '\n', link, '\n', source, '\n')

  • 相关阅读:
    Bit Manipulation
    218. The Skyline Problem
    Template : Two Pointers & Hash -> String process
    239. Sliding Window Maximum
    159. Longest Substring with At Most Two Distinct Characters
    3. Longest Substring Without Repeating Characters
    137. Single Number II
    142. Linked List Cycle II
    41. First Missing Positive
    260. Single Number III
  • 原文地址:https://www.cnblogs.com/129lai/p/8696451.html
Copyright © 2011-2022 走看看