zoukankan      html  css  js  c++  java
  • 爬取校园新闻首页的新闻的详情,使用正则表达式,函数抽离

    import re
    from datetime import datetime

    import requests
    from bs4 import BeautifulSoup
    
    # Campus-news scraper: fetch the list page, open the first news item's
    # detail page, and demonstrate parsing of the ".show-info" metadata line.

    LIST_URL = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
    REQUEST_TIMEOUT = 10  # seconds; prevents a stalled server from hanging the script
    TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    # Field labels that appear in the sample metadata lines used below.
    _INFO_LABELS = ("发布时间", "作者", "审核", "来源", "点击")
    # Accept both the ASCII and the full-width colon after a label.
    _COLON = "[:：]"
    _NEXT_LABEL = "(?:" + "|".join(_INFO_LABELS) + r")\s*" + _COLON
    _TIMESTAMP_RE = re.compile(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}")


    def fetch_soup(page_url):
        """Download *page_url* and return it parsed as a BeautifulSoup tree.

        The response is decoded as UTF-8 and parsed with "html.parser".

        Raises:
            requests.RequestException: on timeout, connection failure or an
                HTTP error status.
        """
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        response.encoding = "utf-8"
        return BeautifulSoup(response.text, "html.parser")


    def parse_publish_time(info):
        """Return the first "YYYY-MM-DD HH:MM:SS" substring of *info*, or None.

        A regex search is used instead of ``lstrip(...)[:19]`` so that leading
        whitespace or a different colon character cannot shift the slice.
        """
        match = _TIMESTAMP_RE.search(info)
        return match.group(0) if match else None


    def parse_field(info, label):
        """Return the value that follows ``label:`` in a metadata line.

        The value runs up to the next known label (see ``_INFO_LABELS``) or the
        end of the string, with surrounding whitespace removed.  Returns None
        when *label* is not present.

        ``str.lstrip("审核:")`` is deliberately not used: it strips a *set of
        characters*, so a value beginning with any of them would be truncated.
        """
        pattern = (
            re.escape(label) + r"\s*" + _COLON
            + r"\s*(.*?)\s*(?=" + _NEXT_LABEL + r"|\Z)"
        )
        match = re.search(pattern, info, re.S)
        return match.group(1) if match else None


    def show_first_news(list_soup):
        """Print the first news item of the list page and its detail page.

        Returns the publish-time string found on the detail page, or None when
        the list contains no news item or the detail page has no timestamp.
        """
        for news in list_soup.select("li"):
            if not news.select(".news-list-title"):
                continue  # skip <li> elements that are not news entries

            list_time = news.select(".news-list-info")[0].contents[0].text
            title = news.select(".news-list-title")[0].text
            description = news.select(".news-list-description")[0].text
            link = news.select("a")[0].attrs["href"]

            detail_soup = fetch_soup(link)
            content = detail_soup.select("#content")[0].text  # article body
            print(content)

            print(list_time, title, description, link)

            info = detail_soup.select(".show-info")[0].text
            print(info)
            return parse_publish_time(info)  # only the first item is processed
        return None


    def main():
        """Run the scrape, then the hard-coded metadata parsing examples."""
        date_time = show_first_news(fetch_soup(LIST_URL))

        info = '发布时间:2018-04-01 11:57:00      作者:陈流芳  审核:权麟春  来源:马克思主义学院      点击:次'
        print(parse_publish_time(info), parse_field(info, "审核"))

        info1 = '发布时间:2018-04-01 11:57:00      作者:陈流芳 许健杰  审核:权麟春   来源:马克思主义学院    点击:次 '
        # Second of the whitespace-separated author names.
        print(parse_field(info1, "作者").split()[1])

        now_time = datetime.now()
        # date_time is None when nothing was scraped; skip instead of crashing.
        if date_time is not None:
            print(datetime.strptime(date_time, TIME_FORMAT))
        # Backslashes are escaped explicitly; the output is still e.g. 2018\04\01.
        print(now_time.strftime("%Y\\%m\\%d"))


    if __name__ == "__main__":
        main()
  • 相关阅读:
    poj 2312 Battle City
    poj 2002 Squares
    poj 3641 Pseudoprime numbers
    poj 3580 SuperMemo
    poj 3281 Dining
    poj 3259 Wormholes
    poj 3080 Blue Jeans
    poj 3070 Fibonacci
    poj 2887 Big String
    poj 2631 Roads in the North
  • 原文地址:https://www.cnblogs.com/SOLARLKS/p/8719263.html
Copyright © 2011-2022 走看看