zoukankan      html  css  js  c++  java
  • 爬虫大作业

    import requests
    from bs4 import BeautifulSoup

    # Demo: fetch one sample news page and print the first <li> that carries
    # the news metadata list.
    url = 'http://news.sise.edu.cn/cms/6145.html'
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        # BUG fix: '.list-unstyled list-inline' is a descendant selector that
        # looks for a <list-inline> TAG inside .list-unstyled and never matches.
        # Both class names sit on the same element (Bootstrap style), so the
        # compound selector '.list-unstyled.list-inline' is intended.
        # NOTE(review): assumes class="list-unstyled list-inline" in the page
        # HTML — confirm against the live markup.
        if news.select('.list-unstyled.list-inline'):
            break
    print(news)
    def writeNewsDetail(content):
        """Append *content* to News.txt (UTF-8).

        :param content: text to append; no newline is added.
        """
        # 'with' guarantees the handle is closed even if write() raises,
        # unlike the original open()/write()/close() sequence.
        with open('News.txt', 'a', encoding='utf-8') as f:
            f.write(content)
    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re

    获取新闻点击次数

    def getClickCount(newsUrl):
        """Return the click count of one news article as an int.

        :param newsUrl: article URL of the form '.../cms/NNNN.html',
                        where NNNN is the numeric article id.
        """
        # Pull the numeric article id out of the URL.
        newId = re.search('cms(.*).html', newsUrl).group(1).split('/')[1]
        # BUG fix: the original never substituted newId into the template, so
        # it fetched the literal '...cms/{}.html' — and fetched it twice
        # (once discarded into an unused soup, once inside the return).
        url = 'http://news.sise.edu.cn/cms/{}.html'.format(newId)
        res = requests.get(url)
        res.encoding = 'utf-8'
        # NOTE(review): assumes the counter appears after the last '.html' in
        # the response text, wrapped like ...('<count>'); — confirm against
        # the live page before trusting this parse.
        return int(res.text.split('.html')[-1].lstrip("('").rstrip("');"))

    新闻内容

    def getNewsDetail(newsUrl):  # all the information of one news article
        """Fetch one news detail page and return a dict with keys
        'title', 'dt', 'content', 'click', 'newsUrl'.

        Returns None when the metadata list is absent from the page.
        """
        resd = requests.get(newsUrl)
        resd.encoding = 'utf-8'
        soupd = BeautifulSoup(resd.text, 'html.parser')  # parse the detail page

        news = {}
        news['title'] = soupd.select('.text-muted-5')[0].text
        # BUG fix: '.list-unstyled list-inline' searched for a <list-inline>
        # tag INSIDE .list-unstyled; both classes are on the same element, so
        # the compound selector is needed.
        info = soupd.select('.list-unstyled.list-inline')
        for infos in info:
            # BUG fix: the original called info.xpath(...) — BeautifulSoup
            # tags have no xpath() method, and it also iterated `infos` while
            # reading `info`. Take the 5th <li> of this metadata list instead.
            # NOTE(review): assumes li[5] holds 'HH:MM:SS YYYY-MM-DD' — confirm
            # against the live page markup.
            news['dt'] = datetime.strptime(infos.select('li')[4].text.strip(),
                                           '%H:%M:%S %Y-%m-%d')
            news['content'] = soupd.select('.MsoNormal')[0].text.strip()
            #writeNewsDetail(news['content'])
            # BUG fix: soupd.select('li[6]') is an invalid CSS attribute
            # selector; delegate to the sibling helper that parses the counter.
            news['click'] = getClickCount(newsUrl)
            news['newsUrl'] = newsUrl
            return news

    全部新闻列表

    def getListPage(pageUrl):  # all the news on one list page
        """Return the detail dict of every news item on one list page."""
        response = requests.get(pageUrl)
        response.encoding = 'utf-8'
        page = BeautifulSoup(response.text, 'html.parser')

        # Only <li> elements that contain a .media-body child are news
        # entries; the first <a> inside each carries the article link.
        return [
            getNewsDetail(item.select('a')[0].attrs['href'])
            for item in page.select('li')
            if len(item.select('.media-body')) > 0
        ]
    
    def getPageN():
        """Return the number of list pages, assuming 12 news items per page."""
        response = requests.get('http://news.sise.edu.cn/cms/news/2.html')
        response.encoding = 'utf-8'
        page = BeautifulSoup(response.text, 'html.parser')
        # The element with class 'a1' holds the total news count.
        # (The original chained a no-op rstrip('') here; dropped.)
        total = int(page.select('.a1')[0].text)
        return total // 12 + 1
    
    # Entry point: crawl the site and accumulate every article dict here.
    newsTotal = []
    firstPageUrl = 'http://news.sise.edu.cn/cms/news/2.html'
    newsTotal.extend(getListPage(firstPageUrl))
    
    n = getPageN()
    # NOTE(review): range(n, n+1) visits ONLY the last page number n; to crawl
    # every paginated list page the intent was probably range(2, n+1) — confirm
    # whether this single-page range was a deliberate test limit.
    for i in range(n, n+1):
        listPageUrl = 'http://news.sise.edu.cn/cms/news/2/p/{}.html'.format(i)
        newsTotal.extend(getListPage(listPageUrl))
  • 相关阅读:
    基于视网膜虹膜识别的内容分级系统
    C# 反射详解一
    C# 委托浅析
    .Net Core中使用Dapper构建泛型仓储
    C# 泛型详解
    非对称可逆加密RSA
    对称可逆加密Des
    .NET Core 3.0 中间件 Middleware
    .NET Core3.0 日志 logging
    .Net Core3.0依赖注入DI
  • 原文地址:https://www.cnblogs.com/0056a/p/8987698.html
Copyright © 2011-2022 走看看