  • Web Scraping Final Project

    1. Pick a topic you are interested in (no two students may choose the same one).

    2. Write a crawler in Python to scrape data on that topic from the web.

    3. Run a text analysis on the scraped data and generate a word cloud.

    4. Explain the results of the text analysis.

    5. Write a complete blog post describing the implementation, the problems encountered and how they were solved, and the ideas and conclusions of the data analysis.

    6. Finally, submit all the scraped data plus the crawler and data-analysis source code.

    The topic I chose is to scrape the author of every news article in the industry (sponsor) section of Leiphone.com and work out which author has published the most articles there.

    1. First, get the URL of each news article; the code is as follows:

    def getNewsUrl(newsurl):
        # collect the detail-page URL of every article on one listing page
        res = requests.get(newsurl)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        items = soup.select('.list')[0].select('li')
        allnewsList = []
        for news in items:
            NewUrl = news.select('.img')[0].select('a')[1].attrs['href']
            newsList = getListPage(NewUrl)
            print(newsList)
            allnewsList.append(newsList)
        return allnewsList
    

    2. Using the URLs above, fetch the details of each article; the code is as follows:

    def getListPage(NewUrl):
        res = requests.get(NewUrl)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        article = soup.select('.article-template')[0]
        newList = []

        new = {}
        new['题目'] = article.select('h1')[0].text.strip()              # title
        new['作者'] = article.select('a')[0].text                       # author (first link in the template)
        new['时间'] = article.select('.time')[0].text.strip()           # publication time
        new['导语'] = article.select('.article-lead')[0].text.strip()   # lead paragraph
        newList.append(new)

        # article body paragraphs (fetched here but not used further)
        a = article.select('p')
        for i in range(0, len(a) - 2):
            content = a[i].text

        # write the author name out for later counting
        title = article.select('a')[0].text
        writeNewsContent(title)

        return newList
    

    3. Get the total number of listing pages (a slightly more robust variant is sketched after the code):

    def getPageN():
        # read the last page number from the pager at the bottom of the listing
        res = requests.get('https://www.leiphone.com/category/sponsor')
        res.encoding = "utf-8"
        soup = BeautifulSoup(res.text, 'html.parser')
        n = int(soup.select('.pages')[0].select('a')[4].text)
        return n
    
    n = getPageN()
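
    Hard-coding the fifth <a> of the pager breaks as soon as the pager layout changes. A minimal sketch of a variant that instead takes the largest numeric page link; it assumes the page numbers appear as the link text, which should be checked against the live page:

    import requests
    from bs4 import BeautifulSoup

    def getPageN():
        # Sketch: pick the largest number among the pager links instead of
        # relying on a fixed position. Assumes numeric link text.
        res = requests.get('https://www.leiphone.com/category/sponsor')
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        nums = [int(a.text) for a in soup.select('.pages')[0].select('a')
                if a.text.strip().isdigit()]
        return max(nums) if nums else 1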
    

    4. Fetch the information for every article:

    allnewList = []
    for i in range(1, n + 1):
        # listing pages are paginated as /category/sponsor/page/1, /page/2, ...
        newsurl = 'https://www.leiphone.com/category/sponsor/page/{}'.format(i)
        print(newsurl)
        newsList = getNewsUrl(newsurl)
        allnewList.append(newsList)
    

    5. Write every article's author to a txt file:

    def writeNewsContent(title):
        # append one author name to gzccNews.txt for later counting
        f = open('gzccNews.txt', 'a', encoding='utf-8')
        f.write(title)
        f.close()
    

    6. Count the author names in the txt file (an alternative sketch using collections.Counter follows the code):

    import jieba
    
    f = open('gzccNews.txt', 'r', encoding='utf-8')
    text = f.read()
    f.close()
    # register the author names as custom words so jieba does not split them
    jieba.add_word('归〇')
    jieba.add_word('新智造')
    jieba.add_word('刘芳平')
    jieba.add_word('吕倩')
    jieba.add_word('木子')
    jieba.add_word('李诗')
    jieba.add_word('Jennings_Zhu')
    jieba.add_word('包永刚')
    jieba.add_word('王金许')
    jieba.add_word('李赓')
    jieba.add_word('Dude')
    jieba.add_word('温晓桦')
    jieba.add_word('李雨晨')
    jieba.add_word('思颖')
    jieba.add_word('李智勇')
    jieba.add_word('咲甜')
    jieba.add_word('陈伊莉')
    jieba.add_word('彭赛琼')
    jieba.add_word('camel')
    jieba.add_word('赵青晖')
    jieba.add_word('Alter')
    jieba.add_word('聊IT')
    jieba.add_word('大公司日报')
    jieba.add_word('又田')
    jieba.add_word('跃斌')
    jieba.add_word('奕欣')
    jieba.add_word('张驰')
    
    
    punctuation = ''',。‘’“”:;()!?、 '''
    # stray characters and name fragments to drop from the segmented words
    a = {'\n', '子', '张', '秀琴', '李秀', '归', '〇', '亚', '金', '峰', '亮', '恒', '赓', '程', '弢', '木子李', '三', '大'}
    for i in punctuation:
        text = text.replace(i,'')
    
    tempwords = list(jieba.cut(text))
    
    count = {}
    words = list(set(tempwords) - a)

    # count how many times each remaining word occurs in the raw text
    for i in range(0, len(words)):
        count[words[i]] = text.count(str(words[i]))
    
    countList = list(count.items())
    countList.sort(key=lambda x: x[1], reverse=True)
    print(countList)
    
    f = open('b.txt', 'a', encoding='utf-8')
    for i in range(20):
        # write the 20 most frequent author names and their counts to b.txt
        f.write(countList[i][0] + ':' + str(countList[i][1]) + '\n')
    f.close()
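
    The same ranking can be produced more directly with collections.Counter over the segmented words, which avoids re-scanning the raw text with str.count (and therefore counts whole tokens rather than substring occurrences, so the numbers can differ slightly). A minimal sketch, assuming gzccNews.txt already exists:

    import jieba
    from collections import Counter

    with open('gzccNews.txt', 'r', encoding='utf-8') as f:
        text = f.read()

    # same idea as the filter set above: drop stray characters and fragments
    stop = {'\n', '子', '张', '大'}
    words = [w for w in jieba.cut(text) if w.strip() and w not in stop]
    print(Counter(words).most_common(20))   # top 20 authors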
    

    Problems encountered and how they were solved

    1. Scraping the data itself was not particularly difficult; the only snag was that the listing sometimes contains site announcements mixed in with the articles. The fix is to collect all the matching elements into a list and then delete the elements belonging to those announcements; a sketch of that filtering is shown below.
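
    The original post does not show the filtering code, so the following is only a sketch; the '.notice' selector is a hypothetical marker for announcement entries and the real page needs to be inspected to find the actual one:

    import requests
    from bs4 import BeautifulSoup

    def getArticleUrls(pageurl):
        # Collect the <li> items on one listing page and skip the ones that
        # look like site announcements before extracting article URLs.
        res = requests.get(pageurl)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        urls = []
        for li in soup.select('.list')[0].select('li'):
            if li.select('.notice'):          # hypothetical announcement marker
                continue
            urls.append(li.select('.img')[0].select('a')[1].attrs['href'])
        return urls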

    2. Because I was not sure how to install the word-cloud library, and the guides online are all over the place, for this assignment I stuck with jieba and simply counted the 20 authors who have published the most articles; a sketch of what the word-cloud step could look like is given below.
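
    The assignment does ask for a word cloud, so purely as a sketch (not something done in this post), the third-party wordcloud package could be fed the countList built in step 6. The font path is an assumption and must point to a font that contains Chinese glyphs:

    # Sketch only: requires `pip install wordcloud`.
    from wordcloud import WordCloud

    freqs = dict(countList[:20])                     # top 20 authors from step 6
    wc = WordCloud(font_path='simhei.ttf',           # assumed Chinese-capable font file
                   background_color='white',
                   width=800, height=600)
    wc.generate_from_frequencies(freqs)
    wc.to_file('authors.png')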

    Conclusion:

    Analysing the counts shows that 木子 has published the most articles, so we can conclude that this author probably has many distinctive views on the industry, and reading their articles is a good way to get to know those views.

    All the scraped data, the crawler, and the data-analysis source code:

    new.py

    import requests
    import re
    from datetime import datetime
    from bs4 import BeautifulSoup
    import openpyxl
    import pandas
    
    # getNewsUrl('https://www.leiphone.com/category/sponsor/page/1')
    
    def writeNewsContent(title):
        # append one author name to the text file used for counting
        f = open('gzccNews.txt', 'a', encoding='utf-8')
        f.write(title)
        f.close()

    # fetch the details of one article
    def getListPage(NewUrl):
        res = requests.get(NewUrl)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        article = soup.select('.article-template')[0]
        newList = []

        new = {}
        new['题目'] = article.select('h1')[0].text.strip()              # title
        new['作者'] = article.select('a')[0].text                       # author (first link in the template)
        new['时间'] = article.select('.time')[0].text.strip()           # publication time
        new['导语'] = article.select('.article-lead')[0].text.strip()   # lead paragraph
        newList.append(new)

        # article body paragraphs (fetched here but not used further)
        a = article.select('p')
        for i in range(0, len(a) - 2):
            content = a[i].text

        # write the author name out for later counting
        title = article.select('a')[0].text
        writeNewsContent(title)

        return newList

    # collect the detail-page URLs on one listing page
    def getNewsUrl(newsurl):
        res = requests.get(newsurl)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        items = soup.select('.list')[0].select('li')
        allnewsList = []
        for news in items:
            NewUrl = news.select('.img')[0].select('a')[1].attrs['href']
            newsList = getListPage(NewUrl)
            print(newsList)
            allnewsList.append(newsList)
        return allnewsList

    # get the total number of listing pages
    def getPageN():
        res = requests.get('https://www.leiphone.com/category/sponsor')
        res.encoding = "utf-8"
        soup = BeautifulSoup(res.text, 'html.parser')
        n = int(soup.select('.pages')[0].select('a')[4].text)
        return n

    n = getPageN()

    allnewList = []
    for i in range(1, n + 1):
        newsurl = 'https://www.leiphone.com/category/sponsor/page/{}'.format(i)
        print(newsurl)
        newsList = getNewsUrl(newsurl)
        allnewList.append(newsList)
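
    new.py imports pandas and openpyxl but never uses them; purely as an illustration (not part of the original script), the scraped details collected in allnewList could be saved to a spreadsheet like this:

    import pandas

    # allnewList is nested as [page][article] -> [dict], so flatten it first
    rows = [d for page in allnewList for article in page for d in article]
    df = pandas.DataFrame(rows)
    df.to_excel('leiphone_news.xlsx', index=False)   # to_excel needs openpyxl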
    

    jieba.py (note: the script needs to be saved under a different name, e.g. count_authors.py, because a file named jieba.py shadows the jieba library it imports)

    import jieba
    
    f = open('gzccNews.txt', 'r', encoding='utf-8')   # the author names written out by new.py
    text = f.read()
    f.close()
    # register the author names as custom words so jieba does not split them
    jieba.add_word('归〇')
    jieba.add_word('新智造')
    jieba.add_word('刘芳平')
    jieba.add_word('吕倩')
    jieba.add_word('木子')
    jieba.add_word('李诗')
    jieba.add_word('Jennings_Zhu')
    jieba.add_word('包永刚')
    jieba.add_word('王金许')
    jieba.add_word('李赓')
    jieba.add_word('Dude')
    jieba.add_word('温晓桦')
    jieba.add_word('李雨晨')
    jieba.add_word('思颖')
    jieba.add_word('李智勇')
    jieba.add_word('咲甜')
    jieba.add_word('陈伊莉')
    jieba.add_word('彭赛琼')
    jieba.add_word('camel')
    jieba.add_word('赵青晖')
    jieba.add_word('Alter')
    jieba.add_word('聊IT')
    jieba.add_word('大公司日报')
    jieba.add_word('又田')
    jieba.add_word('跃斌')
    jieba.add_word('奕欣')
    jieba.add_word('张驰')
    
    
    punctuation = ''',。‘’“”:;()!?、 '''
    # stray characters and name fragments to drop from the segmented words
    a = {'\n', '子', '张', '秀琴', '李秀', '归', '〇', '亚', '金', '峰', '亮', '恒', '赓', '程', '弢', '木子李', '三', '大'}
    for i in punctuation:
        text = text.replace(i,'')
    
    tempwords = list(jieba.cut(text))
    
    count = {}
    words = list(set(tempwords) - a)

    # count how many times each remaining word occurs in the raw text
    for i in range(0, len(words)):
        count[words[i]] = text.count(str(words[i]))
    
    countList = list(count.items())
    countList.sort(key=lambda x: x[1], reverse=True)
    print(countList)
    
    f = open('b.txt', 'a', encoding='utf-8')
    for i in range(20):
        # write the 20 most frequent author names and their counts to b.txt
        f.write(countList[i][0] + ':' + str(countList[i][1]) + '\n')
    f.close()
    

      
