  • Hot-Word Classification and Analysis in the Information Field 04 (Crawling Related News from Hot Words)

    (1) Project name: classification, analysis, and explanation of hot words in the information field

    (2) Functional design:
    1) Data collection: periodically and automatically crawl hot words related to the information field from the web;
    2) Data cleaning: clean the hot-word data and use automatic classification to build a catalog of information-field hot words;
    3) Hot-word explanation: automatically attach a Chinese explanation to each hot word (drawing on Baidu Baike or Wikipedia);
    4) Hot-word references: mark recent articles or news items that cite each hot word and generate a catalog of hyperlinks that users can click;
    5) Data visualization:
    ① show the hot words as a word cloud or hot-word chart;
    ② use a relationship graph to indicate how closely hot words are related;
    6) Data report: export the full hot-word catalog with explanations as a Word report (a rough sketch of items 5 and 6 follows this list).
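
    Items 5) and 6) are not implemented by the script below. The following is only a minimal sketch of how they could be added, assuming the word:count file final_hotword.txt written by the fenci() step further down and the third-party packages wordcloud and python-docx; the Chinese font path simhei.ttf is an assumption that may need adjusting on your machine.

    from wordcloud import WordCloud
    from docx import Document

    # read the "word:count" lines written by fenci()
    freq = {}
    with open('final_hotword.txt', encoding='utf-8') as f:
        for line in f:
            word, _, count = line.strip().partition(':')
            if word and count.isdigit():
                freq[word] = int(count)

    # item 5): word-cloud visualization (font_path is an assumed Chinese font file)
    wc = WordCloud(font_path='simhei.ttf', width=800, height=600, background_color='white')
    wc.generate_from_frequencies(freq)
    wc.to_file('hotword_cloud.png')

    # item 6): export a simple Word report of hot words and their frequencies
    doc = Document()
    doc.add_heading('信息领域热词报告', level=1)
    for word, count in sorted(freq.items(), key=lambda kv: kv[1], reverse=True):
        doc.add_paragraph('%s (%d)' % (word, count))
    doc.save('hotword_report.docx')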

    Today I pull together the methods from the earlier snippets into a single script.

    import requests
    from bs4 import BeautifulSoup
    import pymysql
    import json
    import lxml
    import xlwt
    import jieba
    import pandas as pd
    import re
    from collections import Counter
    import linecache
    from lxml import etree
    def getTitle(url):
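        # fetch one cnblogs news listing page and return the <a> elements of its news titles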
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}  # request headers
        response = requests.get(url, headers=headers)  # send the HTTP request
        content = response.content.decode('utf-8')
        soup = BeautifulSoup(content, 'html.parser')
        titles = soup.select('div:nth-child(2) > h2:nth-child(1) > a:nth-child(1)')  # <a> tags of the news titles
        for item in titles:
            print(item.text)
        return titles
    def getHotword():
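        # crawl the cnblogs recommended-news pages and save every title to Hotword.txt and Hotword.xls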
        url = "https://news.cnblogs.com/n/recommend?page={}"
        f = xlwt.Workbook(encoding='utf-8')
        ft = open("Hotword.txt", "w", encoding='utf-8')
        sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
        sheet01.write(0, 0, '博客最热新闻')  # row 1, column 1: header cell
        temp = 0
        for i in range(1, 100):
            newurl = url.format(i)
            title = getTitle(newurl)
            for j in range(len(title)):
                ft.write(title[j].text + '\n')
                sheet01.write(temp + j + 1, 0, title[j].text)
            temp += len(title)
            print("Page " + str(i) + " done!")
        print("All pages done!!!")
        f.save('Hotword.xls')
        ft.close()
    def fenci():
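        # segment the crawled titles with jieba, count word frequencies, and keep the top 100 as hot words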
        filehandle = open("Hotword.txt", "r", encoding='utf-8')
        file = open("final_hotword2.txt", "w", encoding='utf-8')
        filepaixu = open("final_hotword.txt", "w", encoding='utf-8')
        mystr = filehandle.read()
        dbinserthot = []
        seg_list = jieba.cut(mystr)  # precise mode by default
        print(seg_list)
        # load the stop-word list (one word per line) for fast membership tests
        stopwords = {}.fromkeys([line.rstrip() for line in open(r'final.txt', encoding='UTF-8')])
        c = Counter()
        for x in seg_list:
            if x not in stopwords:
                if len(x) > 1 and x != '\n' and x != 'quot':
                    c[x] += 1

        print('\nWord frequency results:')
        for (k, v) in c.most_common(100):  # keep the 100 most frequent words
            print("%s:%d" % (k, v))
            file.write(k + '\n')
            filepaixu.write(k + ":" + str(v) + '\n')
            value=[k,str(v)]
            dbinserthot.append(value)
        tuphot=tuple(dbinserthot)
        db = pymysql.connect(host="localhost", user="root", password="1229", database="lianxi", charset='utf8')
        cursor = db.cursor()
        sql_hot = "INSERT INTO final_hotword values(%s,%s)"
        try:
            cursor.executemany(sql_hot, tuphot)
            db.commit()
        except Exception as e:
            print('Insert into final_hotword failed, rolling back:', e)
            db.rollback()
        db.close()
        filehandle.close()
        file.close()
        filepaixu.close()
    def get_page(url):
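        # fetch a URL and return its HTML text, or None if the request fails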
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"
        }
        try:
            response = requests.get(url,headers=headers)
            response.encoding = 'utf-8'
            if response.status_code == 200:
                print('Page fetched successfully')
                # print(response.encoding)
                return response.text
            else:
                print('Failed to fetch the page')
        except Exception as e:
            print(e)
    def getHotExpeain():
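        # look up each hot word on Baidu Baike and save its description to Excel and MySQL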
        url = 'https://baike.baidu.com/'
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"
        }
        f = xlwt.Workbook(encoding='utf-8')
        sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
        sheet01.write(0, 0, '热词')  # row 1, column 1: hot word
        sheet01.write(0, 1, '热词解释')  # row 1, column 2: explanation
        sheet01.write(0, 2, '网址')  # row 1, column 3: URL
        fopen = open('final_hotword2.txt', 'r', encoding='utf-8')
        lines = fopen.readlines()
        urls = ['https://baike.baidu.com/item/{}'.format(line) for line in lines]
        i = 0
        alllist = []
        for idx, url in enumerate(urls):
            link = url.replace("\n", "")
            print(link)
            page = get_page(link)
            if not page:
                continue  # skip hot words whose Baike page could not be fetched
            items = re.findall('<meta name="description" content="(.*?)">', page, re.S)
            print(items)
            if len(items) > 0:
                hot = lines[idx].strip()      # the hot word on the matching line
                hotexplent = str(items[0])    # the Baike description text
                sheet01.write(i + 1, 0, hot)
                sheet01.write(i + 1, 1, hotexplent)
                sheet01.write(i + 1, 2, link)
                alllist.append((hot, hotexplent, link))
                i += 1
            print("Entries collected so far: " + str(i))
        print("打印完!!!")
        print(alllist)
        tuplist = tuple(alllist)
        # save the results to MySQL
        db = pymysql.connect(host="localhost", user="root", password="1229", database="lianxi", charset='utf8')
        cursor = db.cursor()
        sql_cvpr = "INSERT INTO website values(%s,%s,%s)"
        try:
            cursor.executemany(sql_cvpr, tuplist)
            db.commit()
        except Exception as e:
            print('Insert into website failed, rolling back:', e)
            db.rollback()
        db.close()
        f.save('hotword_explain.xls')
    def getDetail(href, title, line,hrefs,titles,contents,dbinsert):
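        # fetch one news article and record it when the hot word appears in its title or body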
        line1 = line.replace('\n', '')
        # print(title)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
            'Cookie': '__gads=ID=d75a34306778b5ff:T=1618733672:S=ALNI_MaHsp16fM1BsstZSOfJwX6G_QAUWQ; UM_distinctid=179ea734280ecc-09762ea5a082b4-f7f1939-144000-179ea7342813ab; .CNBlogsCookie=5E1AE0B6F75346AAE5B35A668F42048EA98E21F1955AD2276AF31BF0A80E27F66707B9E60CCEAD47488A30992D00DC561A9CABC8F44787B3C02A7BE680DD9E3674007F8365B87C23956D9A9DE039EFBF84AF98F6; _ga_3Q0DVSGN10=GS1.1.1627456725.2.1.1627456780.0; _ga=GA1.2.1309006929.1616077024; .Cnblogs.AspNetCore.Cookies=CfDJ8NACB8VE9qlHm6Ujjqxvg5CnxAgBVZuTnvm6dxCDTSbTrA48gOuFsKr59bSiLaPWs0F6RvFHxDGBPyp0eJ37eNmqXA-o3aBDnqb0SWn9WseCNwJFkDPi8YBkpNVfsXkXN759CT9mktdmE2mDvN2cmdIT5Hus-g0h8jvvAcB4Rv0u70At2vuraQEHkssYBYgPCGzYMr4ewXqP7W4hDt0J67noxu44HbVMveZtSzdh8pxXwlJ8i1pVA7VX4-gBtgdmc2POqQ2DoA6en5Jq-ne5-hyclgJ7EdobG5wPNt6A6ByteR6FIxpZNBLYRN6OCFjWCXrF7hdxLnTmSVQ02cYC53Q6V-658PcTbW_mwMu0pTOTbAFWh1kE25e6GUagtwqZq2mVlbuiYhiTOx2y2NAcdAebkgM75EVIEp6xTjt2xrMLj7A_cbNoQ6SM0n9DNNNDXM17frFZeUQhgJQCHGX_MD0sc_p-MyTcb-lFJl3Ddk8S6M6213_NnF9fOFwCYp-LO9FUxKSLAoWVttIdnIzeF6gHj6WgjIuUolAxAYjoLvPawhFxJpfRuCCWuhj6OlU6L25UvGVHphyiJ7EZLAscBdg; _gid=GA1.2.3940361.1628580647; affinity=1628668232.326.326.71855'}
        url2 = "https://news.cnblogs.com" + href
        requests.adapters.DEFAULT_RETRIES = 10
        r2 = requests.get(url2, headers=headers)
        html = r2.content.decode("utf-8")
        html1 = etree.HTML(html)
        content1 = html1.xpath('//div[@id="news_body"]')
        # print('line:'+line)
        if len(content1) == 0:
            print("No news body found, skipping")
        else:
            titles.append(title)
            hrefs.append(url2)
            content2 = content1[0].xpath('string(.)')
            # print(content2)
            content = content2.replace('\n', '').replace('\t', '').replace('\r', '').replace(' ', '')
            contents.append(content)
            # print(title)
            # print(content)
            # print(line)
            m = content.find(line1)
            n = title.find(line1)
            # print(line1)
            # print(m)
            # print(n)
            # note: Python has no && or || operators; use the keywords 'and' and 'or'
            if m != -1 or n != -1:
                print('Matched')
                value = [title, url2, line1]
                dbinsert.append(value)
            else:
                print('No match')
    
    
    def climing(line,hrefs,titles,contents,dbinsert):
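        # crawl 40 news listing pages, cache their titles, links and bodies, and match them against the given hot word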
        print(line)
        for i in range(0, 40):
            print("***********************************")
            print(i)
            page = i + 1
            url = "https://news.cnblogs.com/n/page/" + str(page)
            r = requests.get(url)
            html = r.content.decode("utf-8")
            # print("Status code:", r.status_code)
            # print(html)
            html1 = etree.HTML(html)
            href = html1.xpath('//h2[@class="news_entry"]/a/@href')
            title = html1.xpath('//h2[@class="news_entry"]/a/text()')
            # print(href)
            # print(title)
            for a in range(len(href)):  # each listing page has up to 18 entries
                getDetail(href[a], title[a], line, hrefs, titles, contents, dbinsert)
    
    def bijiao(hrefs,titles,contents,line,dbinsert):
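        # match one hot word against the cached article titles and bodies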
        print(line)
        line1 = line.replace('\n', '')
        for i in range(0,len(titles)):
            print(i)
            content = str(contents[i])
            title = str(titles[i])
            m = content.find(line1)
            n = title.find(line1)
            if m != -1 or n != -1:
                print('Matched')
                value = [title, hrefs[i], line1]
                dbinsert.append(value)
            else:
                print('No match')
    def getHotLink():
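        # for every hot word, collect the articles that reference it and insert them into the Link table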
        # read the hot words produced by fenci() from final_hotword2.txt
        hrefs = []
        titles = []
        contents = []
    
        p = 1
        for line in open("final_hotword2.txt", encoding='utf-8'):
            dbinsert=[]
            if p == 1:
                climing(line,hrefs,titles,contents,dbinsert)
            else:
                bijiao(hrefs,titles,contents,line,dbinsert)
            p = p+1
            tulinsert=tuple(dbinsert)
            db = pymysql.connect(host="localhost", user="root", password="1229", database="lianxi", charset='utf8')
            cursor = db.cursor()
            sql_xilang = "INSERT INTO Link values(%s,%s,%s)"
            try:
                cursor.executemany(sql_xilang, tulinsert)
                db.commit()
            except Exception as e:
                print('Insert into Link failed, rolling back:', e)
                db.rollback()
            db.close()
    if __name__ == '__main__':
        # crawl the news titles
        getHotword()
        # segment the titles with jieba and extract the top hot words
        fenci()
        # fetch the Baidu Baike explanation for each hot word
        getHotExpeain()
        # find news articles that reference each hot word
        getHotLink()

    Here final.txt is the stop-word package; I have uploaded it to Baidu Netdisk.

    Link: https://pan.baidu.com/s/1zRTS5lJlEAN_NuHljXQY7Q
    Extraction code: h4b6

    Next, the database tables.

    There are three tables in total (a rough schema sketch, inferred from the INSERT statements in the script, follows below):

    final_hotword: each hot word and its frequency

    website: each hot word, its Baike explanation, and the Baike URL

    Link: the matched article title, the article URL, and the referenced hot word
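
    The exact column definitions are not shown here, so the snippet below is just one possible way to create the three tables before the first run; the column names and types are assumptions inferred from the INSERT statements in the script, not the author's actual schema.

    import pymysql

    # assumed column names/types; adjust to match your own schema
    ddl = [
        "CREATE TABLE IF NOT EXISTS final_hotword (word VARCHAR(64), frequency VARCHAR(16))",
        "CREATE TABLE IF NOT EXISTS website (word VARCHAR(64), explanation TEXT, url VARCHAR(255))",
        "CREATE TABLE IF NOT EXISTS Link (title VARCHAR(255), url VARCHAR(255), word VARCHAR(64))",
    ]
    db = pymysql.connect(host="localhost", user="root", password="1229", database="lianxi", charset='utf8')
    cursor = db.cursor()
    for statement in ddl:
        cursor.execute(statement)
    db.commit()
    db.close()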

    When the snippets are combined and run, the script crawls the latest news and stores it, segments the text automatically with jieba to produce 100 hot words that are saved to the database, crawls an explanation for each hot word into the database, and finally crawls links to articles related to each hot word into the database.
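
    Design item 1) asks for the crawl to run periodically. One minimal way to do that, assuming the third-party schedule package and an arbitrarily chosen once-a-day run time, is to append something like the following to the bottom of the script:

    import schedule
    import time

    def daily_job():
        getHotword()
        fenci()
        getHotExpeain()
        getHotLink()

    # run the whole pipeline once a day at 08:00 (time chosen only for illustration)
    schedule.every().day.at("08:00").do(daily_job)
    while True:
        schedule.run_pending()
        time.sleep(60)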

    Author: 哦心有
    The copyright of this article belongs to the author and cnblogs (博客园). Reprinting is welcome, but you must include a link to the original article and retain this statement; otherwise the author reserves the right to pursue legal liability.