zoukankan      html  css  js  c++  java
  • 1031-诗人信息重新收集及诗人关联人物

    诗人信息重新收集

    收集所有诗人

    诗人图片与朝代,之前收集的诗人只有2000多个,是因为之前收集的诗人有生平,而其他诗人没有,故没有收集到

    这次收集所有诗人,都有个人介绍。从个人介绍中也可提取出有用的信息。

    经过此次收集,共收集到20088个诗人

    import requests
    from bs4 import BeautifulSoup
    from lxml import etree
    import re
    
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}#创建头部信息
    pom_list=[]
    k=1
    for i in range(1,2010):
        url='https://www.xungushici.com/authors/p-'+str(i)
        r=requests.get(url,headers=headers)
        content=r.content.decode('utf-8')
        soup = BeautifulSoup(content, 'html.parser')
    
        hed=soup.find('div',class_='col col-sm-12 col-lg-9')
        list=hed.find_all('div',class_="card mt-3")
    
        origin_url='https://www.xungushici.com'
    
        for it in list:
            content = {}
            # 1.1获取单页所有诗集
            title = it.find('h4', class_='card-title')
            poemauthor=title.find_all('a')[1].text
            desty=title.find_all('a')[0].text
            #print(poemauthor+"+"+desty)
            #获取诗人图像
            if it.find('a',class_='ml-2 d-none d-md-block')!=None:
                src=it.find('a',class_='ml-2 d-none d-md-block').img['src']
            else:
                src="http://www.huihua8.com/uploads/allimg/20190802kkk01/1531722472-EPucovIBNQ.jpg"
            content['src']=src
            href=title.find_all('a')[1]['href']
            #对应的诗人个人详情页面
            real_href = origin_url + href
            #诗人简介及诗集个数
            text = it.find('p', class_='card-text').text
            #诗人的诗集个数
            numtext = it.find('p', class_='card-text').a.text
            pattern = re.compile(r'd+')
            num=re.findall(pattern, numtext)[0]
    
            #进入诗人详情页面
            r2=requests.get(real_href,headers=headers)
            content2=r2.content.decode('utf-8')
            soup2 = BeautifulSoup(content2, 'html.parser')
            ul=soup2.find('ul',class_='nav nav-tabs bg-primary')
            if ul!=None:
                list_li=ul.find_all('li',class_='nav-item')
                exp = ""
                for it in list_li:
                    if it.a.text=="人物生平" or it.a.text=="人物" or it.a.text=="生平":
                        urlsp=origin_url+it.a['href']
                        r3 = requests.get(urlsp, headers=headers)
                        content3 = r3.content.decode('utf-8')
                        soup3 = BeautifulSoup(content3, 'html.parser')
                        list_p=soup3.select('body > div.container > div > div.col.col-sm-12.col-lg-9 > div:nth-child(3) > div.card > div')
                        for it in list_p:
                            exp=it.get_text().replace('
    ','').replace('	','').replace('
    ','')
                content['author']=poemauthor
                content['produce']=text
                content['num']=num
                content['desty'] = desty
                content['experience'] = exp
                pom_list.append(content)
            else:
                content['author'] = poemauthor
                content['produce'] = text
                content['num'] = num
                content['desty'] = desty
                content['experience'] = ""
                pom_list.append(content)
            print(""+str(k)+"")
            k=k+1
    
    import xlwt
    
    xl = xlwt.Workbook()
    # 调用对象的add_sheet方法
    sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)
    
    sheet1.write(0,0,"author")
    sheet1.write(0,1,'produce')
    sheet1.write(0,2,'num')
    sheet1.write(0,3,'experience')
    sheet1.write(0,4,'src')
    sheet1.write(0,5,'desty')
    
    for i in range(0,len(pom_list)):
        sheet1.write(i+1,0,pom_list[i]['author'])
        sheet1.write(i+1, 1, pom_list[i]['produce'])
        sheet1.write(i+1, 2, pom_list[i]['num'])
        sheet1.write(i+1, 3, pom_list[i]['experience'])
        sheet1.write(i + 1, 4, pom_list[i]['src'])
        sheet1.write(i + 1, 5, pom_list[i]['desty'])
    xl.save("author3.xlsx")

    存储效果

     

     诗人关联人物

    通过诗集来找到关联人物

    先找唐代相互关联的诗人,步骤如下:

    通过对诗名,诗文赏析,写作背景,来对作者(也就是诗人)进行分析,找出其中包含的其他诗人的名字。存储到一个列表

    因为会涉及到多首诗,都是同一个作者,因此,涉及到字典,列表合并,去重

    列表合并:list+=list2

    列表去重:a=list(set(a))

    在考虑是否是相关的作者时,有以下考察点:

    其一:通过内容去检索所有诗人,找到诗人库中包含的关联诗人

    其二:找到关联诗人后,判断是否和作者同一个朝代

    其三:还可判断是否在作者的出生年月内(未实现,部分诗人出生年月不详)

    import numpy
    import xlwt
    import pandas as pd
    
    ans=[]
    
    def read_author():
        file="author3.xlsx"
        data=pd.read_excel(file).fillna("")
        author=list(data.author)
        all_desty=list(data.desty)
        experience=list(data.experience)
    
        dict_exp={}
        dict_des={}
    
        for i in range(len(author)):
            dict_exp[author[i]]=experience[i]
            dict_des[author[i]]=all_desty[i]
    
    
        #去重
        author=list(set(author))
        file2="tang.xlsx"
        tang=pd.read_excel(file2).fillna("")
        author_name=list(tang.author)
        t_desty=list(tang.desty)
        new_author=list(set(author_name))
    
        dict={}
        for it in new_author:
            dict[it]=[]
    
        title=list(tang.title)
        appear=list(tang.appear)
        back=list(tang.background)
        for i in range(len(author_name)):
            name = author_name[i]
            if name=='佚名':
                continue
            content=title[i]+appear[i]+back[i]
    
            friend=[]
            for j in range(len(author)):
                if content.find(author[j])!=-1 and dict_des[author[j]]=="唐代":
                    friend.append(author[j])
            #print(name+": "+str(friend))
            dict[name]+=friend
            dict[name]=list(set(dict[name]))
            dict[name]=dict[name]
        print("--------------------ans--------------------------")
        ans_author=[]
        ans_friend=[]
        for it in dict:
            if dict[it]!=[]:
                ans_author.append(it)
                ans_friend.append(','.join(dict[it]))
                print(it+":"+str(dict[it]))
    
    
    
        xl = xlwt.Workbook()
        # 调用对象的add_sheet方法
        sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)
    
        sheet1.write(0, 0, "author")
        sheet1.write(0, 1, 'friend')
    
    
        for i in range(0, len(ans_author)):
            sheet1.write(i + 1, 0, ans_author[i])
            sheet1.write(i + 1, 1, ans_friend[i])
    
        xl.save("friend.xlsx")
    
    
    
    
    
    
    if __name__ == '__main__':
        read_author()

    效果

    里面friend有的包括原作者,之后在对其清洗

     

  • 相关阅读:
    数组地址,数组首地址与数组首元素地址的区别
    memset,memcpy与strcpy
    OJ之大数与高精度题必备知识
    OJ之星期几算法(泽勒一致性)
    二分查找及其优化
    爱上vim之快捷键使用技巧与个性化配置
    shell之终极shell——zsh
    memset的一些坑
    OO终章
    hOmewOrk 第三单元 总结
  • 原文地址:https://www.cnblogs.com/xiaofengzai/p/15490155.html
Copyright © 2011-2022 走看看