zoukankan      html  css  js  c++  java
  • 1103-词牌名,合称,诗词形式

    词牌名收集

    原网页形式

     数据收集

    import requests
    from bs4 import BeautifulSoup
    from lxml import etree
    
    # Browser-like User-Agent header sent with every request.
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    # Cipai (tune-name) titles collected from every listing page, in page order.
    cipai=[]

    # The listing is paginated as /cipais/p1 ... /cipais/p6.
    for page in range(1,7):
        url='https://www.xungushici.com/cipais/p'+str(page)
        # A timeout keeps a stalled connection from hanging the script forever.
        r=requests.get(url,headers=headers,timeout=10)
        content=r.content.decode('utf-8')
        soup = BeautifulSoup(content, 'html.parser')

        hed=soup.find('ul',class_='list-unstyled d-flex flex-row flex-wrap align-items-center w-100')
        if hed is None:
            # The expected <ul> is absent from this page: skip it instead of
            # failing with AttributeError on None.
            continue
        items=hed.find_all('li',class_="m-1 badge badge-light")

        # Only <li> elements that actually wrap a link carry a cipai name.
        cipai.extend(item.a.text for item in items if item.a is not None)
    
    import xlwt
    
    # Build a one-sheet workbook: a header row followed by one cipai name per row.
    # NOTE(review): xlwt is documented as a writer for the legacy .xls format, so
    # the .xlsx extension below may not match the file contents -- confirm that
    # downstream readers cope with it.
    xl = xlwt.Workbook()
    sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

    sheet1.write(0,0,"title")
    # Row 0 holds the header, so the data rows start at 1.
    for row, name in enumerate(cipai, start=1):
        sheet1.write(row,0,name)

    xl.save("cipai_name.xlsx")

    存储形式

    诗人合称

    原数据网页

     数据收集

    import requests
    from bs4 import BeautifulSoup
    from lxml import etree
    
    # Browser-like User-Agent header sent with every request.
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}

    # Group names ("合称") in the order they appear on the page.
    hc=[]

    url='https://www.xungushici.com/authors'
    # A timeout keeps a stalled connection from hanging the script forever.
    r=requests.get(url,headers=headers,timeout=10)
    content=r.content.decode('utf-8')
    soup = BeautifulSoup(content, 'html.parser')
    # Site root, prepended to the relative hrefs found on the page.
    orign_href='https://www.xungushici.com'


    hecheng=soup.find('div',id='divHeCheng')
    entries=hecheng.find_all('li',class_="m-1 badge badge-light")
    # Maps each group name to a comma-joined string of its member authors.
    dic={}
    # The first <li> is skipped, exactly as the original loop did.
    # NOTE(review): presumably it is a heading rather than a group link -- confirm.
    for entry in entries[1:]:
        href=orign_href+entry.a['href']
        group_name=entry.a.text
        hc.append(group_name)

        # Fetch the group's own page and read the author out of every card.
        r2 = requests.get(href, headers=headers, timeout=10)
        content2 = r2.content.decode('utf-8')
        soup2 = BeautifulSoup(content2, 'html.parser')
        pomdiv=soup2.find('div',class_='col col-sm-12 col-lg-9')
        cards=pomdiv.find_all('div',class_='card mt-3') if pomdiv is not None else []

        author_list=[]
        for card in cards:
            h4=card.find('h4',class_='card-title')
            links=h4.find_all('a') if h4 is not None else []
            # The second link in the card title is used as the author name;
            # cards without two links are skipped instead of raising IndexError.
            if len(links)>1:
                author_list.append(links[1].text)
        dic[group_name]=",".join(author_list)
    
    import xlwt
    
    # Build a one-sheet workbook with two columns: group name and its authors.
    xl = xlwt.Workbook()
    sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

    # Header row.
    for col, title in enumerate(("hc", 'author')):
        sheet1.write(0,col,title)
    # Row 0 holds the header, so the data rows start at 1.
    for row, name in enumerate(hc, start=1):
        sheet1.write(row,0,name)
        sheet1.write(row,1,dic[name])

    xl.save("common_name.xlsx")


    # Echo every group with its members to the console.
    for name in hc:
        print(name+": "+dic[name])

    存储形式

    之后将读取该表,对应到诗人表中添加一列合称属性

    诗词形式

    形式分类

    按照每句诗的字数分为:五言,七言

    按照诗词的句数,每首四句为绝句,每首八句为律诗。绝句分为:五言绝句和七言绝句;律诗分为:五言律诗和七言律诗

    数据处理

    新学到一个表格追加使用技巧:

    from xlrd import open_workbook
    from xlutils.copy import copy
    # Write the classification results back into the original Excel file.
    def write_to(data,file):
        """Store *data* as a "formal" column in the first sheet of *file*.

        The workbook is opened with xlrd, turned into a writable copy with
        xlutils, and saved back over the same path. Column index 8 receives
        the header "formal" in row 0 and one value of *data* per following
        row. The number of values is printed first.
        """
        print(len(data))
        workbook = copy(open_workbook(file))
        sheet = workbook.get_sheet(0)

        sheet.write(0, 8, "formal")
        # Row 0 is the header, so values start at row 1.
        for row, value in enumerate(data, start=1):
            sheet.write(row, 8, value)

        workbook.save(file)

    数据处理源码

    import xlwt
    import pandas as pd
    
    # Load the source data and pull out the poem text.
    def read_excel(file):
        """Return the ``content`` column of the spreadsheet stored at *file*."""
        return pd.read_excel(file).content
    
    # Determine the verse form of each poem.
    def formal(content):
        """Classify every poem in *content* by line length and line count.

        Each poem is converted to text, newlines and ASCII dots are removed,
        and the text is split into verse lines at Chinese or ASCII
        punctuation. A poem whose lines all have five (or seven) characters
        is "五言" (or "七言"); with exactly four lines "绝句" is appended,
        with exactly eight lines "律诗". Anything else yields "".

        The previous version could not run: a string literal was broken
        across two lines and ``split('')`` raises ValueError. It also
        compared the number of 11/15-character segments (couplets) with 8
        and 4, which does not match the "four lines / eight lines"
        definition this script is meant to implement.

        Args:
            content: iterable of poems; every item is passed through ``str``.

        Returns:
            A list of labels, one per poem, in input order.
        """
        import re  # local import keeps this block self-contained

        # Full-width and ASCII marks that terminate a line of verse.
        line_break = re.compile('[,。!?;、,!?;]')
        meters = {5: "五言", 7: "七言"}
        stanzas = {4: "绝句", 8: "律诗"}

        formal_list=[]
        for it in content:
            text=str(it).replace('\n','').replace('.','')
            # Drop the empty pieces left by trailing or doubled punctuation.
            ju_list=[ju.strip() for ju in line_break.split(text)]
            ju_list=[ju for ju in ju_list if ju]
            print(ju_list)

            label=""
            lengths={len(ju) for ju in ju_list}
            # A regular poem has one single line length, either 5 or 7.
            if len(lengths)==1:
                meter=meters.get(lengths.pop())
                if meter:
                    label=meter+stanzas.get(len(ju_list),"")
            formal_list.append(label)
            print(label)
        return formal_list
    
    from xlrd import open_workbook
    from xlutils.copy import copy
    # Write the classification results back into the original Excel file.
    def write_to(data,file):
        """Store *data* as a "formal" column in the first sheet of *file*.

        The workbook is opened with xlrd, turned into a writable copy with
        xlutils, and saved back over the same path. Column index 8 receives
        the header "formal" in row 0 and one value of *data* per following
        row. The number of values is printed first.
        """
        print(len(data))
        workbook = copy(open_workbook(file))
        sheet = workbook.get_sheet(0)

        sheet.write(0, 8, "formal")
        # Row 0 is the header, so values start at row 1.
        for row, value in enumerate(data, start=1):
            sheet.write(row, 8, value)

        workbook.save(file)
    
    #获取指定文件夹下的excel
    import os
    def get_filename(path,filetype):  # directory to search, extension such as '.xlsx'
        """List the files under *path* whose extension equals *filetype*.

        The directory tree is walked recursively. Each hit is returned as a
        path relative to *path*: a bare file name for files directly inside
        *path* (as before), and ``subdir/name`` for files in subfolders, so
        that joining the result onto *path* always points at the real file.
        Previously only the bare name was returned, which made files found
        in subfolders impossible to locate.

        Args:
            path: directory to search.
            filetype: extension including the dot, compared exactly.

        Returns:
            A list of relative file paths; empty when nothing matches.
        """
        name = []
        for root,dirs,files in os.walk(path):
            for i in files:
                if os.path.splitext(i)[1]==filetype:
                    name.append(os.path.relpath(os.path.join(root,i),path))
        return name
    
    
    if __name__ == '__main__':
        # Folder that holds the source spreadsheets.
        file='data/'
        for workbook_name in get_filename(file,'.xlsx'):
            newfile=file+workbook_name
            # Read the poems, classify each one, then write the labels back
            # into the same workbook.
            poems=read_excel(newfile)
            labels=formal(poems)
            write_to(labels,newfile)

    结果展示

    明天任务

    1.曲牌名筛选出

    2.飞花令爬取

    3.找出诗句对应的“飞花令”

    4.中文分词,试图将诗人个人经历,逐个分段,梳理出这几类关键信息:人物,时间,事件,地点。将文本抽取为规则化的数据格式

     

  • 相关阅读:
    [转]解决ORACEL数据库“exp导出老是出现ORA-00904”
    教程
    [转]web初学者需要掌握哪些技术
    sublime的使用技巧
    安装Sublime Text
    memcached内存分类机制
    Java生产者和消费者
    一致性哈希原理及应用浅析
    Java线程状态分析
    Java多线程中断机制
  • 原文地址:https://www.cnblogs.com/xiaofengzai/p/15506017.html
Copyright © 2011-2022 走看看