zoukankan      html  css  js  c++  java
  • 1117古今地名映射爬取与经纬度检索&诗人,牌名,朝代,飞花令词实体导入

    古今地名映射

    爬取来源

    调用百度百科的搜索接口,检索两项内容:一是该地名对应的现地名,二是它的简介;再对简介进行词性分析,找出对应的地名

     代码

    import urllib.request
    import urllib.parse
    from lxml import etree
    from pyhanlp import *
    import pandas as pd
    
    def query(content):
        """Look up ``content`` on Baidu Baike and return the first place name found.

        The page text is taken from the heading matched by an absolute XPath,
        falling back to the lemma summary block, and is then segmented with
        the HanLP CRF segmenter.

        Args:
            content: The term to look up (appended to the ``/item/`` URL).

        Returns:
            The first token whose nature is ``ns``, or ``""`` when ``content``
            is not a non-empty string, the page cannot be fetched, no text is
            matched, or no ``ns`` token is present.
        """
        # Cells read from a spreadsheet may be empty or non-string; quoting
        # such a value would raise before any request is made.
        if not isinstance(content, str) or not content:
            return ""

        # Request URL
        url = 'https://baike.baidu.com/item/' + urllib.parse.quote(content)
        print(url)
        # Request headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        req = urllib.request.Request(url=url, headers=headers, method='GET')
        try:
            # A timeout keeps one stalled connection from hanging the batch,
            # and the context manager closes the response.
            with urllib.request.urlopen(req, timeout=10) as response:
                text = response.read().decode('utf-8')
        except OSError as err:
            # URLError, HTTPError and socket timeouts are all OSError
            # subclasses; report the failure and treat it as "no result".
            print("request failed for %s: %s" % (url, err))
            return ""

        html = etree.HTML(text)
        if html is None:
            return ""

        # Try the heading first, then fall back to the summary block.
        sen_list = html.xpath('/html/body/div[3]/div[2]/div/div[1]/dl[1]/dd/h2//text()')
        if not sen_list:
            sen_list = html.xpath(
                '//div[contains(@class,"lemma-summary") or contains(@class,"lemmaWgt-lemmaSummary")]//text()')
        if not sen_list:
            return ""

        # Drop surrounding newlines and join the fragments into one string.
        text = ''.join(item.strip('\n') for item in sen_list)
        CRFnewSegment = HanLP.newSegment("crf")
        where_list = [str(term.word) for term in CRFnewSegment.seg(text)
                      if str(term.nature) == 'ns']
        if not where_list:
            return ""
        print(where_list)
        return where_list[0]
    
    
    from xlrd import open_workbook
    from xlutils.copy import copy
    
    # Write the looked-up results back into the original Excel file
    def write_to(data,file):
        """Write ``data`` into column index 1 of the first sheet of ``file``.

        The workbook is opened, copied into a writable workbook, given the
        header ``jin_where`` in row 0, filled with one value per following
        row, and saved back to the same path.

        Args:
            data: Sequence of values to write, one per row.
            file: Path of the workbook that is read and then overwritten.
        """
        print(len(data))
        workbook = copy(open_workbook(file))
        sheet = workbook.get_sheet(0)

        sheet.write(0, 1, "jin_where")
        # Row 0 holds the header, so values start at row 1.
        for row_no, value in enumerate(data, start=1):
            sheet.write(row_no, 1, value)

        workbook.save(file)
    
    if __name__ == '__main__':
        # Look up every entry of the ``gu_where`` column and write the
        # results back into the same workbook.
        frame = pd.read_excel('gu_where.xlsx')
        jin_list = []
        for content in frame.gu_where:
            print(content)
            result = query(content)
            print("查询结果:%s" % result)
            jin_list.append(result)
        write_to(jin_list, 'gu_where.xlsx')

    结果

     现今地点经纬度

    首先对古代地点进行经纬度获取,获取后保存获得的现今地名

    若古代地名获取不到经纬度,用现今地名进行获取经纬度,同样保存获得的现今地名和经纬度

    在进行高德地图经纬度调用的时候要注意一次不能太多:500个地名经纬度能容忍(别问我是如何知道的!!!惨痛的实践)

    import pandas as pd
    import requests
    import json
    def coords(city):
        """Geocode ``city`` through the AMap geocoding REST endpoint.

        Args:
            city: Address or place name to geocode.

        Returns:
            A ``(location, formatted_address)`` tuple taken from the first
            entry of the response's ``geocodes`` list, or ``('', '')`` when
            ``city`` is not a non-empty string, the request fails, or the
            response carries no geocodes.
        """
        # Callers feed spreadsheet cells straight in; an empty cell may arrive
        # as NaN or an empty string, which is not worth a request.
        if not isinstance(city, str) or not city.strip():
            return '', ''

        # Fixed part of the API URL (everything before the question mark)
        url = 'https://restapi.amap.com/v3/geocode/geo'

        # NOTE(review): the API key is hard-coded in source; consider moving
        # it to configuration.
        params = {'key': 'cd0c1ab60e3a22a87009a4196abd94e0',
                  'address': city}
        try:
            # A timeout keeps one stalled request from hanging the batch.
            res = requests.get(url, params, timeout=10)
            jd = json.loads(res.text)
        except (requests.RequestException, ValueError) as err:
            print("geocode request failed for %s: %s" % (city, err))
            return '', ''

        # A response may carry no 'geocodes' entry at all (presumably error
        # responses such as quota exceeded - confirm against the API docs);
        # treat that the same as an empty result instead of raising KeyError.
        geocodes = jd.get('geocodes') or []
        if not geocodes:
            return '', ''

        print(jd)
        location = geocodes[0]['location']
        address = geocodes[0]['formatted_address']
        print(address)
        return location, address
    
    if __name__ == '__main__':
        # Geocode each ancient name, falling back to the modern name, and
        # collect (ancient name, resolved address, longitude, latitude) rows.
        table = pd.read_excel('gu_where.xlsx')
        gu_name = list(table.gu_where)
        jin_name = list(table.jin_where)
        rows = []
        # Processing starts at row offset 6500.
        for gu, jin in zip(gu_name[6500:], jin_name[6500:]):
            loca, address = coords(gu)
            if loca == '':
                # The ancient name did not resolve; try the modern name.
                loca, address = coords(jin)
            if loca == '':
                continue
            parts = loca.split(',')
            rows.append((gu, address, parts[0], parts[1]))
            print(" ".join((gu, address, parts[0], parts[1])))
        import xlwt

        book = xlwt.Workbook()
        # cell_overwrite_ok allows a cell to be written more than once
        out_sheet = book.add_sheet('sheet1', cell_overwrite_ok=True)

        for col_no, header in enumerate(("gu_name", "jin_name", "lng", "lat")):
            out_sheet.write(0, col_no, header)
        # Row 0 holds the header, so data starts at row 1.
        for row_no, row in enumerate(rows, start=1):
            for col_no, value in enumerate(row):
                out_sheet.write(row_no, col_no, value)

        book.save("gu_jin_lng_lat2.xlsx")

    清洗后数据

    在获得经纬度的地名进行相应的保存

     部分实体导入

    诗人与朝代实体

    import pandas as pd
    import numpy as np
    import re
    from py2neo import Node,Relationship,Graph,NodeMatcher,RelationshipMatcher
    
    # Create a node
    def CreateNode(m_graph,m_label,m_attrs):
        """Create a node labelled ``m_label`` unless one with the same name exists.

        Args:
            m_graph: Graph the node is looked up in and created on.
            m_label: Label of the node.
            m_attrs: Property dict; its ``'name'`` entry is the lookup key.

        Returns:
            The result of ``m_graph.create`` when a node is created, or
            ``None`` when a node with that label and name already exists.
        """
        # Match on the property value instead of splicing the name into a
        # WHERE string: a name containing a quote breaks the hand-built
        # clause, and a non-string name cannot be concatenated at all.
        existing = NodeMatcher(m_graph).match(m_label, name=m_attrs['name']).first()
        if existing is not None:
            return None
        # Create on the graph passed by the caller, not the module-level one.
        return m_graph.create(Node(m_label, **m_attrs))
    # Look up a node
    def MatchNode(m_graph,m_label,m_attrs):
        """Return the first node labelled ``m_label`` whose name matches.

        Args:
            m_graph: Graph to search.
            m_label: Label of the node.
            m_attrs: Property dict; only its ``'name'`` entry is used.

        Returns:
            The first matching node, or ``None`` when there is none.
        """
        # Match on the property value instead of splicing the name into a
        # WHERE string: a name containing a quote breaks the hand-built
        # clause, and a non-string name cannot be concatenated at all.
        return NodeMatcher(m_graph).match(m_label, name=m_attrs['name']).first()
    # Create a relationship
    def CreateRelationship(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
        """Create a relationship of type ``m_r_name`` between two existing nodes.

        Args:
            m_graph: Graph the nodes are looked up in and the relationship is
                created on.
            m_label1, m_attrs1: Label and property dict identifying the start node.
            m_label2, m_attrs2: Label and property dict identifying the end node.
            m_r_name: Relationship type.

        Returns:
            ``False`` when either node cannot be found, otherwise the result
            of ``m_graph.create``.
        """
        reValue1 = MatchNode(m_graph,m_label1,m_attrs1)
        reValue2 = MatchNode(m_graph,m_label2,m_attrs2)
        if reValue1 is None or reValue2 is None:
            return False
        m_r = Relationship(reValue1,m_r_name,reValue2)
        # Create on the graph passed by the caller, not the module-level one.
        return m_graph.create(m_r)
    
    # Build a relationship object between two looked-up nodes
    def findRelationship(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
        """Build a ``Relationship`` between two matched nodes.

        NOTE(review): despite the name, only the two nodes are looked up; the
        returned relationship object is constructed here, not fetched.

        Args:
            m_graph: Graph the nodes are looked up in.
            m_label1, m_attrs1: Label and property dict identifying the start node.
            m_label2, m_attrs2: Label and property dict identifying the end node.
            m_r_name: Mapping whose ``'name'`` entry is the relationship type.

        Returns:
            ``False`` when either node cannot be found, otherwise the
            constructed ``Relationship``.
        """
        start_node = MatchNode(m_graph, m_label1, m_attrs1)
        end_node = MatchNode(m_graph, m_label2, m_attrs2)
        if start_node is not None and end_node is not None:
            return Relationship(start_node, m_r_name['name'], end_node)
        return False
    
    def updateRelation(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
        """Merge a relationship with ``value``/``danwei`` properties between two nodes.

        Args:
            m_graph: Graph the nodes are looked up in and the relationship is
                merged into.
            m_label1, m_attrs1: Label and property dict identifying the start node.
            m_label2, m_attrs2: Label and property dict identifying the end node.
            m_r_name: Mapping with ``'name'`` (relationship type), ``'value'``
                and ``'danwei'`` entries.

        Returns:
            ``False`` when either node cannot be found, otherwise ``None``.
        """
        reValue1 = MatchNode(m_graph, m_label1, m_attrs1)
        reValue2 = MatchNode(m_graph, m_label2, m_attrs2)
        if reValue1 is None or reValue2 is None:
            return False
        print(m_r_name)
        propertyes={'value': m_r_name['value'], 'danwei': m_r_name['danwei']}
        m_r = Relationship(reValue1, m_r_name['name'], reValue2,**propertyes)
        # Merge into the graph passed by the caller, not the module-level one.
        m_graph.merge(m_r)
    
    # Update node properties
    def updateNode(m_graph,m_label1,m_attrs1,new_attrs):
        """Update the properties of an existing node and push the change.

        Args:
            m_graph: Graph the node is looked up in and pushed to.
            m_label1: Label of the node.
            m_attrs1: Property dict identifying the node.
            new_attrs: Properties to set on the node.

        Returns:
            ``False`` when the node cannot be found, otherwise ``None``.
        """
        reValue1 = MatchNode(m_graph, m_label1, m_attrs1)
        if reValue1 is None:
            return False
        reValue1.update(new_attrs)
        # Push to the graph passed by the caller, not the module-level one.
        m_graph.push(reValue1)
    
    
    
    # Module-level connection used by the import functions below.
    # Credentials are passed through ``auth``; ``username`` is not one of the
    # connection settings py2neo documents.
    # NOTE(review): the password is hard-coded; consider loading it from
    # configuration.
    graph = Graph('http://localhost:7474', auth=('neo4j', 'fengge666'))
    
    
    def create_author():
        """Import poets and dynasties from ``./data2/author.xlsx``.

        For every row an ``author`` node and a ``desty`` node are created if
        missing, followed by one relationship in each direction between them.
        Progress is printed for each step.
        """
        data = pd.read_excel('./data2/author.xlsx').fillna("")
        author_label = 'author'
        desty_label = 'desty'
        rows = zip(data.author, data.produce, data.num, data.src, data.desty,
                   data.begin_time, data.end_time, data.zi, data.hao)
        for idx, (name, produce, num, src, dynasty, born, died, zi, hao) in enumerate(rows):
            print(str(idx))
            poet = {"name": name, "produce": produce, "num": num,
                    "src": src, "bg_time": born, "ed_time": died, "zi": zi, "hao": hao}
            CreateNode(graph, author_label, poet)
            print("创建诗人:" + name + "成功!!")
            era = {"name": dynasty}
            if MatchNode(graph, desty_label, era) is None:
                CreateNode(graph, desty_label, era)
                print("创建朝代:" + dynasty + "成功!!")
            # Relationships: poet -> dynasty, then dynasty -> poet
            CreateRelationship(graph, author_label, poet, desty_label, era, "朝代")
            print("创建关系:" + name + "-所属朝代-" + dynasty + "成功")
            CreateRelationship(graph, desty_label, era, author_label, poet, "包含")
            print("创建关系:" + dynasty + "-包含-" + name + "成功")
    
    
    
    # Run the poet/dynasty import when executed as a script.
    if __name__ == '__main__':
        create_author()

    导入效果

     牌名

    包含词牌名,曲牌名

    import pandas as pd
    import numpy as np
    import re
    from py2neo import Node,Relationship,Graph,NodeMatcher,RelationshipMatcher
    
    # Create a node
    def CreateNode(m_graph,m_label,m_attrs):
        """Create a node labelled ``m_label`` unless one with the same name exists.

        Args:
            m_graph: Graph the node is looked up in and created on.
            m_label: Label of the node.
            m_attrs: Property dict; its ``'name'`` entry is the lookup key.

        Returns:
            The result of ``m_graph.create`` when a node is created, or
            ``None`` when a node with that label and name already exists.
        """
        # Match on the property value instead of splicing the name into a
        # WHERE string: a name containing a quote breaks the hand-built
        # clause, and a non-string name cannot be concatenated at all.
        existing = NodeMatcher(m_graph).match(m_label, name=m_attrs['name']).first()
        if existing is not None:
            return None
        # Create on the graph passed by the caller, not the module-level one.
        return m_graph.create(Node(m_label, **m_attrs))
    # Look up a node
    def MatchNode(m_graph,m_label,m_attrs):
        """Return the first node labelled ``m_label`` whose name matches.

        Args:
            m_graph: Graph to search.
            m_label: Label of the node.
            m_attrs: Property dict; only its ``'name'`` entry is used.

        Returns:
            The first matching node, or ``None`` when there is none.
        """
        # Match on the property value instead of splicing the name into a
        # WHERE string: a name containing a quote breaks the hand-built
        # clause, and a non-string name cannot be concatenated at all.
        return NodeMatcher(m_graph).match(m_label, name=m_attrs['name']).first()
    # Create a relationship
    def CreateRelationship(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
        """Create a relationship of type ``m_r_name`` between two existing nodes.

        Args:
            m_graph: Graph the nodes are looked up in and the relationship is
                created on.
            m_label1, m_attrs1: Label and property dict identifying the start node.
            m_label2, m_attrs2: Label and property dict identifying the end node.
            m_r_name: Relationship type.

        Returns:
            ``False`` when either node cannot be found, otherwise the result
            of ``m_graph.create``.
        """
        reValue1 = MatchNode(m_graph,m_label1,m_attrs1)
        reValue2 = MatchNode(m_graph,m_label2,m_attrs2)
        if reValue1 is None or reValue2 is None:
            return False
        m_r = Relationship(reValue1,m_r_name,reValue2)
        # Create on the graph passed by the caller, not the module-level one.
        return m_graph.create(m_r)
    
    # Build a relationship object between two looked-up nodes
    def findRelationship(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
        """Build a ``Relationship`` between two matched nodes.

        NOTE(review): despite the name, only the two nodes are looked up; the
        returned relationship object is constructed here, not fetched.

        Args:
            m_graph: Graph the nodes are looked up in.
            m_label1, m_attrs1: Label and property dict identifying the start node.
            m_label2, m_attrs2: Label and property dict identifying the end node.
            m_r_name: Mapping whose ``'name'`` entry is the relationship type.

        Returns:
            ``False`` when either node cannot be found, otherwise the
            constructed ``Relationship``.
        """
        start_node = MatchNode(m_graph, m_label1, m_attrs1)
        end_node = MatchNode(m_graph, m_label2, m_attrs2)
        if start_node is not None and end_node is not None:
            return Relationship(start_node, m_r_name['name'], end_node)
        return False
    
    def updateRelation(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
        """Merge a relationship with ``value``/``danwei`` properties between two nodes.

        Args:
            m_graph: Graph the nodes are looked up in and the relationship is
                merged into.
            m_label1, m_attrs1: Label and property dict identifying the start node.
            m_label2, m_attrs2: Label and property dict identifying the end node.
            m_r_name: Mapping with ``'name'`` (relationship type), ``'value'``
                and ``'danwei'`` entries.

        Returns:
            ``False`` when either node cannot be found, otherwise ``None``.
        """
        reValue1 = MatchNode(m_graph, m_label1, m_attrs1)
        reValue2 = MatchNode(m_graph, m_label2, m_attrs2)
        if reValue1 is None or reValue2 is None:
            return False
        print(m_r_name)
        propertyes={'value': m_r_name['value'], 'danwei': m_r_name['danwei']}
        m_r = Relationship(reValue1, m_r_name['name'], reValue2,**propertyes)
        # Merge into the graph passed by the caller, not the module-level one.
        m_graph.merge(m_r)
    
    # Update node properties
    def updateNode(m_graph,m_label1,m_attrs1,new_attrs):
        """Update the properties of an existing node and push the change.

        Args:
            m_graph: Graph the node is looked up in and pushed to.
            m_label1: Label of the node.
            m_attrs1: Property dict identifying the node.
            new_attrs: Properties to set on the node.

        Returns:
            ``False`` when the node cannot be found, otherwise ``None``.
        """
        reValue1 = MatchNode(m_graph, m_label1, m_attrs1)
        if reValue1 is None:
            return False
        reValue1.update(new_attrs)
        # Push to the graph passed by the caller, not the module-level one.
        m_graph.push(reValue1)
    
    
    
    # Module-level connection used by the import functions below.
    # Credentials are passed through ``auth``; ``username`` is not one of the
    # connection settings py2neo documents.
    # NOTE(review): the password is hard-coded; consider loading it from
    # configuration.
    graph = Graph('http://localhost:7474', auth=('neo4j', 'fengge666'))
    
    def create_pai_name():
        """Import ci-pai and qu-pai names as nodes.

        Reads the ``title`` column of ``./data2/cipai_name.xlsx`` into
        ``ci_pai`` nodes, then the ``qu_name`` column of
        ``./data2/qupai_name.xlsx`` into ``qu_pai`` nodes, printing one line
        per name.
        """
        # (workbook path, column holding the names, node label, message prefix)
        sources = (
            ('./data2/cipai_name.xlsx', 'title', "ci_pai", "创建词牌名"),
            ('./data2/qupai_name.xlsx', 'qu_name', "qu_pai", "创建曲牌名"),
        )
        for path, column, label, prefix in sources:
            frame = pd.read_excel(path).fillna("")
            for pai_name in list(getattr(frame, column)):
                CreateNode(graph, label, {"name": pai_name})
                print(prefix + pai_name + "成功!!")
    
    
    
    # Run the ci-pai / qu-pai name import when executed as a script.
    if __name__ == '__main__':
        create_pai_name()

    导入效果

     曲牌名:

     

    飞花令

    import pandas as pd
    import numpy as np
    import re
    from py2neo import Node,Relationship,Graph,NodeMatcher,RelationshipMatcher
    
    # Create a node
    def CreateNode(m_graph,m_label,m_attrs):
        """Create a node labelled ``m_label`` unless one with the same name exists.

        Args:
            m_graph: Graph the node is looked up in and created on.
            m_label: Label of the node.
            m_attrs: Property dict; its ``'name'`` entry is the lookup key.

        Returns:
            The result of ``m_graph.create`` when a node is created, or
            ``None`` when a node with that label and name already exists.
        """
        # Match on the property value instead of splicing the name into a
        # WHERE string: a name containing a quote breaks the hand-built
        # clause, and a non-string name cannot be concatenated at all.
        existing = NodeMatcher(m_graph).match(m_label, name=m_attrs['name']).first()
        if existing is not None:
            return None
        # Create on the graph passed by the caller, not the module-level one.
        return m_graph.create(Node(m_label, **m_attrs))
    # Look up a node
    def MatchNode(m_graph,m_label,m_attrs):
        """Return the first node labelled ``m_label`` whose name matches.

        Args:
            m_graph: Graph to search.
            m_label: Label of the node.
            m_attrs: Property dict; only its ``'name'`` entry is used.

        Returns:
            The first matching node, or ``None`` when there is none.
        """
        # Match on the property value instead of splicing the name into a
        # WHERE string: a name containing a quote breaks the hand-built
        # clause, and a non-string name cannot be concatenated at all.
        return NodeMatcher(m_graph).match(m_label, name=m_attrs['name']).first()
    # Create a relationship
    def CreateRelationship(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
        """Create a relationship of type ``m_r_name`` between two existing nodes.

        Args:
            m_graph: Graph the nodes are looked up in and the relationship is
                created on.
            m_label1, m_attrs1: Label and property dict identifying the start node.
            m_label2, m_attrs2: Label and property dict identifying the end node.
            m_r_name: Relationship type.

        Returns:
            ``False`` when either node cannot be found, otherwise the result
            of ``m_graph.create``.
        """
        reValue1 = MatchNode(m_graph,m_label1,m_attrs1)
        reValue2 = MatchNode(m_graph,m_label2,m_attrs2)
        if reValue1 is None or reValue2 is None:
            return False
        m_r = Relationship(reValue1,m_r_name,reValue2)
        # Create on the graph passed by the caller, not the module-level one.
        return m_graph.create(m_r)
    
    # Build a relationship object between two looked-up nodes
    def findRelationship(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
        """Build a ``Relationship`` between two matched nodes.

        NOTE(review): despite the name, only the two nodes are looked up; the
        returned relationship object is constructed here, not fetched.

        Args:
            m_graph: Graph the nodes are looked up in.
            m_label1, m_attrs1: Label and property dict identifying the start node.
            m_label2, m_attrs2: Label and property dict identifying the end node.
            m_r_name: Mapping whose ``'name'`` entry is the relationship type.

        Returns:
            ``False`` when either node cannot be found, otherwise the
            constructed ``Relationship``.
        """
        start_node = MatchNode(m_graph, m_label1, m_attrs1)
        end_node = MatchNode(m_graph, m_label2, m_attrs2)
        if start_node is not None and end_node is not None:
            return Relationship(start_node, m_r_name['name'], end_node)
        return False
    
    def updateRelation(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
        """Merge a relationship with ``value``/``danwei`` properties between two nodes.

        Args:
            m_graph: Graph the nodes are looked up in and the relationship is
                merged into.
            m_label1, m_attrs1: Label and property dict identifying the start node.
            m_label2, m_attrs2: Label and property dict identifying the end node.
            m_r_name: Mapping with ``'name'`` (relationship type), ``'value'``
                and ``'danwei'`` entries.

        Returns:
            ``False`` when either node cannot be found, otherwise ``None``.
        """
        reValue1 = MatchNode(m_graph, m_label1, m_attrs1)
        reValue2 = MatchNode(m_graph, m_label2, m_attrs2)
        if reValue1 is None or reValue2 is None:
            return False
        print(m_r_name)
        propertyes={'value': m_r_name['value'], 'danwei': m_r_name['danwei']}
        m_r = Relationship(reValue1, m_r_name['name'], reValue2,**propertyes)
        # Merge into the graph passed by the caller, not the module-level one.
        m_graph.merge(m_r)
    
    # Update node properties
    def updateNode(m_graph,m_label1,m_attrs1,new_attrs):
        """Update the properties of an existing node and push the change.

        Args:
            m_graph: Graph the node is looked up in and pushed to.
            m_label1: Label of the node.
            m_attrs1: Property dict identifying the node.
            new_attrs: Properties to set on the node.

        Returns:
            ``False`` when the node cannot be found, otherwise ``None``.
        """
        reValue1 = MatchNode(m_graph, m_label1, m_attrs1)
        if reValue1 is None:
            return False
        reValue1.update(new_attrs)
        # Push to the graph passed by the caller, not the module-level one.
        m_graph.push(reValue1)
    
    
    
    # Module-level connection used by the import functions below.
    # Credentials are passed through ``auth``; ``username`` is not one of the
    # connection settings py2neo documents.
    # NOTE(review): the password is hard-coded; consider loading it from
    # configuration.
    graph = Graph('http://localhost:7474', auth=('neo4j', 'fengge666'))
    
    def create_word():
        """Import the ``word`` column of ``./data2/word.xlsx`` as ``word`` nodes.

        One node is requested per entry and a progress line is printed for
        each of them.
        """
        frame = pd.read_excel('./data2/word.xlsx').fillna("")
        node_label = "word"
        for entry in frame.word:
            CreateNode(graph, node_label, {"name": entry})
            print("创建飞花令:" + entry + "成功!!")
    
    
    
    # Run the fei-hua-ling word import when executed as a script.
    if __name__ == '__main__':
        create_word()

    导入效果

     

  • 相关阅读:
    使用jquery插件操作哈希表Hashtable的应用
    学用 ASP.Net 之 "字符串" (1): 基础
    敬告
    jQuery 练习[一]: 准备工作
    学用 ASP.Net 之 "字符串" (2): string.Format
    jQuery 练习[二]: 获取对象(1) 基本选择与层级
    学用 ASP.Net 之 "字符串" (3): string 类的非扩展方法
    jQuery 练习[二]: 获取对象(2) 定位子对象
    一个分割文本文件的小程序 回复 "jellyang" 的问题
    Delphi 7 类库继承关系表
  • 原文地址:https://www.cnblogs.com/xiaofengzai/p/15569940.html
Copyright © 2011-2022 走看看