今天完成了对行业和行业单位进行匹配 以及可视化,主要用到了
difflib 来进行对数据的操作
import pymysql import difflib def get_equal(str1, str2): data=difflib.SequenceMatcher(None, str1, str2).quick_ratio() return data def mysql(b,code,id): db = pymysql.connect(host='localhost', user='root', passwd='199126', db='kettle', port=3306, charset='utf8') # 使用cursor()方法获取操作游标 cur = db.cursor() sql = "update keji3_copy1 set hangye = '%s',hangyedaima= %s where id = %s" % (b, code, id) cur.execute(sql) # 像sql语句传递参数 # 提交 db.commit() def name (hynane,id): db1 = pymysql.connect(host='localhost', user='root', passwd='199126', db='kettle', port=3306, charset='utf8') cursor = db1.cursor() sql = "SELECT name,daima FROM data" cursor.execute(sql) datalist = [] codelist = [] alldata = cursor.fetchall() for s in alldata: datalist.append(s[0]) codelist.append(s[1]) #print(datalist) a=0 b='' j=0 code='' for i in datalist: j=j+1 list=get_equal(i,hynane) if(list>a): a=list b=i code=codelist[j] print(b,code,id) return b,code,id db = pymysql.connect(host='localhost', user='root', passwd='199126', db='kettle', port=3306, charset='utf8') cursor = db.cursor() sql1 = "SELECT jianjie,id FROM keji3_copy1 where name is not null and hangye is not null and industry is not null" cursor.execute(sql1) alldata = cursor.fetchall() for s in alldata: hynane=s[0] id=s[1] print(hynane,id) data,data1,data2=name(hynane,id) mysql(data,data1,data2)
之前的关键词匹配
用到了jieba库
# encoding: utf-8 import jieba import jieba.posseg as pseg #词性标注 import jieba.analyse as anls #关键词提取 import pymysql def guanjian(s): y='' for x, w in anls.extract_tags(s, topK=6,withWeight=True): y=y+' '+x return y def mysql(): # 使用这个函数例如传入一个地址 网安大厦,当然传入的地址也不能太模糊了 # 打开数据库连接 db = pymysql.connect(host='localhost', user='root', passwd='199126', db='kettle', port=3306, charset='utf8') db1 = pymysql.connect(host='localhost', user='root', passwd='199126', db='kettle', port=3306, charset='utf8') # 使用 cursor() 方法创建一个游标对象 cursor cursor = db.cursor() cursor1 = db1.cursor() # SQL 查询语句 sql = "SELECT * FROM keji3 where guanjianci is null" try: # 正确 # 执行SQL语句 cursor.execute(sql) # 获取所有记录列表 # db.commit() 这是在增删改的情况下 下这条语句是提交到数据库 results = cursor.fetchall() i = 0 for row in results: jianjie = row[10].strip(' ') guanjianci=row[9] i+=1 id=row[0] guan=guanjian(jianjie) try: sql1 = "UPDATE keji3 SET guanjianci ='%s' where id = %s" % (guan, id) cursor1.execute(sql1) # 提交到数据库执行 db1.commit() except: print("2") # 打印结果 print(guan, id) except: # 发生错误 print("1") # 关闭数据库连接 db.close() mysql()
规范化地域和行政代码用了百度地图接口
import requests import json import pymysql ''' 函数功能:根据传入的比较模糊的地址获取精确的结构化地址 函数实现流程:利用百度地图api的正地理编码可以获取该位置的经纬度 接着使用经纬度采用逆地址编码获取结构化地址 ''' def get_formatted_address(address): # 根据百度地图api接口获取正地址编码也就是经纬度 url1 = 'http://api.map.baidu.com/geocoding/v3/?address=' + address + '&output=json&ak=LXkcInigDnERPxfjbGZH2hKm7kr0uO5U&callback=showLocation' # 获取经纬度 resp1 = requests.get(url1) resp1_str = resp1.text resp1_str = resp1_str.replace('showLocation&&showLocation', '') resp1_str = resp1_str[1:-1] resp1_json = json.loads(resp1_str) location = resp1_json.get('result').get('location') # 根据经纬度获取结构化地址 lng = location.get('lng') lat = location.get('lat') url2 = 'http://api.map.baidu.com/reverse_geocoding/v3/?ak=LXkcInigDnERPxfjbGZH2hKm7kr0uO5U&output=json&coordtype=wgs84ll&location=' + str( lat) + ',' + str(lng) + '' resp2 = requests.get(url2) resp2_json = json.loads(resp2.text) # 提取结构化地址 province = resp2_json.get('result').get('addressComponent').get('province') city=resp2_json.get('result').get('addressComponent').get('city') district = resp2_json.get('result').get('addressComponent').get('district') if(province==city): address=city+district if(province!=city): address=province+city+district gj = resp2_json.get('result').get('addressComponent').get('adcode') return address,gj def mysql(): # 使用这个函数例如传入一个地址 网安大厦,当然传入的地址也不能太模糊了 # 打开数据库连接 db = pymysql.connect(host='localhost', user='root', passwd='199126', db='kettle', port=3306, charset='utf8') db1 = pymysql.connect(host='localhost', user='root', passwd='199126', db='kettle', port=3306, charset='utf8') # 使用 cursor() 方法创建一个游标对象 cursor cursor = db.cursor() cursor1 = db1.cursor() # SQL 查询语句 sql = "SELECT * FROM keji3_copy1 where id>1591 " try: # 正确 # 执行SQL语句 cursor.execute(sql) # 获取所有记录列表 # db.commit() 这是在增删改的情况下 下这条语句是提交到数据库 results = cursor.fetchall() i = 0 for row in results: danwei = row[13].split(' ')[0] city=row[5] danwei1=city+'市'+danwei id=row[0] address,gj = get_formatted_address(danwei1) try: sql1 = "UPDATE keji3_copy1 SET guifandiyu ='%s',xzbm=%s where id = %s"% (address, gj, id) cursor1.execute(sql1) # 提交到数据库执行 db1.commit() except: print("2") # 打印结果 print(address,gj,id) except: # 发生错误 print("1") # 关闭数据库连接 db.close() mysql() ''' address,gj = get_formatted_address('天津市大港油田滩海勘探开发公司') print(address,gj) '''