# Python爬取高德大数据各项数据(拥堵区域信息,拥堵路段信息,所有重点城市信息等)
"""Scrape AMap (trp.autonavi.com) traffic data and store it in MySQL.

On import the module starts a headless Chrome, downloads the city ranking
once and keeps it in the module-level lists ``code``, ``cityName``,
``jiankangzhishu``, ``yongduzhishu`` and ``speed``.  Each scraper function
empties its target table and refills it; ``allrw`` runs four of them and
re-schedules itself every five minutes.
"""
import datetime
import json
import re
import threading
import time
import traceback
from concurrent.futures import ThreadPoolExecutor
from shlex import join

import pymysql
import requests
from lxml import html
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.options import Options

# ---------------------------------------------------------------------------
# Browser set-up
# ---------------------------------------------------------------------------
option = ChromeOptions()
option.add_experimental_option("excludeSwitches", ["enable-automation"])
option.add_argument("--headless")  # no visible window; needed on display-less Linux hosts
option.add_argument("--disable-gpu")  # the Chrome docs suggest this flag to avoid a bug
option.add_argument('window-size=1920x3000')  # browser resolution
option.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
option.add_argument('blink-settings=imagesEnabled=false')  # do not load images, for speed
# Manually chosen browser binary.  The original literal had lost its
# backslashes ("C:UsersLenovoAppData..."), which is not a valid path.
option.binary_location = r"C:\Users\Lenovo\AppData\Local\Google\Chrome\Application\chrome.exe"
web = Chrome(options=option)  # apply the options to the browser
etree = html.etree

# ---------------------------------------------------------------------------
# Constants and small helpers
# ---------------------------------------------------------------------------
RANK_URL = "https://trp.autonavi.com/diagnosis/rank.do"
# NOTE(review): the date in these URLs is hard-coded to 2021-05-11, as in the
# original script; confirm whether it should follow the current date.
TRAVEL_IN_URL = 'https://trp.autonavi.com/cityTravel/inAndOutCity.do?adcode=100000&dt=2021-05-11&willReal=WILL&inOut=IN&size=50'
TRAVEL_OUT_URL = 'https://trp.autonavi.com/cityTravel/inAndOutCity.do?adcode=100000&dt=2021-05-11&willReal=WILL&size=50&inOut=OUT'

MAX_CITIES = 101  # the original script always used the first 101 ranking entries
MAX_RELATED_CITIES = 20  # qrcity/qccity store at most this many cities per city
REQUEST_TIMEOUT = 30  # seconds; keeps a stalled HTTP call from blocking a timer thread forever

# Chrome shows a raw JSON response inside a <pre> element.  Matching any <pre>
# (rather than one exact <html><head></head><body> wrapper) also accepts the
# exact markup the original pattern required.
_PRE_PAYLOAD = re.compile(r'<pre[^>]*>(?P<ul>.*?)</pre>', re.S)


def _browser_json(url):
    """Open *url* in the shared browser and return the JSON it displays.

    Raises:
        ValueError: if the rendered page has no ``<pre>`` payload or the
            payload is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError``).
    """
    web.get(url)
    found = _PRE_PAYLOAD.search(web.page_source)
    if found is None:
        raise ValueError(f"no JSON <pre> payload found at {url}")
    return json.loads(found.group('ul'))


def _insert_rows(cursor, sql, rows):
    """Insert every row of *rows* with the parameterized statement *sql*."""
    if rows:
        cursor.executemany(sql, rows)


# ---------------------------------------------------------------------------
# City ranking, loaded once at import time
# ---------------------------------------------------------------------------
# NOTE(review): these lists are never refreshed, so update_city() re-inserts
# the same start-up snapshot on every run; confirm whether that is intended.
j = _browser_json(RANK_URL)
_ranked = j[:MAX_CITIES]  # a slice tolerates a ranking shorter than MAX_CITIES
code = [entry["adcode"] for entry in _ranked]
cityName = [entry['cityName'] for entry in _ranked]
jiankangzhishu = [entry["healthValue"] for entry in _ranked]
yongduzhishu = [entry['idx1'] for entry in _ranked]
speed = [entry['realSpeed'] for entry in _ranked]


# ---------------------------------------------------------------------------
# Database helpers
# ---------------------------------------------------------------------------
def get_conn():
    """Open a MySQL connection to the ``jtsk`` database.

    Returns:
        A ``(connection, cursor)`` tuple.
    """
    # NOTE(review): credentials are hard-coded; consider moving them to configuration.
    conn = pymysql.connect(
        host="localhost",
        user="root",
        password="123456",
        db="jtsk",
        charset="utf8",
        port=3306,
    )
    cursor = conn.cursor()
    return conn, cursor


def close_conn(conn, cursor):
    """Close *cursor* and then *conn*, skipping whichever is falsy."""
    if cursor:
        cursor.close()
    if conn:
        conn.close()


# ---------------------------------------------------------------------------
# Scrapers
# ---------------------------------------------------------------------------
def cityquyu():
    """Refill ``cityquyu`` and append the same rows to ``cityquyu_all``.

    For every city in the ranking the district ranking is fetched over HTTP.
    Any exception is printed with its traceback and ends the run.
    """
    cursor = None
    conn = None
    try:
        tm = time.strftime("%Y-%m-%d %H:%M", time.localtime())
        conn, cursor = get_conn()
        cursor.execute('delete from cityquyu')
        conn.commit()
        sql = "insert into cityquyu(city,name,zhishu,speed,time ) values(%s,%s,%s,%s,%s)"
        sql_all = "insert into cityquyu_all(city,name,zhishu,speed,time ) values(%s,%s,%s,%s,%s)"
        for adcode, cityname in zip(code, cityName):
            url = f'https://trp.autonavi.com/ajax/districtRank.do?linksType=1&cityCode={adcode}'
            resp = requests.get(url, timeout=REQUEST_TIMEOUT)
            rows = [
                [cityname, district['name'], float(district['index']), float(district['speed']), tm]
                for district in resp.json()
            ]
            # One connection serves both tables; the original opened a new
            # connection for every inserted row.
            _insert_rows(cursor, sql, rows)
            _insert_rows(cursor, sql_all, rows)
            conn.commit()
    except Exception:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)


def roadyuce():
    """Refill ``roadyuce`` with tomorrow's top-road data for every city.

    Any exception is printed with its traceback and ends the run.
    """
    cursor = None
    conn = None
    try:
        tm = (datetime.datetime.now() + datetime.timedelta(days=1)).strftime('%Y-%m-%d')  # tomorrow
        conn, cursor = get_conn()
        cursor.execute('delete from roadyuce')
        conn.commit()
        sql = """
        insert into roadyuce(name,roadname,zhishu,speed,dir,zuobiao) values(%s,%s,%s,%s,%s,%s)
        """
        for adcode, cityname in zip(code, cityName):
            url = f'https://trp.autonavi.com/ajax/getCityRoadTop.do?adcode={adcode}&date={tm}'
            resp = requests.get(url, timeout=REQUEST_TIMEOUT)
            js = resp.json()
            # NOTE(review): the last element is skipped, exactly as the original
            # range(0, len(js) - 1) did; confirm whether that is deliberate.
            rows = [
                [cityname, road["cityName"], road["idx"], road["speed"], road["dir"], str(road["lnglats"])]
                for road in js[:-1]
            ]
            _insert_rows(cursor, sql, rows)
            conn.commit()
    except Exception:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)


def qxcity1():
    """Refill ``qxcity1`` from the ``inOut=IN`` city list (via the browser).

    Any exception is printed with its traceback and ends the run.
    """
    cursor = None
    conn = None
    try:
        conn, cursor = get_conn()
        cursor.execute('delete from qxcity1')
        conn.commit()
        sql = "insert into qxcity1(code,name,zhishu) values(%s,%s,%s)"
        rows = [[city["adcode"], city["name"], city["willIdx"]] for city in _browser_json(TRAVEL_IN_URL)]
        _insert_rows(cursor, sql, rows)
        conn.commit()
    except Exception:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)


def qxcity2():
    """Refill ``qxcity2`` from the ``inOut=OUT`` city list (via the browser).

    Any exception is printed with its traceback and ends the run.
    """
    cursor = None
    conn = None
    try:
        conn, cursor = get_conn()
        cursor.execute('delete from qxcity2')
        conn.commit()
        sql = "insert into qxcity2(code,name,zhishu) values(%s,%s,%s)"
        rows = [[city["adcode"], city["name"], city["willIdx"]] for city in _browser_json(TRAVEL_OUT_URL)]
        _insert_rows(cursor, sql, rows)
        conn.commit()
    except Exception:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)


def cityroad():
    """Refill ``cityroad`` and append the same rows to ``cityroad_all``.

    For every city in the ranking the road ranking (``tableData``) is fetched
    over HTTP.  Any exception is printed with its traceback and ends the run.
    """
    cursor = None
    conn = None
    try:
        tm = time.strftime("%Y-%m-%d %H:%M", time.localtime())
        conn, cursor = get_conn()
        cursor.execute('delete from cityroad')
        conn.commit()
        sql = """
        insert into cityroad(city,name,dir,zhishu,speed,length,time,coords) values(%s,%s,%s,%s,%s,%s,%s,%s)
        """
        sql_all = """
        insert into cityroad_all(city,name,dir,zhishu,speed,length,time,coords) values(%s,%s,%s,%s,%s,%s,%s,%s)
        """
        for adcode, cityname in zip(code, cityName):
            url = f'https://trp.autonavi.com/ajax/roadRank.do?roadType=0&timeType=0&cityCode={adcode}'
            resp = requests.get(url, timeout=REQUEST_TIMEOUT)
            js = resp.json()['tableData']
            # NOTE(review): the last element is skipped, exactly as the original
            # range(0, len(js) - 1) did; confirm whether that is deliberate.
            rows = [
                [cityname, road["name"], road["dir"], road["index"], road["speed"],
                 road["length"], tm, str(road["coords"])]
                for road in js[:-1]
            ]
            # One connection serves both tables; the original opened a new
            # connection for every inserted row.
            _insert_rows(cursor, sql, rows)
            _insert_rows(cursor, sql_all, rows)
            conn.commit()
    except Exception:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)


def qrcity():
    """Refill ``qrcity`` with (city, related city) pairs for ``inOut=IN``.

    For each city of the nation-wide list, that city's own list is loaded in
    the browser and up to ``MAX_RELATED_CITIES`` names are stored.  Any
    exception is printed with its traceback and ends the run.
    """
    cursor = None
    conn = None
    try:
        conn, cursor = get_conn()
        cursor.execute('delete from qrcity')
        conn.commit()
        sql = "insert into qrcity(city,name) values(%s,%s)"
        for city in _browser_json(TRAVEL_IN_URL):
            url1 = f'https://trp.autonavi.com/cityTravel/inAndOutCity.do?adcode={city["adcode"]}&dt=2021-05-11&willReal=WILL&size=20&inOut=IN'
            related = _browser_json(url1)
            # A slice, unlike range(0, 20), does not fail on a shorter list.
            rows = [[city["name"], other["name"]] for other in related[:MAX_RELATED_CITIES]]
            _insert_rows(cursor, sql, rows)
        conn.commit()
    except Exception:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)


def qccity():
    """Refill ``qccity`` with (city, related city) pairs for ``inOut=OUT``.

    For each city of the nation-wide list, that city's own list is loaded in
    the browser and up to ``MAX_RELATED_CITIES`` names are stored.  Any
    exception is printed with its traceback and ends the run.
    """
    cursor = None
    conn = None
    try:
        conn, cursor = get_conn()
        cursor.execute('delete from qccity')
        conn.commit()
        sql = "insert into qccity(city,name) values(%s,%s)"
        for city in _browser_json(TRAVEL_OUT_URL):
            url1 = f'https://trp.autonavi.com/cityTravel/inAndOutCity.do?adcode={city["adcode"]}&dt=2021-05-11&willReal=WILL&inOut=OUT&size=20'
            related = _browser_json(url1)
            # A slice, unlike range(0, 20), does not fail on a shorter list.
            rows = [[city["name"], other["name"]] for other in related[:MAX_RELATED_CITIES]]
            _insert_rows(cursor, sql, rows)
        conn.commit()
    except Exception:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)


def update_city():
    """Refill ``city`` from the ranking lists loaded at import time.

    Any exception is printed with its traceback and ends the run.
    """
    cursor = None
    conn = None
    try:
        tm = time.strftime("%Y-%m-%d %H:%M", time.localtime())
        conn, cursor = get_conn()
        cursor.execute('delete from city')
        conn.commit()
        sql = "insert into city(code,cityName,jiankangzhishu,yongduzhishu,speed,time) values(%s,%s,%s,%s,%s,%s)"
        rows = [
            [adcode, name, health, congestion, real_speed, tm]
            for adcode, name, health, congestion, real_speed
            in zip(code, cityName, jiankangzhishu, yongduzhishu, speed)
        ]
        _insert_rows(cursor, sql, rows)
        conn.commit()
    except Exception:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)


# ---------------------------------------------------------------------------
# Scheduler
# ---------------------------------------------------------------------------
def allrw():
    """Run the periodic scrapers once and schedule the next run in 300 s.

    The timer is started before the work begins, so runs start every five
    minutes regardless of how long a run takes.
    """
    print('===================================================================================')
    # NOTE(review): a run that lasts longer than 300 s will overlap the next one.
    thr = threading.Timer(300, allrw)  # every 5 minutes
    thr.start()
    now1 = datetime.datetime.now()
    print(f'{now1} ----- 开始执行')
    update_city()
    cityquyu()
    cityroad()
    roadyuce()
    now2 = datetime.datetime.now()
    print(f'{now2} ----- 执行结束 用时{now2 - now1}')
    print('=================================================================================== ')


if __name__ == '__main__':
    allrw()
    print("ok")