工作中遇到了处理大批量数据的问题:Excel 中大约有 8 万条数据,需要逐条取出来调用 API,对返回的结果进行数据过滤后再写回 Excel。单线程大概要跑 2 个多小时,实在麻烦——万一代码里有 bug,快结束的时候才报错,就前功尽弃了。于是换成多线程,15 个线程大概只跑了 10 多分钟,太棒了。需要注意的是,线程数不能设置太多,否则 server 可能来不及响应,导致 timeout。
import requests
from openpyxl import load_workbook,workbook
from concurrent.futures import ThreadPoolExecutor
import threading
import time
import json

# Worksheet that holds the input locations and receives the results.
SHEET_NAME = "评测数据+计算"
# Default inclusive row range read by getLocation().
FIRST_DATA_ROW = 2
LAST_DATA_ROW = 76058
# Seconds to wait on one API call; without a timeout a stalled connection
# would block its worker thread indefinitely.
REQUEST_TIMEOUT = 30

# Shared store filled by the worker threads: {row number: raw response text}.
info = {}


def sendGet(location):
    """Call the API for one row and store the raw response in ``info``.

    Args:
        location: Sequence ``[row_number, origin, destination]`` as produced
            by ``getLocation``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = "https://xxxxxxx"
    # len(info) doubles as a rough progress counter across all threads.
    print(f"========================================第{len(info)}条========================================")
    params = {
        "origins": location[1],
        "destination": location[2],
        "type": "1",
        "key": "xxxxx",
    }
    res = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
    info[location[0]] = res.text


def getLocation(filepath, first_row=FIRST_DATA_ROW, last_row=LAST_DATA_ROW):
    """Read the two location columns (B and D) from the worksheet.

    Args:
        filepath: Path of the workbook to read.
        first_row: First worksheet row to read (inclusive).
        last_row: Last worksheet row to read (inclusive). Pass ``None`` to
            read up to the last row the worksheet reports.

    Returns:
        A list of ``[row_number, column_B_value, column_D_value]`` lists,
        one per row in the requested range.
    """
    wb = load_workbook(filepath)
    sheet = wb[SHEET_NAME]
    if last_row is None:
        last_row = sheet.max_row
    # Columns 2..4 are B..D; only the first (B) and last (D) cell are used.
    rows = sheet.iter_rows(
        min_row=first_row, max_row=last_row, min_col=2, max_col=4
    )
    return [
        [row_number, cells[0].value, cells[2].value]
        for row_number, cells in enumerate(rows, start=first_row)
    ]


def filter_data(info):
    """Yield the fields of interest from each stored API response.

    Args:
        info: Mapping of row number to raw JSON response text.

    Yields:
        Dicts with the keys ``row``, ``status``, ``count`` and ``distance``.
        ``distance`` is an empty string when the response has no results.
        Responses that are not a JSON object are reported and skipped, so
        one bad response does not abort the whole write-back.
    """
    for row, text in info.items():
        try:
            result = json.loads(text)
        except ValueError as err:
            print(f"row {row}: response is not valid JSON, skipped ({err})")
            continue
        if not isinstance(result, dict):
            print(f"row {row}: response is not a JSON object, skipped")
            continue
        results = result.get("results")
        distance = results[0].get("distance") if results else ""
        yield {
            "row": row,
            "status": result.get("status"),
            "count": result.get("count"),
            "distance": distance,
        }


def write_excel(res_info, path=None):
    """Write the filtered results into columns E, F and G and save the file.

    Args:
        res_info: Iterable of dicts as yielded by ``filter_data``.
        path: Workbook to update. Defaults to the module-level ``filepath``
            set by the script entry point.
    """
    if path is None:
        path = filepath
    wb = load_workbook(path)
    sheet = wb[SHEET_NAME]
    for item in res_info:
        row = item.get("row")
        sheet[f"E{row}"] = item.get("distance")
        sheet[f"F{row}"] = item.get("status")
        sheet[f"G{row}"] = item.get("count")
    wb.save(path)


if __name__ == '__main__':
    # NOTE(review): these paths look like they lost their backslash
    # separators somewhere along the way - confirm before running.
    # filepath = r"C:UsersfengziDesktop性能横评_8月.xlsx"
    # filepath = r"C:UsersfengziDesktop est3.xlsx"
    filepath = r"C:UsersfengziDesktop性能横评_8月 new.xlsx"
    location_map = getLocation(filepath)
    with ThreadPoolExecutor(max_workers=15, thread_name_prefix="test-") as pool:
        pending = [(loc, pool.submit(sendGet, loc)) for loc in location_map]
    # Leaving the with-block waits for every task, so all futures are done.
    # Report failures instead of letting them vanish silently.
    for loc, future in pending:
        error = future.exception()
        if error is not None:
            print(f"row {loc[0]}: request failed, nothing written ({error!r})")
    res_info = filter_data(info)
    write_excel(res_info, filepath)