zoukankan      html  css  js  c++  java
  • Python——阶段总结(一)

    import xlrd # 读xlsx
    import xlsxwriter # 写xlsx
    import urllib.request # url请求,Python3自带,Python2与3中urllib的区别见:http://blog.csdn.net/Jurbo/article/details/52313636
    import os # 创建output文件夹
    import glob # 获取文件夹下文件名称
    import time # 记录时间
    import json # 读取json格式文件
    
    def xlsx_merge(folder,header,filename):
        """Merge the first sheet of every ``*.xlsx`` file in *folder* into one workbook.

        The first row of each source sheet is skipped (treated as that file's
        header); all remaining rows are appended, file after file, below a single
        header row in a sheet named ``"merged"``.

        Args:
            folder: Directory prefix, concatenated directly with file names, so it
                must end with a path separator.
            header: Sequence of column titles written to row 0 of the output.
            filename: Base name (without extension) of the output workbook, which
                is written to ``folder + filename + ".xlsx"``.

        Files are processed in sorted path order so repeated runs give the same
        row order. A file whose first sheet cannot be read is reported and skipped.
        """
        output_path = folder + filename + ".xlsx"
        output_key = os.path.normcase(os.path.abspath(output_path))
        # Never feed a previously written merge result back into itself.
        source_paths = [
            path for path in sorted(glob.glob(folder + "*.xlsx"))
            if os.path.normcase(os.path.abspath(path)) != output_key
        ]

        merged_rows = []
        merged_count = 0
        for path in source_paths:
            work_book = xlrd.open_workbook(path)
            try:
                source_sheet = work_book.sheet_by_index(0)
            except Exception as e:
                # Skip this file instead of falling through with an undefined
                # (or stale, from the previous file) sheet object.
                print(e)
                continue
            # Row 0 is the per-file header; only data rows are collected.
            for row_idx in range(1, source_sheet.nrows):
                merged_rows.append(source_sheet.row_values(row_idx))
            merged_count += 1

        out_book = xlsxwriter.Workbook(output_path)
        out_sheet = out_book.add_worksheet("merged")
        out_sheet.write_row(0, 0, header)
        for row_index, row in enumerate(merged_rows, start=1):
            out_sheet.write_row(row_index, 0, row)
        # xlsxwriter only produces the file on close(); it is deliberately not
        # called from a ``finally`` so a failed merge leaves no partial file.
        out_book.close()
        print("已完成%d个文件的合并"%merged_count)
        
    def poi_by_adcode_poicode(folder,city_file = "city",poi_file = "poi",merge_or_not = 1,api_key = "你的key"):
        """Download AMap POI records per district and POI type into xlsx files.

        Reads three workbooks from ``folder + "input/"`` (row 0 of each is treated
        as a header and skipped):

        * ``folder.xlsx`` - column 0: output sub-folder name, column 1: code.
        * ``<city_file>.xlsx`` - column 0: district name, column 1: district code.
        * ``<poi_file>.xlsx`` - column 1: POI type code.

        A district is written into a sub-folder when the first four characters of
        its code equal the first four characters of that sub-folder's code. The
        district list is consumed in order, so it is expected to be grouped in the
        same order as ``folder.xlsx``. The codes are sliced as strings; presumably
        they are stored as text cells - verify against the input files.

        One workbook is written per (district, POI type). A workbook that already
        exists is skipped, which is what makes an interrupted run resumable.

        Args:
            folder: Working directory prefix; must end with a path separator.
            city_file: Base name of the district workbook inside ``input/``.
            poi_file: Base name of the POI-type workbook inside ``input/``.
            merge_or_not: ``1`` to merge each sub-folder's workbooks with
                ``xlsx_merge`` once the sub-folder is finished.
            api_key: AMap web-service key inserted into the request URL. The
                default is the original placeholder and must be replaced.
        """
        # All columns written to the result sheet; the last two come from "biz_ext".
        header_full = ["id","name","type","typecode","biz_type","address","location","tel","pname","cityname","adname","rating","cost"]
        # Top-level POI fields, in output column order.
        header = ["id","name","type","typecode","biz_type","address","location","tel","pname","cityname","adname"]
        # Records requested per page (original author's note: official limit is 25,
        # 50 worked in practice).
        offset = 25

        # Load the three input lists.
        input_folder = folder + "input/"
        folder_sheet = xlrd.open_workbook(input_folder + "folder.xlsx").sheet_by_index(0)
        folder_list = folder_sheet.col_values(0)
        folder_code_list = folder_sheet.col_values(1)
        city_sheet = xlrd.open_workbook(input_folder + city_file + ".xlsx").sheet_by_index(0)
        city_list = city_sheet.col_values(0)
        city_code_list = city_sheet.col_values(1)
        poi_type_sheet = xlrd.open_workbook(input_folder + poi_file + ".xlsx").sheet_by_index(0)
        poi_type_list = poi_type_sheet.col_values(1)

        # Header rows are excluded from both the total and the per-city count so
        # that the reported progress tops out at 100%.
        poi_type_count = len(poi_type_list) - 1
        total_work = (city_sheet.nrows - 1) * (poi_type_sheet.nrows - 1)
        city_col_index = 1 # first district row not yet assigned to a sub-folder

        print(_timestamp() + ":抓取开始!")
        for folder_index in range(1,len(folder_list)): # one output sub-folder per row
            folder_name = folder_list[folder_index]
            output_folder = folder + folder_name + "/"
            os.makedirs(output_folder, exist_ok=True)

            for city_index in range(city_col_index,len(city_list)):
                # Stop at the first district that belongs to another sub-folder.
                if folder_code_list[folder_index][0:4] != city_code_list[city_index][0:4]:
                    break
                city_name = str(city_list[city_index])
                city_code = city_code_list[city_index]

                for poi_type_index in range(1,len(poi_type_list)):
                    poi_type = str(poi_type_list[poi_type_index])
                    workbook_file = output_folder + city_name + poi_type + ".xlsx"
                    if os.path.exists(workbook_file):
                        print(city_name + poi_type + " 已存在")
                        continue

                    workbook = xlsxwriter.Workbook(workbook_file)
                    sheet = workbook.add_worksheet("result")
                    for col_index, title in enumerate(header_full):
                        sheet.write(0,col_index,title)

                    row_index = 1
                    for page_index in range(1, 101):
                        # Request format: http://lbs.amap.com/api/webservice/guide/api/search/
                        url = "http://restapi.amap.com/v3/place/text?&keywords=&types=" + poi_type + "&city=" + city_code + "&citylimit=true&offset=" + str(offset) + "&page="+ str(page_index) + "&key=" + api_key + "&extensions=all"
                        try:
                            with urllib.request.urlopen(url) as response:
                                data = json.load(response)["pois"]
                            for poi in data:
                                for col_index, field in enumerate(header):
                                    sheet.write(row_index, col_index, str(poi[field]))
                                sheet.write(row_index,len(header),str(poi["biz_ext"]["rating"]))
                                sheet.write(row_index,len(header) + 1,str(poi["biz_ext"]["cost"]))
                                row_index += 1
                        except Exception as err:
                            # Best effort: keep what was fetched so far, but report
                            # why paging stopped. KeyboardInterrupt is no longer
                            # swallowed, so an aborted run leaves no workbook behind
                            # (xlsxwriter only writes the file on close()).
                            print(_timestamp() + " " + city_name + " " + poi_type + " 第%d页抓取失败:%s" % (page_index, err))
                            break
                        if len(data) < offset:
                            # A short (or empty) page is the last one.
                            break
                    workbook.close()

                    work_index = (city_index - 1) * poi_type_count + poi_type_index
                    print(_timestamp() + city_name + " " + poi_type + " 已获取!进度:%.3f%%"  %(work_index / total_work *100))
                city_col_index += 1
            print(_timestamp() + folder_name + "已完成!")

            if merge_or_not == 1:
                # Skip the merge when a merged workbook already exists.
                if not os.path.exists(output_folder + folder_name + ".xlsx"):
                    xlsx_merge(output_folder, header_full, folder_name)
                    print(_timestamp() + ":已对文件进行合并!")
            else:
                print(_timestamp() + ":未进行合并!")
        print(_timestamp() + ":所有工作完成!")

    def _timestamp():
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
        
    # Script entry: crawl into E:/XXDir/ using input/city.xlsx and input/poi.xlsx,
    # merging each sub-folder's workbooks when it is finished.
    poi_by_adcode_poicode(
        "E:/XXDir/",
        city_file="city",
        poi_file="poi",
        merge_or_not=1,
    )

    1、如果是将数据储存在excel中(存储在数据库中时,不用考虑),最好将结果分阶段保存成单独文件并及时输出时间和进度

    2、可以通过判断以前保存的文件是否存在达到断点续爬的目的,也可以通过此方式,实现多主机共享进度(农村人的分布式爬取^-^,通过建立共享文件夹)。

    3、做多重循环时,要考虑清楚循环之间的步骤应该置于哪个循环之下。

    4、通过将复杂的功能拆分成多个小的功能,可以更好地完成一段复杂代码的编写。

    5、尽可能将实现的功能编写成函数和库,以便下次调用。

  • 相关阅读:
    Could not find package vendor/name in a version matching v-Number 是坑!
    Magento 2 Error: A technical problem with the server created an error. Try again to continue what you were doing. If the problem persists, try again later.
    七牛Qshell 常用命令打印
    VBA
    XUGUO-书呆子-搜索书箱
    Magento2 API 服务合同设计模式 依赖注入 介绍
    Magento2 Service contracts 服务合同
    Magento2自定义命令
    在Magento 2中创建管理员菜单
    Routing 为 Magento 2 一个重要的部分,本文介绍基本应用
  • 原文地址:https://www.cnblogs.com/shadrach/p/7873459.html
Copyright © 2011-2022 走看看