zoukankan      html  css  js  c++  java
  • Python 读取 zip 文件并转存到 Excel

    import os
    import re
    import zipfile
    import logging
    import requests
    from bs4 import BeautifulSoup
    from openpyxl import Workbook
    from openpyxl.utils import get_column_letter
    
    # Root logger setup: records of level INFO and above are written to the
    # file 'new.log'.
    logging.basicConfig(
        filename='new.log',
        # 'a' appends to an existing log; 'w' would overwrite it on every run.
        filemode='a',
        # Minimum severity that is recorded.
        level=logging.INFO,
        # Record layout: timestamp - source path[line:N] - level: message.
        format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
    )
    
    def Readzip(file_name):
        """Collect the contents of the 'guokanzhiguang' members of a zip archive.

        Every member name is printed as it is visited. A member is kept when its
        name contains 'guokanzhiguang' and its content is not empty.

        Args:
            file_name: The archive to open; passed straight to
                ``zipfile.ZipFile``.

        Returns:
            A list of ``bytes`` objects, one per matching non-empty member, in
            archive order. If anything goes wrong, the string
            'Readzip Running Faild!!' is returned instead (legacy contract that
            callers may rely on).
        """
        guokanzhiguang_folder = 'guokanzhiguang'
        try:
            # The context manager closes the archive even if a read fails.
            with zipfile.ZipFile(file_name, 'r') as z:
                guokanzhiguang_list = []
                for filename in z.namelist():
                    print(filename)
                    if guokanzhiguang_folder not in filename:
                        continue
                    content = z.read(filename)
                    if content:
                        guokanzhiguang_list.append(content)
                return guokanzhiguang_list
        except Exception:
            # Keep the legacy failure value, but record the real cause in the
            # log instead of silently discarding it.
            logging.exception('Readzip failed for %r', file_name)
            return 'Readzip Running Faild!!'
    
    def getBookList(letter_lst):
        """Parse each HTML document and collect its book result entries.

        Args:
            letter_lst: Iterable of HTML documents, one per page; each item is
                handed to ``BeautifulSoup`` with the 'html.parser' backend.

        Returns:
            A list with one element per input document, in input order. Each
            element is the ``find_all`` result for ``div`` tags whose class is
            'book-result-item-warp'. If anything goes wrong, the string
            'getBookList Running Faild!!' is returned instead.
        """
        try:
            # One result set per page; must exist before the loop appends to it.
            tag = []
            for html in letter_lst:
                soup = BeautifulSoup(html, 'html.parser')
                tag.append(soup.find_all('div', attrs={'class': "book-result-item-warp"}))
            return tag
        except Exception:
            # Keep the legacy failure value, but log the cause.
            logging.exception('getBookList failed')
            return 'getBookList Running Faild!!'
    
    # Matches bracketed annotations "(...)", "{...}" and "[...]" so they can be
    # removed from a title. Raw string: the backslashes belong to the regex.
    _BRACKETED_RE = re.compile(r"\(.*?\)|\{.*?}|\[.*?]")


    def _parse_book_entry(bookinfo):
        """Turn one book result element into a row of seven values.

        The fixed slice offsets below drop a leading prefix from each text
        field, presumably the on-page field label. NOTE(review): the offsets
        are tied to the current page markup; verify them if the layout changes.
        """
        center = bookinfo.find('div', attrs={'class': 'center'})
        title_text = center.find("div", attrs={"class": "title"}).get_text()
        allinfo = center.find_all('div', attrs={'class': "info"})

        # The factor column joins two spans of the second info block.
        factor_info = allinfo[1]
        ifs = factor_info.find('span', class_='ifs').get_text()
        diff = factor_info.find('span', class_='diff').get_text()

        return [
            _BRACKETED_RE.sub("", title_text),   # journal
            allinfo[0].get_text()[4:],           # country
            ifs + " " + diff,                    # factor
            allinfo[2].get_text()[4:],           # period
            allinfo[3].get_text()[6:],           # ratio
            allinfo[4].find('a').get('href'),    # address (link target)
            allinfo[5].get_text()[5:],           # self-citation
        ]


    def getBookElementInfo(letter_lst):
        """Extract one row of details for every book entry in the given pages.

        Args:
            letter_lst: Iterable of HTML documents; forwarded to
                ``getBookList``.

        Returns:
            A list of rows, each a list of seven values in the order
            journal, country, factor, period, ratio, address, self-citation.
            Returns '' (after printing a notice) when ``getBookList`` yields
            nothing, and the string 'getBookElementInfo Running Faild!!' if
            anything goes wrong while parsing.
        """
        try:
            alllist = getBookList(letter_lst)
            if not alllist:
                print('txt文件不存在或内容为空!!!')
                return ''

            print('文件个数:%d' % len(alllist))
            alldetialbookinfolst = []
            for lst in alllist:
                for bookinfo in lst:
                    alldetialbookinfolst.append(_parse_book_entry(bookinfo))
            return alldetialbookinfolst
        except Exception:
            # Keep the legacy failure value, but log the cause (for example a
            # page whose markup lacks one of the expected elements).
            logging.exception('getBookElementInfo failed')
            return 'getBookElementInfo Running Faild!!'
    
    def Insert2Excel(bookinfo, work_name='gkbookinfolist.xlsx'):
        """Write the parsed rows to a single-sheet Excel workbook.

        The sheet is named 'gk_sheet' and starts with a fixed header row; every
        header column is given a width of 15.

        Args:
            bookinfo: Iterable of rows; each row is passed to ``ws.append``.
            work_name: Path of the workbook to save. Defaults to the original
                hard-coded 'gkbookinfolist.xlsx'.

        Returns:
            'Insert Excel succcessfully!' when the workbook was saved, otherwise
            'Insert Excel failed!'. Both strings are kept exactly as before
            because callers may compare against them.
        """
        # Header: journal, country, factor, period, ratio, address, self-citation.
        tableTitle = ['杂志', '国家', '因子', '周期', '占比', '地址', '自引']
        try:
            wb = Workbook()
            ws = wb.active
            ws.title = 'gk_sheet'
            ws.append(tableTitle)
            # Only the header exists at this point, so max_column is the
            # number of header columns.
            for i in range(1, ws.max_column + 1):
                ws.column_dimensions[get_column_letter(i)].width = 15
            for info in bookinfo:
                ws.append(info)
            wb.save(work_name)
            return 'Insert Excel succcessfully!'
        except Exception:
            # Keep the legacy failure value, but log the cause.
            logging.exception('Insert2Excel failed while writing %r', work_name)
            return 'Insert Excel failed!'
    
    if __name__ == '__main__':
        # The working directory is looked up here but not used further below.
        path = os.getcwd()
        # Pipeline: zip archive -> raw pages -> parsed rows.
        bookinfo = getBookElementInfo(Readzip('bookinfo.zip'))
        # Write the rows to Excel and report the outcome.
        status = Insert2Excel(bookinfo)
        print(status)
  • 相关阅读:
    AC自动机模板
    2013 ACM/ICPC Asia Regional Changsha Online–C (模拟)
    Codeforces126B
    Codeforces182D
    Codeforces149E
    POJ3080
    POJ2752
    HDU4745
    HDU4737
    POJ1226
  • 原文地址:https://www.cnblogs.com/ouzai/p/13723707.html
Copyright © 2011-2022 走看看