zoukankan      html  css  js  c++  java
  • Excel、PDF文档解析

    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed, PDFPage
    from pdfminer.pdfinterp import PDFResourceManager,PDFPageInterpreter
    from pdfminer.layout import LAParams, LTTextBoxHorizontal
    from pdfminer.converter import PDFPageAggregator
    
    import sys
    import xlrd
    # Python 2 only: force the interpreter-wide default encoding to UTF-8 so
    # that implicit str <-> unicode conversions of the Chinese text handled
    # below do not fall back to ASCII. reload(sys) is called first because
    # sys.setdefaultencoding is not reachable on a freshly started Python 2
    # interpreter. NOTE(review): this changes behaviour for the whole process
    # and has no Python 3 equivalent -- confirm before porting.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    
    def pdf_transform_text(pdf_path='tempPdfFile_new3.pdf'):
        """Extract the text of a PDF file using pdfminer.

        Only ``LTTextBoxHorizontal`` layout objects are collected; every other
        layout object on a page is ignored.

        Args:
            pdf_path: Path of the PDF to read. Defaults to the temporary file
                name that this function previously had hard-coded.

        Returns:
            The text of every collected text box, in the order pdfminer yields
            them, each followed by a newline (a unicode string on Python 2).

        Raises:
            PDFTextExtractionNotAllowed: If the document reports that it is
                not extractable.
        """
        # Imported locally because the visible import block of this file does
        # not bring ``datetime`` into scope.
        import datetime

        timestamp_format = "%Y-%m-%d %H:%M:%S"
        print("开始解析pdf" + datetime.datetime.now().strftime(timestamp_format))

        chunks = []
        # The context manager guarantees the file is closed even when parsing
        # fails or extraction is not allowed.
        with open(pdf_path, 'rb') as fp:
            # Parser bound to the open file.
            parser = PDFParser(fp)
            # Document object holding the PDF structure.
            document = PDFDocument(parser)
            # Refuse documents that do not permit text extraction.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Resource manager storing shared resources.
            rsrcmgr = PDFResourceManager()
            # Layout-analysis parameters (library defaults).
            laparams = LAParams()
            # Aggregating device that produces one layout per page.
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            # Interpreter that renders pages onto the device.
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # Layout (LTPage) of the page that was just processed.
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        # Collect pieces and join once at the end instead of
                        # encoding/decoding and concatenating per text box.
                        chunks.append(x.get_text())
                        chunks.append(u'\n')

        print("解析pdf成功" + datetime.datetime.now().strftime(timestamp_format))
        return u''.join(chunks)
    
    def _build_company_record(row):
        """Map one worksheet row (a list of cell values) to a record dict.

        Column 0 and column 10 are not stored. ``receive_date`` is only set
        when the row has a column 12.
        """
        record = {
            'company_name': row[1],
            'province': row[2],
            'industry': row[3],
            'broker_company': row[4],
            'broker_person': row[5],
            'law_firm': row[6],
            # NOTE(review): 'laywyer' looks like a typo for 'lawyer', but the
            # key is consumed outside this function, so it is kept as-is.
            'laywyer': row[7],
            'accounting_firm': row[8],
            'accountant': row[9],
            'progress': row[11],
        }
        # Accept rows wider than 13 columns too; an exact ``== 13`` check
        # silently dropped the date whenever a sheet had extra columns.
        if len(row) >= 13:
            record['receive_date'] = row[12]
        return record

    def parse_excel(url, filename):
        """Download an Excel workbook and store one record per data row.

        The workbook is fetched from ``url``, written to the local temporary
        file ``tempExcelFile_new3.xls`` and opened with xlrd. Every sheet is
        scanned starting at row index 2 (the first two rows are skipped --
        presumably headers, confirm against the source workbook). Rows with
        fewer than 12 columns are ignored; every other row is converted to a
        dict and passed to ``saveOrUpdateNew3CompanyBaseInfo``.

        Args:
            url: Location the workbook is downloaded from.
            filename: Name of the document; only used in the log output.
        """
        print('开始解析excel文档 %s %s' % (filename, url))
        req = WebRequests()
        inputStream = req.get(url, timeout=180)
        # Persist the download locally so xlrd can open it from disk.
        with open("tempExcelFile_new3.xls", "wb") as xls:
            xls.write(inputStream.content)
        data = xlrd.open_workbook("tempExcelFile_new3.xls")
        # A workbook may contain several sheets; process each of them.
        for table in data.sheets():
            print(table.name)
            print(table.nrows)
            for row_index in range(2, table.nrows):
                # Read the row once instead of once per extracted column.
                row = table.row_values(row_index)
                if len(row) < 12:
                    continue
                saveOrUpdateNew3CompanyBaseInfo(_build_company_record(row))

    注意:Excel在解析时要看一下有没有多个sheet。该pdf解析只能解析文本内容的pdf。

  • 相关阅读:
    阿里巴巴2015年校招笔试附加题
    hadoop eclipse插件生成
    DevExpress控件的安装及画图控件的使用
    计算二进制数的0的个数
    Docker初探
    AppStore App申请审核加速
    _DataStructure_C_Impl:LinkListBasedSort
    rman数据库恢复;关键/非重要文件、影像副本、控制文件、还原点、非归档、增量、新数据库、灾难性回复
    RenderScript on LLVM笔记
    Oracle数据库备份恢复,巡检须要关注的对象设置以及相关恢复概述
  • 原文地址:https://www.cnblogs.com/liangping/p/7669988.html
Copyright © 2011-2022 走看看