zoukankan      html  css  js  c++  java
  • Python 分析 PDF 文件（基于 pdf2htmlEX.exe，Python 3.6）

    from html.parser import HTMLParser
    import json
    import re
    from openpyxl import Workbook
    from openpyxl.utils import get_column_letter
    from itertools import islice
    import subprocess
    import os
    import shutil




    def runApp(command, message='', timeout=30, retries=3):
        """Run *command* through the shell and return its captured output.

        The command is started with ``shell=True``; *message* is encoded and
        written to the child's stdin.  The call is retried when starting the
        process or communicating with it raises.

        Args:
            command: Shell command line to execute.
            message: Text sent to the child's standard input.
            timeout: Seconds to wait for the child on each attempt.
            retries: Maximum number of attempts.

        Returns:
            A ``(stdout, stderr)`` tuple of ``bytes``.

        Raises:
            Exception: with the message ``"Error <command>"`` once every
                attempt has failed; the last underlying error is chained
                as ``__cause__``.
        """
        # NOTE(review): the command string is handed to the shell verbatim, so
        # callers must not interpolate untrusted text into it.
        last_error = None
        for _attempt in range(retries):
            proc = None
            try:
                proc = subprocess.Popen(
                    command,
                    shell=True,
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                )
                return proc.communicate(message.encode(), timeout=timeout)
            except Exception as exc:
                last_error = exc
                # communicate() does not kill the child when it times out;
                # without this every retry would leave another process behind.
                if proc is not None and proc.poll() is None:
                    proc.kill()
                    try:
                        # Bounded wait: a grandchild that inherited the pipes
                        # must not be able to block the reaping forever.
                        proc.communicate(timeout=timeout)
                    except Exception:
                        pass
        raise Exception("Error %s" % command) from last_error

    class MyHTMLParser(HTMLParser):
        """Collect reimbursement-form fields from the HTML fed to the parser.

        Text is ignored until a tag with ``id="page-container"`` has been seen.
        After that ``handle_data`` walks a small state machine stored in
        ``processState``:

        * 0 - collect ``key: value`` pairs into ``fdata["meta"]`` until a text
          node containing "经费项目" is seen;
        * 1 - wait for the "合计金额(小写):" line and store its last token as
          ``fdata["item"]["total"]``;
        * 2 - wait for a text node containing "结算信息";
        * 3 - store rows whose first token is "0".."99" in ``fdata["card"]``
          until the "预约报销日期" line is seen;
        * 4 - nothing further is collected.
        """

        # First tokens ("0".."99") that mark a numbered settlement row.
        _ROW_KEYS = frozenset(str(i) for i in range(100))

        # Suffix that marks a token as a metadata key.
        # NOTE(review): the original tuple held two identical ":" entries; one
        # of them may have been a fullwidth colon before the code was copied -
        # confirm against real input.
        _KEY_SUFFIXES = (":", ":")

        def __init__(self):
            HTMLParser.__init__(self)
            self.links = []
            self.handledtags = ['div']
            self.processState = 0
            self.fdata = {"meta": {}, "item": {}, "card": {}}
            self.stpdf = False      # set once the page container tag was seen
            self.x3 = False         # set when a tag whose class contains "x3" opens
            self.lastmeta = ""      # most recent metadata key (colon stripped)
            self.itemNum = 0
            self.cardNum = 0

        def _attr(self, attrlist, attrname):
            """Return the value of *attrname* in *attrlist*, or "" if absent.

            Attributes written without a value are reported by ``HTMLParser``
            as ``None``; they are mapped to "" so callers can always use string
            operations on the result.
            """
            for name, value in attrlist:
                if name == attrname:
                    return value if value is not None else ""
            return ""

        def handle_starttag(self, tag, attrs):
            """Track the page container and tags whose class contains "x3"."""
            if self._attr(attrs, 'id') == "page-container":
                self.stpdf = True
            # NOTE(review): substring test, so a class such as "x30" matches
            # as well - kept as in the original.
            if "x3" in self._attr(attrs, 'class'):
                self.x3 = True

        def _collect_meta(self, data, tokens):
            """Handle one text node while in state 0 (metadata section)."""
            meta = self.fdata["meta"]
            if tokens and tokens[0].endswith(self._KEY_SUFFIXES):
                key_index = 0
                for index, token in enumerate(tokens):
                    if token.endswith(self._KEY_SUFFIXES):
                        key_index = index
                        self.lastmeta = token[:-1]
                    else:
                        # A later value token overwrites an earlier one for
                        # the same key.
                        meta[tokens[key_index][:-1]] = token
            elif "经费项目" in data:
                # Drop the first four characters and keep the rest.
                meta["项目"] = data[4:]
                self.processState = 1
            elif self.x3:
                # Text inside an "x3" tag is appended to the last key's value,
                # provided that key already has a non-empty value.
                self.x3 = False
                if self.lastmeta and meta.get(self.lastmeta):
                    meta[self.lastmeta] += data

        def handle_data(self, data):
            """Feed one text node through the state machine."""
            if not self.stpdf:
                return
            # Whitespace-only nodes give an empty token list; every
            # ``tokens[0]`` access below is guarded so they are ignored.
            tokens = data.split()

            if self.processState == 0:
                self._collect_meta(data, tokens)
            elif self.processState == 1:
                if tokens and "合计金额(小写):" in tokens[0]:
                    self.fdata["item"]["total"] = tokens[-1]
                    self.processState = 2

            # The checks below are deliberately plain ``if``s: a node that
            # advanced the state above is also examined by the next state.
            if self.processState == 2 and "结算信息" in data:
                self.processState = 3
            if self.processState == 3 and tokens:
                if tokens[0] in self._ROW_KEYS:
                    self.fdata["card"][tokens[0]] = tokens[1:]
                    self.cardNum += 1
                elif "预约报销日期" in tokens[0]:
                    self.fdata["card"]["date"] = " ".join(tokens[1:])
                    self.processState = 4


    # def pdf2csv(pdf):
    # Header row of the generated sheet.  A header ending in "N*" spans N
    # additional columns; the empty strings are placeholders for those columns.
    _TITLES = ["编号", "项目负责人", "项目", "报销事由", "费用合计", "预约报销日期", "结算信息2*", "", ""]

    # Keys ("0".."99") under which numbered settlement rows are stored.
    _CARD_ROW_KEYS = frozenset(str(i) for i in range(100))

    # One extractor per real (non-placeholder) header, in header order.
    _EXTRACTORS = [
        lambda d: d["meta"]["报销单号"],
        lambda d: d["meta"]["项目负责人"],
        lambda d: d["meta"]["项目"],
        lambda d: d["meta"]["报销事由"],
        lambda d: d["item"]["total"],
        lambda d: d["card"]["date"],
        lambda d: [v for k, v in d["card"].items() if k in _CARD_ROW_KEYS],
    ]


    def _header_layout(titles):
        """Yield ``(column_index, text, extra_columns)`` for each real header."""
        index = 0
        while index < len(titles):
            raw = titles[index]
            if raw.endswith("*"):
                span = int(raw[-2])
                yield index, raw[:-2], span
                index += span + 1   # step over the placeholder columns
            else:
                if raw:
                    yield index, raw, 0
                index += 1


    def _fill_sheet(ws, parser):
        """Write the header row and the parsed values of *parser* into *ws*."""
        layout = list(_header_layout(_TITLES))

        # Row 1: headers, merged across their extra columns where requested.
        for col, text, span in layout:
            letter = get_column_letter(col + 1)
            ws["%s1" % letter] = text
            if span:
                ws.merge_cells("%s1:%s1" % (letter, get_column_letter(col + 1 + span)))

        # Single-valued columns are merged downwards over the multi-row block.
        # With fewer than two rows there is nothing to merge (the original
        # ``MergeBoxNum ==1`` was a no-op comparison, not an assignment).
        block_rows = max(parser.cardNum, parser.itemNum)
        if block_rows > 1:
            for col, _text, span in layout:
                if span:
                    continue
                letter = get_column_letter(col + 1)
                ws.merge_cells("%s2:%s%d" % (letter, letter, 1 + block_rows))

        for (col, text, _span), extract in zip(layout, _EXTRACTORS):
            value = extract(parser.fdata)
            if not isinstance(value, list):
                ws["%s2" % get_column_letter(col + 1)] = value
                continue
            if "结算信息" not in text:
                continue
            # NOTE(review): settlement rows start at row 3 although the merged
            # block above starts at row 2 - kept as in the original; confirm.
            for offset, entry in enumerate(value):
                row = 3 + offset
                if not entry:
                    ws["%s%d" % (get_column_letter(col + 1), row)] = "null"
                # Write at most four fields and never index past the end of a
                # short entry (1- or 3-element rows used to raise IndexError).
                for k, field in enumerate(entry[:4]):
                    ws["%s%d" % (get_column_letter(col + 1 + k), row)] = field


    def _fit_column_widths(ws):
        """Size every column to its longest value, capped at a width of 42."""
        widths = {}
        for row in ws:
            for index, cell in enumerate(row):
                if not cell.value:
                    continue
                # Keyed by column index so that gaps cannot shift the widths
                # onto the wrong columns.
                widths[index] = max(widths.get(index, 0), len(str(cell.value)))
        for index, width in widths.items():
            ws.column_dimensions[get_column_letter(index + 1)].width = min(42, width * 1.7)


    def _convert_pdf(pdf_path):
        """Convert one PDF to HTML, parse it and save the result as a workbook.

        Any exception (conversion failure, missing HTML, missing field, ...)
        propagates to the caller, which treats the file as unparseable.
        """
        # NOTE(review): the HTML is read from next to the PDF, which assumes
        # pdf2htmlEX writes it there (not visible from here) - confirm.
        html_path = pdf_path[:-4] + ".html"
        # NOTE(review): openpyxl writes xlsx content; the ".xls" name is kept
        # from the original so existing consumers still find the file.
        xlsx_path = pdf_path + ".xls"

        _out, err = runApp('pdf2htmlEX "%s"' % pdf_path)
        try:
            with open(html_path, encoding="UTF-8") as handle:
                raw_html = handle.read()
        except Exception as read_error:
            # Show the converter's stderr alongside the read error, then let
            # the caller record the failure.
            print(err)
            print(read_error)
            raise
        html_code = re.sub("<span.+?</span>", " ", raw_html)

        hp = MyHTMLParser()
        hp.feed(html_code)
        hp.close()

        wb = Workbook()
        ws = wb.active
        _fill_sheet(ws, hp)
        _fit_column_widths(ws)
        wb.save(xlsx_path)


    if __name__ == '__main__':
        n = 0   # number of files that could not be parsed
        addressPDF = "E:/totally/FinancePDF_travel/"
        # Forward slashes avoid the backslash escapes (\t, \b, \") that broke
        # the original path literals.
        badFileDir = "E:/totally/bad_file/"

        for fileNAME in os.listdir(addressPDF):
            if os.path.splitext(fileNAME)[1] != '.pdf':
                continue
            pdfFile = os.path.join(addressPDF, fileNAME)
            try:
                _convert_pdf(pdfFile)
            except Exception:
                # Best effort: report the file, set a copy aside and carry on
                # with the next one.
                n += 1
                print(str(n) + '.' + '无法解析' + fileNAME + '文件')
                os.makedirs(badFileDir, exist_ok=True)
                shutil.copyfile(pdfFile, os.path.join(badFileDir, fileNAME))
                print('已复制' + fileNAME + '文件')
     
  • 相关阅读:
    metal的gpu query
    体积雾 global fog unity 及改进
    hdr rt format对颜色的影响
    unity deferred lighting
    unity linear space时 photoshop blend的正确设置
    unity linear work flow
    一些数据 bandwidth之类
    deferred rendering with msaa
    unity 显示mipmaplevel
    【转】在C#中使用SendMessage
  • 原文地址:https://www.cnblogs.com/setname/p/8417808.html
Copyright © 2011-2022 走看看