camelot 内置了生成 HTML 文件的方法,但表格数据在转换为 pandas.DataFrame 的过程中会丢失跨行、跨列的结构信息,因此它生成的 HTML 表格没有跨行跨列结构。
于是我在输出部分选择直接手写 HTML 表格。
import html
import os

import camelot
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# def listdir(path, list_name):  # pass in the list that collects the results
#     for file in os.listdir(path):
#         file_path = os.path.join(path, file)
#         if os.path.isdir(file_path):
#             listdir(file_path, list_name)
#         else:
#             list_name.append(file_path)

# Batch mode
# filenames=[r'E:pdf_download']
# listdir('E:pdf_download',filenames)
# for onefile in filenames:
#     filename=onefile.split(".", )[0]


def _compute_spans(cells):
    """Work out which grid cells are span anchors and how far they extend.

    ``cells`` is a row-major grid (list of rows) of objects exposing
    ``right`` and ``bottom`` attributes.  A falsy ``right`` is treated as
    "no border towards the right neighbour", i.e. the cell is merged with
    it; likewise a falsy ``bottom`` for the neighbour below.
    NOTE(review): this matches how the script has always read camelot's
    cell edge flags -- confirm against the camelot version in use.

    Returns a ``(covered, colspan, rowspan)`` tuple of arrays shaped like
    the grid.  ``covered[r, c]`` is True for cells swallowed by a span that
    starts elsewhere; ``colspan`` / ``rowspan`` hold the extent of the span
    anchored at each uncovered cell (1 when the cell is not merged).
    """
    n_rows = len(cells)
    n_cols = max((len(row) for row in cells), default=0)
    covered = np.zeros((n_rows, n_cols), dtype=bool)
    colspan = np.ones((n_rows, n_cols), dtype=int)
    rowspan = np.ones((n_rows, n_cols), dtype=int)

    for r, row in enumerate(cells):
        for c in range(len(row)):
            if covered[r, c]:
                continue

            # Extend right while the current right-most cell has no right edge.
            width = 1
            while c + width < len(row) and not row[c + width - 1].right:
                width += 1

            # Extend down while the current bottom-most cell has no bottom edge.
            height = 1
            while r + height < n_rows and not cells[r + height - 1][c].bottom:
                height += 1

            colspan[r, c] = width
            rowspan[r, c] = height

            # Mask the whole rectangle, not just the anchor's row and column:
            # otherwise the interior of a region merged in both directions
            # would be written out as an extra <td>.
            covered[r:r + height, c:c + width] = True
            covered[r, c] = False

    return covered, colspan, rowspan


def _render_table_html(cells):
    """Return the HTML markup for ``cells`` with colspan/rowspan attributes.

    Each cell's ``text`` attribute is HTML-escaped before being placed in
    its ``<td>``; covered cells (see ``_compute_spans``) are skipped.
    """
    covered, colspan, rowspan = _compute_spans(cells)
    parts = ['''<table border="1" class="dataframe"> <tbody>''']

    for r, row in enumerate(cells):
        parts.append(''' <tr>''')
        for c, cell in enumerate(row):
            if covered[r, c]:
                continue
            n_col = int(colspan[r, c])
            n_row = int(rowspan[r, c])
            col_attr = 'colspan=' + str(n_col) if n_col > 1 else ''
            row_attr = 'rowspan=' + str(n_row) if n_row > 1 else ''
            # Escape the extracted text so '<', '>' and '&' coming from the
            # PDF cannot break the generated markup.
            parts.append(
                ''' <td %s %s>%s</td>''' % (col_attr, row_attr, html.escape(cell.text))
            )
        parts.append(''' </tr>''')

    # Close the elements opened in the header.
    parts.append(''' </tbody> </table>''')
    return ''.join(parts)


# Single file
onefile = r'1202007288.pdf'
print("loading...", onefile)
tables = camelot.read_pdf(
    onefile,
    pages='28',
    strip_text=' . ',
    line_scale=80,
    split_text=True,
)

for onetable in tables:
    # One HTML file per detected table, named after the PDF, page and order.
    out_name = onefile + '-page' + str(onetable.page) + '-table-' + str(onetable.order) + '.html'
    # The file encoding is left at the platform default, as before.
    with open(out_name, 'w') as f:
        f.write(_render_table_html(onetable.cells))