Python3读取pdf文档,输出内容(txt)
import os
import re
from io import StringIO
from io import open  # alias of the builtin open() on Python 3; kept from the original
from urllib.request import urlopen  # used only by the remote-PDF variant noted in main()

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf

# Must be a raw string: in a normal literal the sequences "\0", "\01" and
# "\201" are octal escapes, so the path would contain NUL / \x01 / \x81
# characters and every OS call on it would fail ("embedded null character").
DEFAULT_REPORT_DIR = r"D:\0.shenma\01.聊城资料政府工作报告\2019政府工作报告全文"

# (label printed in the output, compiled pattern). In every pattern group 1 is
# the optional "完成" and group 2 is the value being extracted.
# NOTE(review): the second pattern stops at an ASCII comma; if the reports use
# the full-width comma instead, this delimiter needs to be adjusted — confirm
# against real extracted text.
INDICATORS = (
    ("生产总值完成", re.compile(r"生产总值(完成)?(.+?)亿元")),
    ("一般公共预算收入完成", re.compile(r"公共预算收入(完成)?(.+?),")),
)


def readPDF(pdfFile):
    """Extract the text of a PDF and return it as a single string.

    Args:
        pdfFile: Binary file-like object containing the PDF. It is only read
            here; the caller remains responsible for closing it.

    Returns:
        Everything the pdfminer ``TextConverter`` wrote for the document.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Release the converter even when parsing fails part-way.
            device.close()
        return retstr.getvalue()


def _report_indicators(file, text):
    """Print each indicator found in *text*, labelled with the file name.

    Indicators are looked up in the order given by ``INDICATORS``. As in the
    original script, the first indicator that cannot be found prints a single
    "no data" line and ends the report for this file.
    """
    for label, pattern in INDICATORS:
        match = pattern.search(text)
        if match is None:
            print(file, "--", "no data")
            return
        # group(2) of the first match is what re.findall(...)[0][1] returned.
        print(file, "--", label, "--", match.group(2))


def main(filesdir=DEFAULT_REPORT_DIR):
    """Scan *filesdir* for PDF reports and print the indicators of each one.

    Args:
        filesdir: Directory holding the PDF files. Paths are built relative to
            it, so the process working directory is left untouched.
    """
    # The original also kept a commented-out variant that passed a remote
    # stream to readPDF() instead of a local file:
    #   urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")
    files = os.listdir(filesdir)
    print(files)
    for file in files:
        # Case-insensitive so that "REPORT.PDF" is picked up as well.
        if not file.lower().endswith(".pdf"):
            continue
        with open(os.path.join(filesdir, file), 'rb') as pdfFile:
            outputString = readPDF(pdfFile)
        # Extracted text has stray spaces between characters; drop them so
        # the patterns can match contiguous phrases.
        _report_indicators(file, outputString.replace(" ", ""))


if __name__ == '__main__':
    main()
【转自】:https://www.cnblogs.com/gooseeker/p/5527519.html
仅做记录,供查。