import os
import threading
import traceback

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf


class PdfThread(threading.Thread):
    """Worker thread that converts queued PDF files into text files.

    Each item taken from ``in_queue`` is used as the path of a PDF file.
    The extracted text is written to ``doc_txt_dir`` under the input's base
    name with its extension replaced by ``.txt``.

    Args:
        in_queue: Queue-like object providing ``get()`` and ``task_done()``;
            every item is an input PDF path.
        doc_txt_dir: Directory in which the ``.txt`` files are created.
    """

    def __init__(self, in_queue, doc_txt_dir):
        threading.Thread.__init__(self)
        self.in_queue = in_queue
        self.doc_txt_dir = doc_txt_dir

    def run(self):
        """Convert queued PDFs forever, marking each item done exactly once."""
        while True:
            # Fetch outside the try block: task_done() must only be called
            # for an item that was actually obtained from the queue.
            in_fname = self.in_queue.get()
            try:
                out_file = self._convert(in_fname)
                print("have convert pdf file %s to file %s" % (in_fname, out_file))
            except Exception:
                # A single unreadable PDF must not terminate the worker: an
                # uncaught exception would end run(), leaving the remaining
                # queue items unprocessed by this thread.
                print("failed to convert pdf file %s" % (in_fname,))
                traceback.print_exc()
            finally:
                self.in_queue.task_done()

    def _output_path(self, in_fname):
        """Return the ``.txt`` path inside ``doc_txt_dir`` for *in_fname*."""
        # splitext leaves names without an extension intact; slicing at
        # rfind(".") would drop their last character (rfind returns -1).
        stem = os.path.splitext(os.path.basename(in_fname))[0]
        return os.path.join(self.doc_txt_dir, stem + ".txt")

    def _convert(self, in_fname):
        """Extract the text of *in_fname* and return the output file path.

        Both file handles and the converter device are released even when
        extraction fails; any exception propagates to the caller.
        """
        out_file = self._output_path(in_fname)
        rsrc = PDFResourceManager(caching=True)
        # Open the input first so that a missing or unreadable PDF does not
        # leave an empty output file behind.
        with open(in_fname, 'rb') as fp:
            with open(out_file, 'w') as outfp:
                # Passing LAParams preserves the spaces between characters
                # found in the original PDF.
                device = TextConverter(rsrc, outfp, codec='utf-8',
                                       laparams=LAParams())
                try:
                    # Empty page set and maxpages=0, as in the original call.
                    process_pdf(rsrc, device, fp, set(), maxpages=0,
                                password='', caching=True,
                                check_extractable=True)
                finally:
                    # Close the device before the output file is closed.
                    device.close()
        return out_file

# NOTE(review): the original file ended with a bare "TagExtractor" comment --
# presumably a reminder of pdfminer's alternative converter; confirm or remove.