pyautogui_pdf 批量转换为 TXT:
使用 PDF 软件自带的“另存为”功能进行无损转换
# -*- coding: utf-8 -*-
"""
Created on Thu May 5 15:39:54 2016

Batch-convert the PDF files in ``dir_file`` to text by driving the PDF
viewer's "save as" dialog with pyautogui keystrokes.

Notes:
- The time.sleep() pauses between actions are required; without them the
  automation goes wrong.
- Text produced by the viewer's save-as-text may be of poor quality.
- typewrite("content") types text.
- typewrite(["right", "left", "up"]) presses a sequence of keys.

@author: daxiong
"""
import os
import time

import pyautogui

dir_file = "C:/Users/daxiong/Desktop/test"

# Names of all entries in the folder.
fileNames = os.listdir(dir_file)

# Only PDFs are sent to the viewer. The converted text is saved back into
# dir_file, so without this filter a second run would also try to open the
# generated .txt files (and any other stray file) as PDFs.
pdf_fileNames = [
    name for name in fileNames
    if os.path.splitext(name)[1].lower() == ".pdf"
]

# Launch the PDF software; (50, 50) is the screen coordinate to double-click.
pyautogui.doubleClick(50, 50)
time.sleep(1)

for fileName in pdf_fileNames:
    # Open the file dialog with the Ctrl+O hotkey.
    pyautogui.hotkey("ctrl", "o")
    time.sleep(1)

    # Type the PDF file name and confirm.
    # NOTE(review): typewrite() sends key presses, so file names containing
    # characters not on the keyboard (e.g. Chinese) are presumably not typed
    # correctly -- confirm against the pyautogui documentation.
    pyautogui.typewrite(fileName)
    time.sleep(1)
    pyautogui.press("enter")
    time.sleep(1)

    # Save as plain text.
    pyautogui.hotkey("shift", "ctrl", "s")
    time.sleep(1)
    pyautogui.press("tab")  # move to the save-format selector below
    time.sleep(1)
    pyautogui.press("down")
    time.sleep(1)
    # Twenty more "down" presses reach the wanted format, then confirm.
    pyautogui.typewrite(["down"] * 20 + ["enter"])

    # Choose the output folder.
    pyautogui.press('f4')  # focus the address bar
    time.sleep(1)
    pyautogui.hotkey("ctrl", "a")  # select the current content
    time.sleep(1)
    pyautogui.press('delete')  # delete the old address
    time.sleep(1)
    pyautogui.typewrite(dir_file)
    time.sleep(1)

    # Ten consecutive tabs reach the Save button; Enter presses it.
    pyautogui.typewrite(["tab"] * 10 + ["enter"])

    # Close the PDF with Ctrl+W once the save has had time to finish.
    time.sleep(2)
    pyautogui.hotkey("ctrl", "w")

# Quit the PDF software after every file has been processed.
pyautogui.hotkey("ctrl", "q")
办公室电脑测试代码
pdf_to_txt
# -*- coding: utf-8 -*-
"""
Created on Thu May 12 11:22:57 2016

Batch-convert PDFs to plain text by driving the PDF viewer with pyautogui,
waiting after each save for a time derived from the PDF's page count.

Notes:
- The PDF software was switched to the latest version to try to improve the
  conversion success rate.
- The output must be saved in plain-text format.
- The wait time must match the number of pages in the PDF.

@author: Administrator
"""
import os
import time

import pyautogui
import PyPDF2

dir_file = "C:/Users/Administrator/Desktop/test/pdf/"

# Names of the entries in the current working directory.
# NOTE(review): dir_file is defined above but the listing uses '.', so the
# script presumably has to be started from inside the PDF folder -- confirm.
fileNames = os.listdir('.')
pdf_fileNames = [
    i for i in fileNames if os.path.splitext(i)[1].lower() == ".pdf"
]


def Get_time(filename):
    """Return the number of seconds to wait while ``filename`` is converted.

    The wait is ``pages + 2`` seconds for PDFs of up to 10 pages and 15
    seconds for longer ones. If the PDF cannot be read, a message is printed
    and a fallback of 10 seconds is returned.

    Args:
        filename: path of the PDF file, opened in binary mode.

    Returns:
        The wait time in seconds (int).
    """
    try:
        # The context manager closes the handle even when parsing fails.
        with open(filename, 'rb') as pdfFileObj:
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            pages = pdfReader.numPages  # page count
    # Original author's note: reading pdfReader also failed "at line 4100".
    # Exception (not a bare except) keeps Ctrl+C able to abort the run.
    except Exception:
        print("wrong when read pdf:", filename)
        return 10

    if pages <= 10:
        return pages + 2
    return 15


# Launch the PDF software; (50, 50) is the screen coordinate to double-click.
pyautogui.doubleClick(50, 50)
time.sleep(3)

# Iterate only the PDFs: the unfiltered listing would also feed non-PDF
# entries to the viewer's open dialog.
for fileName in pdf_fileNames:
    # Open the file dialog with the Ctrl+O hotkey.
    pyautogui.hotkey("ctrl", "o")
    time.sleep(1)

    # Type the PDF file name and confirm.
    pyautogui.typewrite(fileName)
    time.sleep(2)
    pyautogui.press("enter")
    time.sleep(1)

    # Save as plain text.
    pyautogui.hotkey("shift", "ctrl", "s")
    time.sleep(1)
    pyautogui.press("tab")  # move to the save-format selector below
    time.sleep(1)
    pyautogui.press("down")
    time.sleep(1)
    # Original author's note: this key count is "not accurate".
    # Eighteen more "down" presses, then Enter twice to pick and save.
    pyautogui.typewrite(["down"] * 18 + ["enter", "enter"])

    sleepTime = Get_time(fileName)

    # Close the PDF with Ctrl+W after waiting for the conversion to finish.
    time.sleep(sleepTime)
    pyautogui.hotkey("ctrl", "w")

# Quit the PDF software after every file has been processed.
pyautogui.hotkey("ctrl", "q")
txt 文件包提取到excel
# -*- coding: utf-8 -*-
"""
Created on Thu May 12 14:05:06 2016

Copy the contents of text files into column B of an Excel workbook.

Workflow:
1. First import the file names with the filenameToExcel.exe program.
2. Write me_txt into B11.
3. Write the contents in bulk.

A list cannot be written to a cell, only a str. The .txt files must be
plain text.

@author: Administrator
"""
import os
import sys
import time

import openpyxl
import PyPDF2
import xlrd

try:
    from openpyxl.cell import get_column_letter, column_index_from_string
except ImportError:
    # Newer openpyxl releases expose these helpers from openpyxl.utils.
    from openpyxl.utils import get_column_letter, column_index_from_string

# Start time. time.clock() was removed in Python 3.8, so use perf_counter()
# when it exists and fall back to clock() on old interpreters.
timeBegin = (
    time.perf_counter() if hasattr(time, "perf_counter") else time.clock()
)

excelFileName = "test.xlsx"
wb = openpyxl.load_workbook(excelFileName)
sheet = wb.active
columnIndex = "A"
start = 1
expandName = ".txt"
expandName_upper = expandName.upper()

excelFile = xlrd.open_workbook(excelFileName)
table = excelFile.sheet_by_index(0)  # fetched by sheet index

# sheet.columns is a generator on newer openpyxl versions and cannot be
# indexed directly; materialising it works on old and new versions alike.
_sheet_columns = list(sheet.columns)
# Cells of column A.
cells_columnA = _sheet_columns[0]
# Cells of column B.
cells_columnB = _sheet_columns[1]


def Get_col_values(i):
    """Return the values of column ``i`` of ``table`` without the first row.

    Args:
        i: zero-based column index passed to ``table.col_values``.

    Returns:
        A list with every value of the column except the first one.
    """
    return table.col_values(i)[1:]


def single_txt_extract(filename, i):
    """Write the full text of ``filename`` into cell ``i + 1`` of column B.

    If the file cannot be read, a message is printed and the cell is left
    untouched; the original code went on to use undefined names in that case.

    Args:
        filename: path of the plain-text file to read.
        i: zero-based item index; the target is ``cells_columnB[i + 1]``
            (presumably skipping a header row -- confirm against the sheet).
    """
    try:
        # TODO: compare read() with readlines()/readline(); untested.
        with open(filename) as txtFileObj:
            content = txtFileObj.read()
    except (IOError, OSError, UnicodeError):
        print("wrong when read txt:", filename)
        return

    # Only a str can be written to a cell, not a list.
    cells_columnB[i + 1].value = content


list_pdf_fileNames = Get_col_values(0)
single_txt_extract("1151.txt", 0)
wb.save(excelFileName)