zoukankan      html  css  js  c++  java
  • pyautogui_pdf批量转换为TXT

    pyautogui_pdf批量转换为TXT,

    用pdf自带无损转换

    # -*- coding: utf-8 -*-
    """
    Created on Thu May  5 15:39:54 2016
    
    一定要有time.sleep(1)时间控制,否则出错
    pdf另存文本,效果可能很差
    typewrite("content") 用于输入文字
    typewrite(["right","left","up"]) 用于输入连续键盘按钮
    @author: daxiong
    """
    
    import pyautogui,time,os
    
    # Folder that holds the PDFs; the same path is typed into the save dialog
    # further down, so the text files land next to their sources.
    dir_file="C:/Users/daxiong/Desktop/test"
    # Snapshot of the entry names in that folder, taken once before the loop.
    fileNames=os.listdir(dir_file)


    # Start the PDF application: (50,50) is the screen position of its icon.
    pyautogui.doubleClick(50,50)
    time.sleep(1)


    # Every GUI step is followed by a pause; per the module notes the
    # automation fails without these sleeps.
    for fileName in fileNames:
        # Ctrl+O brings up the "open file" dialog.
        pyautogui.hotkey("ctrl","o")
        time.sleep(1)
        # Type the file name and confirm with Enter to open it.
        pyautogui.typewrite(fileName)
        time.sleep(1)
        pyautogui.press("enter")
        time.sleep(1)
        # Shift+Ctrl+S brings up "save as".
        pyautogui.hotkey("shift","ctrl","s")
        time.sleep(1)
        # Tab moves focus to the output-format selector below the name field.
        pyautogui.press("tab")
        time.sleep(1)
        pyautogui.press("down")
        time.sleep(1)
        # Twenty more "down" presses walk the format list, then Enter picks
        # the highlighted entry (the plain-text format, per the author).
        pyautogui.typewrite(["down"]*20+["enter"])

        # Choose the destination folder.
        pyautogui.press('f4')  # focus the address bar
        time.sleep(1)
        pyautogui.hotkey("ctrl","a")  # select its current content
        time.sleep(1)
        pyautogui.press('delete')  # clear the old path
        time.sleep(1)
        pyautogui.typewrite(dir_file)
        time.sleep(1)
        # Ten Tab presses reach the Save button; Enter activates it.
        pyautogui.typewrite(["tab"]*10+["enter"])
        # Give the save a moment, then close the document with Ctrl+W.
        time.sleep(2)
        pyautogui.hotkey("ctrl","w")

    # All files handled: Ctrl+Q quits the PDF application.
    pyautogui.hotkey("ctrl","q")
    

      

    办公室电脑测试代码

    pdf_to_txt

    # -*- coding: utf-8 -*-
    """
    Created on Thu May 12 11:22:57 2016
    pdf更换为最新版本,尝试提高转换成功率。
    txt必须转换为纯文本格式
    等待时间必须和pdf页码数匹配
    @author: Administrator
    """
    
    import pyautogui,time,os,PyPDF2
    
    dir_file="C:/Users/Administrator/Desktop/test/pdf/"
    # Names of all entries in the current working directory (note: '.' is
    # listed here, not dir_file).
    fileNames=os.listdir('.')
    # Only the names whose extension is exactly ".pdf" (case-sensitive match).
    pdf_fileNames=list(filter(lambda name: os.path.splitext(name)[1]==".pdf",fileNames))
     
      
    
    
    def Get_time(filename):
        """Return the number of seconds to wait while *filename* is converted.

        The wait is derived from the PDF's page count:

        * up to 10 pages  -> ``pages + 2`` seconds
        * more than 10    -> 15 seconds
        * unreadable file -> 10 seconds (a message is printed)

        Args:
            filename: Path of the PDF file to inspect.

        Returns:
            int: Seconds to sleep before closing the document.
        """
        try:
            # The context manager closes the handle on every path; the
            # previous version left the file open.
            with open(filename,'rb') as pdfFileObj:
                pdfReader=PyPDF2.PdfFileReader(pdfFileObj)
                # Reading the page count can itself fail on some PDFs (the
                # original author noted such a failure), so it stays inside
                # the try block.
                pages=pdfReader.numPages
        except Exception:
            # Best effort: any read/parse problem falls back to a fixed wait.
            # Exception (not a bare except) lets Ctrl+C still abort the run.
            print ("wrong when read pdf:",filename)
            return 10

        if pages<=10:
            return pages+2
        return 15
            
    
    
    #打开存储PDF软件;(50,50)为pdf坐标
    # Start the PDF application: (50,50) is the screen position of its icon.
    pyautogui.doubleClick(50,50)
    time.sleep(3)

    # Iterate over the filtered PDF list. The previous version looped over
    # fileNames, so every non-PDF entry of the directory was also typed into
    # the "open" dialog and pdf_fileNames was never used.
    for fileName in pdf_fileNames:
        # Ctrl+O brings up the "open file" dialog.
        pyautogui.hotkey("ctrl","o")
        time.sleep(1)
        # Type the PDF file name and confirm with Enter to open it.
        pyautogui.typewrite(fileName)
        time.sleep(2)
        pyautogui.press("enter")
        time.sleep(1)
        # Shift+Ctrl+S brings up "save as".
        pyautogui.hotkey("shift","ctrl","s")
        time.sleep(1)
        # Tab moves focus to the output-format selector.
        pyautogui.press("tab")
        time.sleep(1)
        pyautogui.press("down")

        time.sleep(1)
        # Eighteen more "down" presses walk the format list, then Enter twice
        # (select the format, then confirm the save). The author marked this
        # count as "not accurate", so it may need adjusting per installation.
        pyautogui.typewrite(["down"]*18+["enter","enter"])

        # Wait in proportion to the page count before closing the document
        # with Ctrl+W, so the export has time to finish.
        sleepTime=Get_time(fileName)
        time.sleep(sleepTime)
        pyautogui.hotkey("ctrl","w")

    # All files handled: Ctrl+Q quits the PDF application.
    pyautogui.hotkey("ctrl","q")
    

      

    txt 文件包提取到excel

    # -*- coding: utf-8 -*-
    """
    Created on Thu May 12 14:05:06 2016
    1.先用filenameToExcel.exe程序导入文件名
    2.B11写入me_txt
    3.批量写入内容
    
    list不能写入cell,str才可以.txt必须是纯文本格式
    @author: Administrator
    """
    
    import PyPDF2,os,openpyxl,sys,time,xlrd
    from openpyxl.cell import get_column_letter,column_index_from_string
    
    #开始时间
    # Start timestamp. time.clock() was removed in Python 3.8, so prefer
    # time.perf_counter() and fall back to clock() only on interpreters
    # that do not provide it.
    timeBegin=(getattr(time,"perf_counter",None) or time.clock)()
    excelFileName="test.xlsx"
    wb=openpyxl.load_workbook(excelFileName)
    sheet=wb.active
    columnIndex="A"
    start=1
    expandName=".txt"
    expandName_upper=expandName.upper()

    # The same workbook is opened a second time through xlrd; `table` is its
    # first sheet and is used for reading whole columns.
    excelFile = xlrd.open_workbook(excelFileName)
    table = excelFile.sheet_by_index(0)

    # Materialise the columns once: on openpyxl versions where
    # `sheet.columns` is a generator it cannot be indexed directly, while
    # list(...) works for both the generator and the older sequence form.
    sheet_columns=list(sheet.columns)
    # Cells of column A
    cells_columnA=sheet_columns[0]
    # Cells of column B
    cells_columnB=sheet_columns[1]
    
    #content="你好"
        
    def Get_col_values(i):
        """Return the values of sheet column *i* with the first row dropped.

        Reads from the module-level xlrd sheet ``table``.
        """
        return table.col_values(i)[1:]
        
        
    
    
    def single_txt_extract(filename,i):
        """Copy the whole text of *filename* into column B at index ``i+1``.

        The target is the module-level ``cells_columnB`` sequence. A cell
        value must be a ``str`` (a list cannot be written), so the file is
        read as one string; the input has to be a plain-text file.

        If the file cannot be opened or read, a message is printed and the
        sheet is left untouched. The previous version printed the message and
        then failed on the unbound ``content`` / ``txtFileObj`` variables.

        Args:
            filename: Path of the text file to read.
            i: Zero-based index; the content goes to ``cells_columnB[i+1]``.

        Returns:
            None
        """
        try:
            # The context manager closes the file even when read() raises.
            with open(filename) as txtFileObj:
                content=txtFileObj.read()
        except (IOError, OSError, UnicodeError):
            print ("wrong when read txt:",filename)
            return

        cells_columnB[i+1].value=content
        
    
    # Values of column index 0 (all rows except the first), read through xlrd.
    list_pdf_fileNames=Get_col_values(0)
    
    # Write the text of "1151.txt" into column B at index 0+1.
    single_txt_extract("1151.txt",0)
    
    # Save the modified workbook back to the same file.
    wb.save(excelFileName)
    

      

  • 相关阅读:
    Python网络协议(osi七层协议)
    Python面向对象之反射,双下方法
    Python类的成员
    Python异常处理
    mysql 索引 慢查询优化 && 数据库性能优化
    数据库(视图、事务、存储过程、函数) && 数据库备份
    mysql数据库连接模块 pymysql && sql注入
    主线程与子线程的关系
    socket 编程实例 基于线程池实现服务端并发
    日常迷惑积累
  • 原文地址:https://www.cnblogs.com/webRobot/p/5467914.html
Copyright © 2011-2022 走看看