zoukankan      html  css  js  c++  java
  • 用python实现一个文档小工具(支持文档关键字筛选)

    功能:根据关键词批量从doc、docx、pdf文件中筛选出包含所输入关键词的文件

    那么开始上代码,不是专业Python程序猿,代码写得不好勿喷,哈哈

    from PyQt5.QtWidgets import *
    from PyQt5.QtGui import *
    from PyQt5.QtCore import *
    import sys, os
    import docx
    from docx import Document
    import os
    import shutil
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFParser, PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice
    
    class Window(QDialog):
        """Dialog that copies Word/PDF files containing a keyword to a folder.

        The user picks a source folder, an output folder and a keyword. Every
        ``.doc``/``.docx``/``.pdf`` file found under the source folder whose
        text contains the keyword is copied into the output folder.
        """

        # File extensions handled by each reader; compared in lower case.
        _WORD_EXTENSIONS = ('.doc', '.docx')
        _PDF_EXTENSIONS = ('.pdf',)

        def __init__(self, parent=None):
            super(Window, self).__init__(parent)
            self.path = ''
            self.initUI()
            self.setWindowTitle("文件小助手")
            self.resize(240, 200)

        def initUI(self):
            """Build the form: two folder pickers, the keyword field and the start button."""
            grid = QGridLayout()

            # Row 0: source folder.
            grid.addWidget(QLabel("源路径:"), 0, 0)
            self.pathLineEdit = QLineEdit()
            self.pathLineEdit.setFixedWidth(200)
            self.pathLineEdit.setText(self.path)
            grid.addWidget(self.pathLineEdit, 0, 1)
            button = QPushButton("选择文件夹")
            grid.addWidget(button, 0, 3)
            button.clicked.connect(self.msg)

            # Row 1: output folder.
            grid.addWidget(QLabel("输出路径:"), 1, 0)
            self.pathLineEdit1 = QLineEdit()
            self.pathLineEdit1.setFixedWidth(200)
            self.pathLineEdit1.setText(self.path)
            grid.addWidget(self.pathLineEdit1, 1, 1)
            button = QPushButton("选择文件夹")
            grid.addWidget(button, 1, 3)
            button.clicked.connect(self.msg1)

            # Row 2: keyword text box.
            grid.addWidget(QLabel("关键字:"), 2, 0)
            self.textbox = QLineEdit(self)
            self.textbox.move(20, 20)
            self.textbox.resize(180, 30)
            grid.addWidget(self.textbox, 2, 1)

            # Row 3: start button. The widgets (not their current text) are
            # passed on, so working() reads the values at click time.
            self.button1 = QPushButton('点我开始干活儿', self)
            grid.addWidget(self.button1, 3, 1)
            self.setLayout(grid)
            self.button1.clicked.connect(lambda : self.working(self.pathLineEdit,self.pathLineEdit1,self.textbox))

        def _chooseFolder(self, lineEdit):
            """Open a folder dialog and store the chosen folder in ``lineEdit``.

            When nothing is chosen (the dialog returns an empty string) the
            current content of ``lineEdit`` is left untouched.
            """
            folder = QFileDialog.getExistingDirectory(self, "选取文件夹", "./")  # "./" is the start directory
            if folder:
                lineEdit.setText(folder)
                print(folder)

        def msg(self):
            """Let the user pick the source folder."""
            self._chooseFolder(self.pathLineEdit)

        def msg1(self):
            """Let the user pick the output folder."""
            self._chooseFolder(self.pathLineEdit1)

        def _showMessage(self, text):
            """Show ``text`` in a modal message box with a single OK button."""
            QMessageBox.question(self, "Message", text,
                                 QMessageBox.Ok, QMessageBox.Ok)

        @staticmethod
        def _documentContains(document, key):
            """Return True if ``key`` occurs in a paragraph or table cell of ``document``."""
            if any(key in paragraph.text for paragraph in document.paragraphs):
                return True
            for table in document.tables:
                for row in table.rows:
                    for cell in row.cells:
                        if key in cell.text:
                            return True
            return False

        # Word reader
        def readDoc(self,root,path,target,key):
            """Copy the Word file at ``path`` into ``target`` if it contains ``key``.

            Paragraph text is searched first, then the text of every table
            cell. The file is copied at most once. Files that cannot be opened
            by ``Document`` are reported on stdout and skipped.

            The file is opened under its own name: an earlier version renamed
            ``*.doc`` to ``*.docx`` inside the source folder first, which
            modified the user's files without converting them.

            ``root`` is unused and kept only so existing callers keep working.
            """
            try:
                document = Document(path)
            except Exception as exc:
                print('skipped %s: %s' % (path, exc))
                return
            if self._documentContains(document, key):
                self.copyFile(target, path)

        @staticmethod
        def _pdfContains(path, key):
            """Return True if ``key`` occurs in a text element of the PDF at ``path``.

            Pages are processed in order and the search stops at the first
            match. The file is closed when the search ends.
            """
            with open(path, "rb") as fp:
                # Parser bound to the open file.
                parser = PDFParser(fp)

                # Document object, linked to the parser in both directions.
                doc = PDFDocument()
                parser.set_document(doc)
                doc.set_parser(parser)

                # Initialise with an empty password.
                doc.initialize("")

                # Resource manager, layout parameters, aggregating device and
                # page interpreter needed to turn pages into layout objects.
                resource = PDFResourceManager()
                laparam = LAParams()
                device = PDFPageAggregator(resource, laparams=laparam)
                interpreter = PDFPageInterpreter(resource, device)

                for page in doc.get_pages():
                    interpreter.process_page(page)
                    layout = device.get_result()
                    for out in layout:
                        # Only layout elements exposing get_text() carry text.
                        if hasattr(out, "get_text") and key in out.get_text():
                            return True
            return False

        # PDF reader
        def readPdf(self,root,path,target,key):
            """Copy the PDF at ``path`` into ``target`` if it contains ``key``.

            The file is copied at most once. A PDF that cannot be parsed is
            reported on stdout and skipped, so one bad file does not abort the
            whole batch.

            ``root`` is unused and kept only so existing callers keep working.
            """
            try:
                found = self._pdfContains(path, key)
            except Exception as exc:
                print('skipped %s: %s' % (path, exc))
                return
            if found:
                self.copyFile(target, path)

        # Copy a file
        def copyFile(self, path, oldname):
            """Copy the file ``oldname`` into the directory ``path``.

            ``path`` is created (including missing parents) when it does not
            exist. The copy keeps the base name of ``oldname``; an existing
            file with that name in ``path`` is overwritten.
            """
            os.makedirs(path, exist_ok=True)
            destination = os.path.join(path, os.path.basename(oldname))
            try:
                shutil.copyfile(oldname, destination)
            except shutil.SameFileError:
                # Happens when the output folder lies inside the source folder
                # and the walk reaches a file that was copied earlier.
                pass

        # Start working
        @pyqtSlot()
        def working(self,pathLineEdit1,pathLineEdit2,textbox):
            """Validate the inputs, scan the source folder and report the result.

            Args:
                pathLineEdit1: line edit holding the source folder.
                pathLineEdit2: line edit holding the output folder.
                textbox: line edit holding the keyword.

            Shows a message and returns early when any of the three values is
            blank. Otherwise walks the source folder recursively, hands every
            Word file to readDoc() and every PDF file to readPdf(), and finally
            shows either the "done" message or a notice that no Word/PDF file
            was found.
            """
            sourcedir = pathLineEdit1.text()
            targetdir = pathLineEdit2.text()
            key = textbox.text()

            required = (
                (sourcedir, '源路径不能为空'),
                (targetdir, '输出路径不能为空'),
                (key, '关键字不能为空'),
            )
            for value, warning in required:
                if value.strip() == '':
                    self._showMessage(warning)
                    return

            # Process the files.
            found = False
            for root, dirs, files in os.walk(sourcedir):
                for file in files:
                    # Decide by the real extension, ignoring case, instead of
                    # looking for 'doc'/'pdf' in the last four characters.
                    extension = os.path.splitext(file)[1].lower()
                    fullpath = os.path.join(root, file)
                    if extension in self._WORD_EXTENSIONS:
                        self.readDoc(root, fullpath, targetdir, key)
                        found = True
                    elif extension in self._PDF_EXTENSIONS:
                        self.readPdf(root, fullpath, targetdir, key)
                        found = True

            self._showMessage('处理好了' if found else '源路径中没有word和pdf文件')
    
    if __name__ == '__main__':
        # Script entry point: create the Qt application, then show the dialog
        # modally. The dialog's result code is not used.
        qt_app = QApplication(sys.argv)
        main_window = Window()
        main_window.exec_()

    工具演示效果图如下

    工具下载链接:  https://pan.baidu.com/s/1w7CQUAowSgR_d6V2h5OlwA  密码:kyuy


    文末小福利免费视频资源网站:www.sousuohou.com
  • 相关阅读:
    【css】怎么让Chrome支持小于12px 的文字
    java操作linux,调用shell命令
    20个非常有用的Java程序片段
    Java集合详解
    SVN使用指南
    利用SQL语句查询数据库中所有表
    HttpClient-03Http状态管理
    HttpClient-02连接管理
    HttpClient-01基本概念
    IDEA安装插件
  • 原文地址:https://www.cnblogs.com/vicF/p/9803566.html
Copyright © 2011-2022 走看看