zoukankan      html  css  js  c++  java
  • .doc 2 .docx可用代码

      1 # # -*- coding:utf-8 -*-
      2 #读取docx中的文本代码示例
import os
import shutil
import time

import docx
import eventlet  # supplies Timeout and monkey_patch, used by doSaveAas
from pyhanlp import *
from win32com import client as wc
      9 word = wc.Dispatch('Word.Application')
     10 '''重启计时器'''
     11 def restart(time_start,time_end,sub_deadline):
     12     #E:pycharmWorkPlaceGraduation_projectUtilfile_process.py
     13     time_sub = time_end - time_start
     14 
     15     i = 0
     16     print("NJNNNNNNNNN",time_sub,"NNNNNNNNNNNNNNN")
     17     if (time_sub > sub_deadline):
     18         str = "CHCP 65001" + "&&"
     19         str += "E:" + "&&"
     20         str += r"cd E:pycharmWorkPlaceGraduation_projectUtil" + "&&"
     21         str += "python file_process.py "
     22         # print(str)
     23         print("TIME______", i, "________", time_sub)
     24         i = i + 1
     25         cmd = os.system(str)
     26         # print(cmd)
     27 '''文件操作'''
     28 #将doc转成docx
     29 def doSaveAas(doc_path,docx_path):
     30     # time_start = time.time()
     31     print("MMMMMMMMMMMMMMMMMMMMMMMMMMMMM")
     32     eventlet.monkey_patch()#必须加这条代码
     33     with eventlet.Timeout(10, False):  # 设置超时时间为2秒
     34         # print("*$$$$$$$$$$$$$$$$")
     35         # time.sleep(2)
     36         # print("^^^^^^^^^^^^")
     37         doc = word.Documents.Open(doc_path)  # 目标路径下的文件
     38         # print("@@@@@@@@@@@@")
     39         doc.SaveAs(docx_path, 12, False, "", True, "", False, False, False, False)  # 转化后路径下的文件
     40         doc.Close()
     41         print("《《《《《《《《《《《《《《《《《《《《《《《《《《《《《")
     42         # time_end = time.time()
     43         # restart(time_start,time_end,20)
     44 
     45 #将相对路径转换乘绝对路径,同时调用转换文件进行转换,同时再顺便删除之前的文件
     46 def Dir_doc2docx(Dir_path):
     47     i=0
     48     for file_name in os.listdir(Dir_path):
     49 
     50         print("********************************************************************************************")
     51         try:
     52             print("文件名:"+file_name)
     53             file_path = os.path.join(Dir_path, file_name)
     54             print("文件后缀:"+os.path.splitext(file_name)[1] )
     55             if os.path.splitext(file_name)[1] == '.doc':
     56                 i = i + 1
     57                 abs_file_path=os.path.abspath(file_path)
     58                 print(i," 绝对路径:"+abs_file_path)
     59                 doSaveAas(abs_file_path,abs_file_path+'x')
     60                 os.remove(file_path)
     61 
     62         except:
     63             continue
     64 
     65 def Get_num_file_end(Dir_path,end):
     66     i=0
     67     for file_name in os.listdir(Dir_path):
     68         print("********************************************************************************************")
     69         try:
     70             if os.path.splitext(file_name)[1] == end:
     71                 i=i+1
     72         except:
     73             continue
     74     return i
     75 #获取文件值
     76 def Get_file_value(Dir_path,file_name):
     77     paragraph_id=[]
     78     paragraph_value=[]
     79     file_path = os.path.join(Dir_path, file_name)
     80     file = docx.Document(file_path)
     81     # 输出段落编号及段落内容
     82     for i in range(len(file.paragraphs)):
     83         paragraph_id.append(i)
     84         paragraph_value.append(file.paragraphs[i].text.strip().replace(u'u3000', u'').replace(u'xa0', u'').replace(' ', ''))
     85     return paragraph_id,paragraph_value
     86 #移动文件
     87 def remove_file(Dir_path,To_dirpath):
     88     i = 0
     89     for file_name in os.listdir(Dir_path):
     90         print("********************************************************************************************")
     91         try:
     92             print("文件名:" + file_name)
     93             file_path = os.path.join(Dir_path, file_name)
     94             print("文件后缀:" + os.path.splitext(file_name)[1])
     95             if os.path.splitext(file_name)[1] == '.docx':
     96                 i = i + 1
     97                 abs_file_path = os.path.abspath(file_path)
     98                 abs_to_file_path=os.path.abspath(os.path.join(To_dirpath, file_name))
     99                 shutil.move(abs_file_path,abs_to_file_path)
    100                 print(i, " 绝对路径:" + abs_file_path)
    101                 print(i, " 目标绝对路径:" + abs_to_file_path)
    102 
    103         except:
    104             continue
    105 if __name__ =="__main__":
    106     print("AAAAAAAAA")
    107     # a=Get_num_doc("D:ATESTjie")
    108     Dir_doc2docx("D:ATESTjie")
    109     # remove_file(r"D:ATESTjie", r"D:ATEST	ojie")
    110     print("LLLLLLL")
    111     # word.Quit()
  • 相关阅读:
    flume sink两种类型 file_rool 自定义sing com.mycomm.MySink even if there is only one event, the event has to be sent in an array
    为什么引入进程20年后,又引入线程?
    As of Flume 1.4.0, Avro is the default RPC protocol.
    Google Protocol Buffer 的使用和原理
    Log4j 2
    统一日志 统一订单
    网站行为跟踪 Website Activity Tracking Log Aggregation 日志聚合 In comparison to log-centric systems like Scribe or Flume
    Percolator
    友盟吴磊:移动大数据平台的架构、实践与数据增值
    Twitter的RPC框架Finagle简介
  • 原文地址:https://www.cnblogs.com/smartisn/p/14408423.html
Copyright © 2011-2022 走看看