zoukankan      html  css  js  c++  java
  • python word

     代码:

      1 #coding=utf-8
      2 __author__ = 'zhm'
      3 from win32com import client as wc
      4 import os
      5 import time
      6 import random
      7 import MySQLdb
      8 import re
      def wordsToHtml(dir):
          """Batch-convert the Word documents under ``dir`` to HTML files.

          Walks ``dir`` recursively and, for every ``.doc``/``.docx`` file, asks
          the WPS COM automation server to save an ``.html`` copy with the same
          base name next to the original document.

          Only Word documents are opened (the extension match is
          case-insensitive), every opened document is closed even when saving
          fails, and the WPS application is always shut down before returning.

          Args:
              dir: Root directory to scan. Names returned by ``os.walk`` are
                  decoded as GBK, so a byte-string path on a GBK-locale system
                  is expected (Python 2).
          """
          # Kingsoft WPS automation server. Per the original author's note,
          # preview builds register 'KWPS.Application' while release builds
          # register 'WPS.Application'.
          word = wc.Dispatch('KWPS.Application')
          try:
              for path, subdirs, files in os.walk(dir):
                  for wordFile in files:
                      wordFile2 = unicode(wordFile, "gbk")
                      dotIndex = wordFile2.rfind(".")
                      if dotIndex == -1:
                          print('********************ERROR: 未取得后缀名!')
                          # Without an extension the file cannot be a Word
                          # document; do not let the whole name be mistaken
                          # for a suffix.
                          continue
                      fileSuffix = wordFile2[dotIndex + 1:].lower()
                      if fileSuffix not in ("doc", "docx"):
                          # Never hand non-Word files to the COM server.
                          continue
                      htmlName = wordFile2[:dotIndex] + ".html"
                      htmlFullName = os.path.join(unicode(path, "gbk"), htmlName)
                      doc = word.Documents.Open(os.path.join(path, wordFile))
                      try:
                          # 8 is the Word automation save-format value for
                          # HTML (wdFormatHTML).
                          doc.SaveAs(htmlFullName, 8)
                      finally:
                          doc.Close()
                      print(u'生成了html文件:' + htmlFullName)
          finally:
              word.Quit()
          print("")
          print("Finished!")
     def _unique_target_path(targetDir, suffix):
         """Return an unused path in ``targetDir`` named <epoch-ms><random><suffix>."""
         while True:
             stamp = str(int(time.time() * 1000)) + str(random.randint(100, 10000))
             candidate = os.path.join(targetDir, stamp + suffix)
             # Several files can be generated within the same millisecond, so
             # retry until the random part yields a name that is not taken.
             if not os.path.exists(candidate):
                 return candidate


     def html_add_to_db(dir):
         """Publish the converted HTML files under ``dir`` and record them in MySQL.

         For every ``.htm``/``.html`` file found while walking ``dir``:

         1. each locally available image referenced by an ``<img src=...>`` tag
            is copied to ``D:/files/images/whzx/`` under a generated name and
            the reference in the page is rewritten to the drive-less path;
         2. the rewritten page is written to ``D:/files/htmls/`` under a
            generated name;
         3. a row ``(title, content)`` is inserted into ``common_article``,
            where ``title`` is the file name without extension and ``content``
            is an iframe snippet pointing at the published page.

         A failed insert is reported and skipped; the remaining files are
         still processed. All successful inserts are committed at the end, and
         the cursor and connection are closed even if processing aborts.

         Args:
             dir: Root directory to scan. Names returned by ``os.walk`` are
                 decoded as GBK, so a byte-string path on a GBK-locale system
                 is expected (Python 2).
         """
         # D:/files is the static directory configured on the web server; the
         # drive prefix is stripped (split(':')[1]) to obtain the served path.
         targetDir = 'D:/files/htmls/'
         imgTarget = 'D:/files/images/whzx/'
         # Captures the value of the src attribute of each <img> tag.
         img = re.compile(r"""<img\s.*?\s?src\s*=\s*['|"]?([^\s'"]+).*?>""", re.I)
         sql = "insert into common_article(title,content) values(%s,%s)"

         conn = MySQLdb.connect(
             host='localhost',
             port=3306,
             user='root',
             passwd='root',
             db='test',
             charset='utf8'
         )
         cur = conn.cursor()
         try:
             for path, subdirs, files in os.walk(dir):
                 # Decode once so that joins with unicode names never trigger
                 # an implicit ASCII decode of the directory.
                 path2 = unicode(path, "gbk")
                 for htmlFile in files:
                     htmlFile2 = unicode(htmlFile, "gbk")
                     dotIndex = htmlFile2.rfind(".")
                     if dotIndex == -1:
                         print('********************ERROR: 未取得后缀名!')
                         continue
                     fileSuffix = htmlFile2[dotIndex + 1:]
                     if fileSuffix not in ("htm", "html"):
                         continue

                     htmlFullName = os.path.join(path2, htmlFile2)
                     # Read the page content.
                     with open(htmlFullName, 'rb') as htFile:
                         htmStrCotent = htFile.read()

                     # Publish the images referenced by the page.
                     for tagContent in img.findall(htmStrCotent):
                         imgSrc = unicode(tagContent, "gbk")
                         imgSrcFullName = os.path.join(path2, imgSrc)
                         if not os.path.isfile(imgSrcFullName):
                             # Remote URL or missing file: leave the reference
                             # untouched instead of aborting the whole run.
                             print(u'WARNING: image not found, left unchanged: ' + imgSrcFullName)
                             continue
                         if not os.path.exists(imgTarget):
                             os.makedirs(imgTarget)
                         # NOTE: published images always get a '.png' name,
                         # whatever the source extension is.
                         targetImgFile = _unique_target_path(imgTarget, '.png')
                         with open(imgSrcFullName, 'rb') as tmpImgFile:
                             imgData = tmpImgFile.read()
                         with open(targetImgFile, "wb") as tmpWriteImgFile:
                             tmpWriteImgFile.write(imgData)
                         htmStrCotent = htmStrCotent.replace(tagContent, targetImgFile.split(":")[1])

                     if not os.path.exists(targetDir):
                         os.makedirs(targetDir)
                     targetFile = _unique_target_path(targetDir, '.html')
                     with open(targetFile, "wb") as tmpTargetFile:
                         tmpTargetFile.write(htmStrCotent)

                     # Wrap the published page in an iframe; this snippet is
                     # what gets stored as the article content.
                     iframeHtml = '''
          <script type="text/javascript" language="javascript">
           function iFrameHeight() {
            var ifm= document.getElementById("iframepage");
            var subWeb = document.frames ? document.frames["iframepage"].document:ifm.contentDocument;
            if(ifm != null && subWeb != null) {
             ifm.height = subWeb.body.scrollHeight;
            }
           }
          </script>
          <iframe src=''' + targetFile.split(':')[1] + '''
           marginheight="0" marginwidth="0" frameborder="0" scrolling="no" width="765" height=100% id="iframepage" name="iframepage" onLoad="iFrameHeight()" ></iframe>
          '''
                     title = os.path.splitext(htmlFile2)[0]
                     try:
                         cur.execute(sql, (title, iframeHtml))
                     except MySQLdb.Error as e:
                         # Best effort: report the failure and keep going.
                         print("Error: unable to insert data")
                         print(repr(e))
             conn.commit()
         finally:
             cur.close()
             # Close the database connection.
             conn.close()
    if __name__ == '__main__':
        # Script entry point: convert the Word documents first, then publish
        # the generated HTML files from the same directory into the database.
        for step in (wordsToHtml, html_add_to_db):
            step('d:/word')
  • 相关阅读:
    Android-使用AIDL挂断电话
    新变化---转战新博客
    Spring Cloud Config 分布式配置中心【Finchley 版】
    Spring Boot2.0 整合 Kafka
    Spring Cloud 分布式链路跟踪 Sleuth + Zipkin + Elasticsearch【Finchley 版】
    Spring MVC 5 + Thymeleaf 基于Java配置和注解配置
    【机器学习】使用gensim 的 doc2vec 实现文本相似度检测
    【机器学习】SKlearn + XGBoost 预测 Titanic 乘客幸存
    【深度学习】keras + tensorflow 实现猫和狗图像分类
    iScroll.js 向上滑动异步加载数据回弹问题
  • 原文地址:https://www.cnblogs.com/kamil/p/5772903.html
Copyright © 2011-2022 走看看