代码:
#coding=utf-8
"""Batch-convert Word documents to HTML through WPS and register them in MySQL.

Step 1 (``wordsToHtml``) drives Kingsoft WPS over COM to save every
``.doc``/``.docx`` file under a directory as an ``.html`` file next to it.

Step 2 (``html_add_to_db``) copies every ``.htm``/``.html`` file and the images
it references into the web server's static directories, rewrites the image
``src`` attributes accordingly, and inserts one ``common_article`` row per
page whose content is an ``<iframe>`` pointing at the published copy.

The code is written to run unchanged on Python 2.7 and Python 3.
"""
__author__ = 'zhm'

import os
import random
import re
import shutil
import time

import MySQLdb
from win32com import client as wc

# COM ProgID of the WPS word processor. Per the original author's note:
# preview builds use the "KWPS" prefix, release builds use "WPS".
WPS_PROG_ID = 'KWPS.Application'

# File-format code handed to Document.SaveAs; 8 is wdFormatHTML in the
# Word object model that WPS mirrors.
WD_FORMAT_HTML = 8

WORD_SUFFIXES = ('.doc', '.docx')
HTML_SUFFIXES = ('.htm', '.html')

# Encoding used to decode byte strings (the directory argument on Python 2 and
# image paths found inside the exported HTML).
LEGACY_ENCODING = 'gbk'

# D:/files is the static directory configured on the web server; published
# pages and images are referenced by their path below the drive letter.
HTML_TARGET_DIR = 'D:/files/htmls/'
IMAGE_TARGET_DIR = 'D:/files/images/whzx/'

# NOTE(review): credentials are hard-coded exactly as in the original script;
# move them to configuration before using this outside a local test database.
DB_SETTINGS = dict(
    host='localhost',
    port=3306,
    user='root',
    passwd='root',
    db='test',
    charset='utf8',
)

# Captures the value of the src attribute of every <img> tag. The pattern is a
# bytes pattern because pages are processed as raw bytes so that their original
# encoding is preserved when they are written back out.
IMG_SRC_RE = re.compile(
    br"""<img\s.*?\s?src\s*=\s*['|"]?([^\s'"]+).*?>""", re.I)


def _to_text(value, encoding=LEGACY_ENCODING):
    """Return *value* as text, decoding it with *encoding* if it is bytes."""
    if isinstance(value, bytes):
        return value.decode(encoding)
    return value


def _suffix(file_name):
    """Return the lower-cased extension of *file_name* including the dot."""
    return os.path.splitext(file_name)[1].lower()


def _iter_files(root, suffixes):
    """Yield ``(directory, file_name)`` for files under *root* with *suffixes*.

    Walking with a text path makes ``os.walk`` return text names on Python 2 as
    well, so no per-name decoding with a guessed code page is needed.
    Files without any extension are reported and skipped.
    """
    for path, _subdirs, files in os.walk(_to_text(root)):
        for name in files:
            if "." not in name:
                print('********************ERROR: 未取得后缀名!')
                continue
            if _suffix(name) in suffixes:
                yield path, name


def _unique_target(directory, suffix):
    """Return a not-yet-existing path in *directory*, creating the directory.

    Names are the current time in milliseconds followed by a random number, as
    before; the loop retries in the unlikely case of a collision instead of
    silently reusing an existing file.
    """
    if not os.path.exists(directory):
        os.makedirs(directory)
    while True:
        stamp = str(int(time.time() * 1000)) + str(random.randint(100, 10000))
        candidate = os.path.join(directory, stamp + suffix)
        if not os.path.exists(candidate):
            return candidate


def _web_path(local_path):
    """Return *local_path* without its drive letter (``D:/a/b`` -> ``/a/b``)."""
    _drive, colon, rest = local_path.partition(":")
    return rest if colon else local_path


def _publish_images(content, source_dir):
    """Copy the images referenced by *content* and rewrite their ``src``.

    *content* is the raw bytes of an HTML page located in *source_dir*. Every
    distinct ``src`` value that resolves to an existing file is copied into
    ``IMAGE_TARGET_DIR`` under a fresh name and replaced in the page by the
    web path of the copy. References that do not resolve to a local file
    (remote URLs, missing files) are reported and left untouched.

    Returns the rewritten page bytes.
    """
    handled = set()
    for src in IMG_SRC_RE.findall(content):
        if src in handled:
            # replace() below already rewrote every occurrence of this src.
            continue
        handled.add(src)
        image_path = os.path.join(source_dir, _to_text(src))
        if not os.path.isfile(image_path):
            print(u"Warning: image not found, src left unchanged: " + image_path)
            continue
        # NOTE(review): the copy is always named *.png, as in the original
        # script, whatever the type of the source image.
        target = _unique_target(IMAGE_TARGET_DIR, '.png')
        shutil.copyfile(image_path, target)
        content = content.replace(src, _web_path(target).encode('ascii'))
    return content


def _iframe_snippet(web_path):
    """Return the HTML stored in the database: an auto-sizing iframe."""
    return '''
<script type="text/javascript" language="javascript">
function iFrameHeight() {
    var ifm= document.getElementById("iframepage");
    var subWeb = document.frames ? document.frames["iframepage"].document:ifm.contentDocument;
    if(ifm != null && subWeb != null) {
        ifm.height = subWeb.body.scrollHeight;
    }
}
</script>
<iframe src=''' + web_path + '''
marginheight="0" marginwidth="0" frameborder="0" scrolling="no" width="765" height=100% id="iframepage" name="iframepage" onLoad="iFrameHeight()" ></iframe>
'''


def _publish_html(cur, source_dir, html_name):
    """Publish one HTML file and insert its article row through *cur*."""
    source_path = os.path.join(source_dir, html_name)
    with open(source_path, 'rb') as source:
        content = source.read()
    content = _publish_images(content, source_dir)

    target_path = _unique_target(HTML_TARGET_DIR, '.html')
    with open(target_path, 'wb') as target:
        target.write(content)

    title = os.path.splitext(html_name)[0]
    sql = "insert into common_article(title,content) values(%s,%s)"
    try:
        cur.execute(sql, (title, _iframe_snippet(_web_path(target_path))))
    except MySQLdb.Error as exc:
        # Best effort, as before: a failed row must not abort the batch,
        # but the reason is now shown instead of being swallowed.
        print("Error: unable to insert data")
        print(exc)


def wordsToHtml(dir):
    """Convert every Word document below *dir* to an HTML file beside it.

    Only ``.doc``/``.docx`` files (case-insensitive) are opened in WPS. Each
    document is closed and the WPS application is shut down even when a
    conversion fails, so no orphaned WPS process is left behind.

    Args:
        dir: root directory that is searched recursively.
    """
    # The COM server runs out of process, so a relative path may not resolve
    # against this script's working directory; always hand it absolute paths.
    root = os.path.abspath(_to_text(dir))
    word = wc.Dispatch(WPS_PROG_ID)
    try:
        for path, word_file in _iter_files(root, WORD_SUFFIXES):
            html_name = os.path.splitext(word_file)[0] + ".html"
            html_full_name = os.path.join(path, html_name)
            doc = word.Documents.Open(os.path.join(path, word_file))
            try:
                doc.SaveAs(html_full_name, WD_FORMAT_HTML)
            finally:
                doc.Close()
            print(u'生成了html文件:' + html_full_name)
    finally:
        word.Quit()
    print("")
    print("Finished!")


def html_add_to_db(dir):
    """Publish every HTML file below *dir* and insert it into the database.

    For each ``.htm``/``.html`` file (case-insensitive) the referenced images
    and the page itself are copied to the static web directories and a row is
    inserted into ``common_article``. All rows are committed together at the
    end; if publishing raises, nothing is committed. The cursor and the
    connection are closed in every case.

    Args:
        dir: root directory that is searched recursively.
    """
    conn = MySQLdb.connect(**DB_SETTINGS)
    try:
        cur = conn.cursor()
        try:
            for path, html_file in _iter_files(dir, HTML_SUFFIXES):
                _publish_html(cur, path, html_file)
        finally:
            cur.close()
        conn.commit()
    finally:
        conn.close()


if __name__ == '__main__':
    wordsToHtml('d:/word')
    html_add_to_db('d:/word')