1. 环境安装
pip install pywin32
pip install python-docx
2. 关键代码:根据表格标题搜索对应的内容
def _find_neighbor_values(d, sts, row_offset, col_offset):
    """Collect the text of the cell at a fixed offset from every cell matching ``sts``.

    Every table in ``d.tables`` is scanned cell by cell. Whenever a cell's
    text equals ``sts`` exactly, the text of the cell ``row_offset`` rows
    below and ``col_offset`` columns to the right of it is appended to the
    result.

    Args:
        d: Document-like object exposing ``tables``; each table must provide
            ``rows``, ``columns`` and ``cell(row, col)``.
        sts: Title text to look for (exact string comparison).
        row_offset: Non-negative row distance from the match to the value cell.
        col_offset: Non-negative column distance from the match to the value cell.

    Returns:
        A list of the neighbouring cells' text, in scan order (table by table,
        row by row, column by column). Empty when nothing matches.
    """
    found = []
    # Fetch the table list once instead of on every single cell access.
    for table in d.tables:
        row_count = len(table.rows)
        col_count = len(table.columns)
        for row_idx in range(row_count):
            for col_idx in range(col_count):
                if table.cell(row_idx, col_idx).text != sts:
                    continue
                target_row = row_idx + row_offset
                target_col = col_idx + col_offset
                # A title sitting on the table's last row/column has no
                # neighbour in that direction; skip it rather than index
                # past the edge of the table.
                if target_row >= row_count or target_col >= col_count:
                    continue
                found.append(table.cell(target_row, target_col).text)
    return found


def find_row_value(d, sts):
    """Return the text of the cell directly below each cell whose text is ``sts``.

    Use this for tables where the title is in a header row and the value is
    in the next row of the same column.

    Args:
        d: Document-like object exposing ``tables``.
        sts: Title text to look for (exact match).

    Returns:
        List of the values found, in scan order. Matches located on the last
        row of a table are skipped because there is no cell below them.
    """
    return _find_neighbor_values(d, sts, 1, 0)


def find_cel_value(d, sts):
    """Return the text of the cell immediately right of each cell whose text is ``sts``.

    Use this for tables where the title is in one column and the value is in
    the next column of the same row.

    Args:
        d: Document-like object exposing ``tables``.
        sts: Title text to look for (exact match).

    Returns:
        List of the values found, in scan order. Matches located in the last
        column of a table are skipped because there is no cell to their right.
    """
    return _find_neighbor_values(d, sts, 0, 1)
3.使用代码如下:
#coding:utf-8
"""Copy values found in the tables of a Word .docx file into an Excel workbook.

The .docx file is read with python-docx; the workbook is driven through the
Excel COM automation object, so the script has to run on Windows with Excel
installed.
"""
import os
import win32com
import win32api
from win32com.client import Dispatch, constants
from docx import Document


def parse_docx(f, workbook, num):
    """Read one .docx file and write one worksheet row per "名称" value found.

    Args:
        f: Path of the .docx file; passed to ``Document``.
        workbook: Excel workbook COM object; handed on to ``write_excel``.
        num: Worksheet row that receives the first record. Each further
            record goes to the following row.

    Returns:
        The row number following the last row written (``num`` itself when
        the document contains no "名称" value).

    Raises:
        ValueError: If there are records to write but the document has no
            "网站名称" or no "url" value to go with them.
    """
    d = Document(f)
    name = find_row_value(d, "网站名称")
    print(name)
    url = find_row_value(d, "url")
    print(url)
    fx_name = find_cel_value(d, "名称")
    if not fx_name:
        # Nothing to record for this document.
        return num
    if not name or not url:
        raise ValueError(
            "no '网站名称' or 'url' value found in the tables of " + str(f)
        )
    for fx_name_str in fx_name:
        # The sequence number written to column 2 is always the row number minus one.
        xuhao = str(num - 1)
        write_excel(workbook, num, name[0], xuhao, url[0], fx_name_str)
        num = num + 1
    return num


def write_excel(workbook, i_row, name, xuhao, url, fx_name1):
    """Write one record into row ``i_row`` of the first worksheet and save.

    Args:
        workbook: Excel workbook COM object.
        i_row: 1-based worksheet row to write to.
        name: Value for column 1.
        xuhao: Value for column 2 (sequence number).
        url: Value for column 3.
        fx_name1: Value for column 4.

    A ``TypeError`` raised while writing, reporting or saving is printed and
    swallowed, so a single bad record does not abort the run.
    """
    try:
        first_sheet = workbook.Worksheets(1)
        # Canonical COM member casing ("Value") is used so the call does not
        # depend on case-insensitive member lookup.
        first_sheet.Cells(i_row, 1).Value = name
        first_sheet.Cells(i_row, 2).Value = xuhao
        first_sheet.Cells(i_row, 3).Value = url
        first_sheet.Cells(i_row, 4).Value = fx_name1
        print('成功写入:'+name+" 的信息", "这是第"+str(i_row)+"个")
        workbook.Save()
    except TypeError as e:
        print(e)


def _find_neighbor_values(d, sts, row_offset, col_offset):
    """Collect the text of the cell at a fixed offset from every cell matching ``sts``.

    Args:
        d: Document-like object exposing ``tables``; each table must provide
            ``rows``, ``columns`` and ``cell(row, col)``.
        sts: Title text to look for (exact string comparison).
        row_offset: Non-negative row distance from the match to the value cell.
        col_offset: Non-negative column distance from the match to the value cell.

    Returns:
        A list of the neighbouring cells' text, in scan order. Empty when
        nothing matches.
    """
    found = []
    # Fetch the table list once instead of on every single cell access.
    for table in d.tables:
        row_count = len(table.rows)
        col_count = len(table.columns)
        for row_idx in range(row_count):
            for col_idx in range(col_count):
                if table.cell(row_idx, col_idx).text != sts:
                    continue
                target_row = row_idx + row_offset
                target_col = col_idx + col_offset
                # A title on the table's last row/column has no neighbour in
                # that direction; skip it rather than index past the edge.
                if target_row >= row_count or target_col >= col_count:
                    continue
                found.append(table.cell(target_row, target_col).text)
    return found


def find_row_value(d, sts):
    """Return the text of the cell directly below each cell whose text is ``sts``.

    Matches on the last row of a table are skipped (no cell below them).
    """
    return _find_neighbor_values(d, sts, 1, 0)


def find_cel_value(d, sts):
    """Return the text of the cell immediately right of each cell whose text is ``sts``.

    Matches in the last column of a table are skipped (no cell to their right).
    """
    return _find_neighbor_values(d, sts, 0, 1)


def _first_empty_row(sheet, start=3, stop=1000):
    """Return the first row in ``[start, stop)`` whose column-2 cell is empty.

    Returns ``stop`` when every inspected row is already filled, so existing
    data is never overwritten.
    """
    for row in range(start, stop):
        if sheet.Cells(row, 2).Value is None:
            return row
    return stop


def main():
    """Append the records of D:\\test\\1.docx to D:\\test\\2.xlsx."""
    # Word itself is not started: the .docx is read through Document, and
    # nothing in this script used a Word COM object.
    excel = win32com.client.Dispatch('Excel.Application')
    try:
        excel.Visible = False
        # Raw strings: in a normal literal "\t" and "\2" are escape
        # sequences and would corrupt the Windows path.
        workbook = excel.Workbooks.Open(r'D:\test\2.xlsx')
        # Continue below the rows written by earlier runs.
        i = _first_empty_row(workbook.Worksheets(1))
        print("从上次的第", i, "行录入 ")
        PATH = r"D:\test"  # Windows folder holding the input document
        doc = "1.docx"
        parse_docx(os.path.join(PATH, doc), workbook, i)
    finally:
        # Always shut the hidden Excel process down, even after an error.
        excel.Quit()


if __name__ == "__main__":
    main()
参考链接:
https://blog.csdn.net/qq_34475777/article/details/62055523
https://blog.csdn.net/qq_34475777/article/details/77586663