# -*- coding: utf-8 -*-
# @Time     : 2021/6/21 16:51
# @Author   : Xxg
# @File     : 作业归档完善版.py  (homework archiving, polished version)
# @Software : PyCharm
"""Scrape a cnblogs archive listing page, follow each "read full text"
link, and save every article (post date + body paragraphs) into its own
.docx file named after the post title.

NOTE(review): ``url`` and the User-Agent header are blank placeholders —
they must be filled in before the script can do anything useful.
"""

import random
import requests
import pymysql
from lxml import etree
import docx

headers = {
    # TODO: set a real browser User-Agent before running
    "User-Agent": ""
}
url = ''  # TODO: the listing-page URL (e.g. a cnblogs category page)


def _fetch_html(page_url):
    """GET *page_url* and return the parsed lxml HTML tree."""
    response = requests.get(page_url, headers=headers)
    return etree.HTML(response.text)


def _archive_post(post_url, out_dir="D:\\"):
    """Download one post and write its date and paragraphs to <title>.docx.

    Fixes two defects of the original:
    * the save path was written as ``"D:\"`` — the backslash escaped the
      closing quote, making the whole file a SyntaxError;
    * the post date (an xpath result *list*) was passed directly to
      ``add_paragraph``, which requires a string.
    """
    tree = _fetch_html(post_url)
    title = tree.xpath('//div[@class="post"]/h1[@class="postTitle"]/a/span/text()')
    print(title)
    content = tree.xpath('//div[@class="blogpost-body blogpost-body-html"]/p/text()')
    print(content)
    post_date = tree.xpath('//div[@class="postDesc"]/span[@id="post-date"]/text()')
    print(post_date)

    document = docx.Document()
    if post_date:  # add_paragraph() needs a str, not the xpath list
        document.add_paragraph(post_date[0])
    for paragraph in content:
        document.add_paragraph(paragraph)
    if title:  # guard against a page whose title xpath matched nothing
        document.save(out_dir + title[0] + ".docx")


def main():
    """Fetch the listing page and archive every linked post."""
    listing = _fetch_html(url)
    # Listing-page metadata — collected for parity with the original
    # script, though only the "read full text" links are actually used.
    date = listing.xpath('//div[@class="dayTitle"]/a/text()')
    name = listing.xpath('//div[@class="postTitle"]/a/span/text()')
    zhaiyao = listing.xpath('//div[@class="postCon"]/div[@class="c_b_p_desc"]/text()')
    # 链接 — "read full text" links pointing at the individual posts
    yueduquanwen = listing.xpath('//div[@class="postCon"]/div[@class="c_b_p_desc"]/a/@href')
    for link in yueduquanwen:
        _archive_post(link)
    print(yueduquanwen)


if __name__ == "__main__":
    main()