import re

import requests
from bs4 import BeautifulSoup

# Index page of the novel; every chapter link on it contains this path fragment.
BASE_URL = 'http://www.biquge6.com/19_19336/'
CHAPTER_HREF = re.compile('/19_19336/')

# NOTE(review): the original literal 'C:\UsersDELLDesktop...' had lost its
# backslashes and is a SyntaxError in Python 3 (\U starts a unicode escape).
# Reconstructed as a raw string — confirm the actual desktop path.
OUT_PATH = r'C:\Users\DELL\Desktop\大数据应用开发\圣墟.doc'


def fetch_soup(url):
    """GET *url* and parse the GBK-encoded page into a BeautifulSoup tree.

    The site serves GBK, so the response bytes are decoded explicitly;
    'html.parser' is named explicitly to avoid a parser-guessing warning.
    """
    response = requests.get(url)  # local name: never shadow the `re` module
    return BeautifulSoup(response.content.decode('gbk'), 'html.parser')


def clean_chapter(soup):
    """Return 'title\\ntext\\n' for one chapter page soup.

    `[0].text[1:]` drops the single leading whitespace character that the
    site puts in front of the <h1> title.
    """
    title = soup.find_all('h1')[0].text[1:]
    body = soup.find_all(id='content')[0].text
    # The site indents paragraphs with runs of NBSP. The original wrote
    # 'xa0'*8 (a literal 24-char string, which matched nothing); '\xa0' is
    # the actual non-breaking-space character. Runs become line breaks.
    body = body.replace('\xa0' * 8, '\n')
    # Drop stray ASCII spaces left over from the HTML (harmless in Chinese
    # prose; mirrors the original's `.replace(' ', '')` intent).
    body = body.replace(' ', '')
    return title + '\n' + body + '\n'


def main():
    """Download every chapter linked from the index and append it to OUT_PATH."""
    index = fetch_soup(BASE_URL)
    # NOTE(review): the original comment claimed the newest 12 chapters are
    # excluded, but no slicing was ever done — behavior kept as-is; slice
    # `chapters` here if that exclusion is actually wanted.
    chapters = index.find_all(href=CHAPTER_HREF)
    total = len(chapters)
    print('开始下载:')
    # Open the output file once instead of reopening it for every chapter.
    with open(OUT_PATH, 'a', encoding='utf-8') as out:
        for i, link in enumerate(chapters, start=1):
            print('正在下载第' + str(i) + '章,共' + str(total) + '章')
            # href looks like '/19_19336/NNNNNN.html'; [10:] strips the
            # leading '/19_19336/' so it can be joined onto BASE_URL.
            chapter_url = BASE_URL + link.get('href')[10:]
            out.write(clean_chapter(fetch_soup(chapter_url)))
    print('下载完成!')


if __name__ == '__main__':
    main()
# ===左手举个栗子,右手举个锤子===