# -*- coding: utf-8 -*-
"""
Created on Mon Jan 14 16:42:02 2019
@author: Administrator
"""
import requests
from bs4 import BeautifulSoup
import time

header = {
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip,deflate",
    "Accept-Language": "zh-CN,zh;q=0.8",
}

index_url = "http://www.zbjxs.net/4/4589/"
r = requests.get(index_url, headers=header)
r.raise_for_status()
r.encoding = r.apparent_encoding  # the page does not declare its encoding reliably, so detect it
index_html = r.text
index_soup = BeautifulSoup(index_html, "html.parser")
t7 = index_soup.select("li span a")  # one <a> per chapter: <li><span><a href="...">

links = []
titles = []
for i in range(len(t7)):
    links.append("http://www.zbjxs.net" + t7[i]["href"])
    titles.append(t7[i].get_text())

f = open("d:/小说.txt", "a", encoding="utf-8")
for i in range(len(links)):
    st = time.time()
    r1 = requests.get(links[i], headers=header)  # send the same headers on chapter pages
    r1.encoding = r1.apparent_encoding
    html = r1.text
    soup = BeautifulSoup(html, "html.parser")
    artitle = titles[i] + " " + soup.select(".article-con")[0].get_text()
    f.write(artitle)
    en = time.time()
    runtime = en - st
    print("Wrote chapter:", i + 1, "remaining:", len(links) - i - 1, "chapters, elapsed:", runtime)
f.close()
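One way to harden the loop above (a sketch, not part of the original script) is to reuse a requests.Session, set timeouts, check status codes, and pause briefly between chapters. The site URLs and the .article-con selector are carried over from the script; the delay value and the minimal User-Agent are assumptions.

import time
import requests
from bs4 import BeautifulSoup

BASE = "http://www.zbjxs.net"   # same site as above
INDEX = BASE + "/4/4589/"       # same index page as above

session = requests.Session()    # reuse one connection for all chapter requests
session.headers.update({"User-Agent": "Mozilla/5.0"})  # minimal headers; the full dict above works too

resp = session.get(INDEX, timeout=10)
resp.raise_for_status()
resp.encoding = resp.apparent_encoding
index_soup = BeautifulSoup(resp.text, "html.parser")

with open("d:/小说.txt", "a", encoding="utf-8") as f:  # `with` closes the file even if a request fails
    for a in index_soup.select("li span a"):           # same chapter selector as above
        page = session.get(BASE + a["href"], timeout=10)
        page.raise_for_status()
        page.encoding = page.apparent_encoding
        soup = BeautifulSoup(page.text, "html.parser")
        body = soup.select(".article-con")             # same content selector as above
        if body:                                       # skip chapters whose layout differs
            f.write(a.get_text() + "\n" + body[0].get_text() + "\n")
        time.sleep(0.5)                                # assumed polite delay between requests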
Below is the same script with the commented-out experiments left in, kept for later review.
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 14 16:42:02 2019
@author: Administrator
"""
import requests
from bs4 import BeautifulSoup
import re
import time

# The header dict is the same as in the script above.
index_url = "http://www.zbjxs.net/4/4589/"
r = requests.get(index_url, headers=header)
r.raise_for_status()
r.encoding = r.apparent_encoding
index_soup = BeautifulSoup(r.text, "html.parser")

"""
# Locator experiments against the index page:
t0=index_soup.find_all("span")                                # exact: 22 matches
t1=index_soup.find_all("a")
t2=index_soup.find_all("a",href=re.compile("html$"))          # not exact: 24 matches
t3=index_soup.find_all("a",href=re.compile(r"\d{7}\.html$"))  # exact: 22 matches
t4=index_soup.find_all(href=re.compile(r"\d{7}\.html$"))      # exact: 22 matches
t5=index_soup.find_all({'href':'re.compile("d{7}")'})         # abandoned experiment: find_all does not take a dict like this
# Regex for Chinese characters: r'[\u4e00-\u9fa5]{2,4}' matches 2 to 4 Han characters
t6=index_soup.find_all("a",string=re.compile(r'[\u4e00-\u9fa5]{2,4}'))
"""

t7 = index_soup.select("li span a")  # works great: exact, 22 matches

"""
# Each group below illustrates one lookup feature.
# Find by tag name:
c0=index_soup.select("a")    # all <a> tags
c1=index_soup.select("li")   # all <li> tags
# Find by class name:
d0=index_soup.select(".home")
d1=index_soup.select(".line")
d2=index_soup.select("[class~=line]")  # equivalent to d1; in my tests this ~= form of select only works for class, not other attributes
"""

"""
Getting tags by id:

soup.select("#link1")   # select by id
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

soup.select("a#link2")  # id plus tag name, for a more specific match
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

You can also pass a comma-separated list of selectors; a tag is captured if it matches any one of them:

soup.select("#link1,#link2")   # tags whose id is link1 or link2
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

(Excerpted from SuPhoebe on CSDN: https://blog.csdn.net/u013007900/article/details/54728408)

These are useful, but the page scraped here has no id attributes. Understandably, bs4 provides
dedicated shortcuts for class and id because they are the two most commonly used attributes.
"""

# Next, the href attribute selectors actually used in this post; these are more general.
"""
Select tags by whether an attribute exists:

soup.select('a[href]')   # all <a> tags that have an href attribute
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

Select tags by a specific attribute value:

soup.select('a[href="http://example.com/elsie"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

soup.select('a[href^="http://example.com/"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select('a[href$="tillie"]')
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select('a[href*=".com/el"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

To spell these out:
soup.select('a[href^="http://example.com/"]') finds tags whose href value starts with "http://example.com/".
soup.select('a[href$="tillie"]') finds tags whose href value ends with "tillie".
soup.select('a[href*=".com/el"]') finds tags whose href value contains the string ".com/el",
so only href="http://example.com/elsie" matches.

Query only the first tag that satisfies the condition:

soup.select_one(".sister")   # first matching tag only
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

(Excerpted from SuPhoebe on CSDN: https://blog.csdn.net/u013007900/article/details/54728408)
"""

#e=index_soup.select('a[href]')   # all <a> tags that have an href attribute

# The code below uses t7.
links = []
titles = []
for i in range(len(t7)):
    links.append("http://www.zbjxs.net" + t7[i]["href"])
    titles.append(t7[i].get_text())

f = open("d:/丝袜合集.txt", "a", encoding="utf-8")
for i in range(len(links)):
    st = time.time()
    r1 = requests.get(links[i])
    r1.encoding = r1.apparent_encoding
    html = r1.text
    soup = BeautifulSoup(html, "html.parser")
    artitle = titles[i] + " " + soup.select(".article-con")[0].get_text()
    f.write(artitle)
    en = time.time()
    runtime = en - st
    print("Wrote chapter:", i + 1, "remaining:", len(links) - 1 - i, "chapters, elapsed:", runtime)
f.close()

# Leftover scraps from development, kept for reference:
"""
titles=[]
titles.append(t7[i].get_text())
"""
"""
r1=requests.get(links[i])
r1.encoding=r1.apparent_encoding
html=r1.text
soup=BeautifulSoup(html,"html.parser")
#txt=soup.find_all(string=)
#txt=titles[i]+html
"""
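To make the excerpted selector rules concrete, here is a small self-contained demo (not from the original post). The inline HTML is invented to mirror the chapter index scraped above, and it exercises the attribute selectors from the quote plus a find_all with a compiled regex, like the t3 experiment.

from bs4 import BeautifulSoup
import re

# Hypothetical snippet shaped like the chapter index scraped above.
demo_html = """
<ul>
  <li><span><a href="/4/4589/1234567.html">Chapter One</a></span></li>
  <li><span><a href="/4/4589/1234568.html">Chapter Two</a></span></li>
  <li><a href="/about">About</a></li>
</ul>
"""
demo = BeautifulSoup(demo_html, "html.parser")

print(demo.select("a[href]"))              # every <a> that has an href at all: 3 tags
print(demo.select('a[href^="/4/4589/"]'))  # href starts with /4/4589/: the 2 chapters
print(demo.select('a[href$=".html"]'))     # href ends with .html: the 2 chapters
print(demo.select('a[href*="4589"]'))      # href contains 4589: the 2 chapters
print(demo.select_one("li span a"))        # first match only, like select_one(".sister")

# find_all with a compiled regex, as in the t3/t4 experiments above:
print(demo.find_all("a", href=re.compile(r"\d{7}\.html$")))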