# -*- coding: utf-8 -*-
"""
Created on Mon Jan 14 16:42:02 2019
@author: Administrator
"""
import requests
from bs4 import BeautifulSoup
import time

header = {
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip,deflate",
    "Accept-Language": "zh-CN,zh;q=0.8",
}

index_url = "http://www.zbjxs.net/4/4589/"
r = requests.get(index_url, headers=header)
r.raise_for_status()
r.encoding = r.apparent_encoding  # the page does not declare its encoding reliably, so detect it
index_html = r.text
index_soup = BeautifulSoup(index_html, "html.parser")
t7 = index_soup.select("li span a")  # one <a> per chapter: <li><span><a href="...">

links = []
titles = []
for i in range(len(t7)):
    links.append("http://www.zbjxs.net" + t7[i]["href"])
    titles.append(t7[i].get_text())

f = open("d:/小说.txt", "a", encoding="utf-8")
for i in range(len(links)):
    st = time.time()
    r1 = requests.get(links[i], headers=header)  # send the same headers on chapter pages
    r1.encoding = r1.apparent_encoding
    html = r1.text
    soup = BeautifulSoup(html, "html.parser")
    artitle = titles[i] + " " + soup.select(".article-con")[0].get_text()
    f.write(artitle)
    en = time.time()
    runtime = en - st
    print("Wrote chapter:", i + 1, "remaining:", len(links) - i - 1, "chapters, elapsed:", runtime)
f.close()
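One way to harden the loop above (a sketch, not part of the original script) is to reuse a requests.Session, set timeouts, check status codes, and pause briefly between chapters. The site URLs and the .article-con selector are carried over from the script; the delay value and the minimal User-Agent are assumptions.

import time
import requests
from bs4 import BeautifulSoup

BASE = "http://www.zbjxs.net"   # same site as above
INDEX = BASE + "/4/4589/"       # same index page as above

session = requests.Session()    # reuse one connection for all chapter requests
session.headers.update({"User-Agent": "Mozilla/5.0"})  # minimal headers; the full dict above works too

resp = session.get(INDEX, timeout=10)
resp.raise_for_status()
resp.encoding = resp.apparent_encoding
index_soup = BeautifulSoup(resp.text, "html.parser")

with open("d:/小说.txt", "a", encoding="utf-8") as f:  # `with` closes the file even if a request fails
    for a in index_soup.select("li span a"):           # same chapter selector as above
        page = session.get(BASE + a["href"], timeout=10)
        page.raise_for_status()
        page.encoding = page.apparent_encoding
        soup = BeautifulSoup(page.text, "html.parser")
        body = soup.select(".article-con")             # same content selector as above
        if body:                                       # skip chapters whose layout differs
            f.write(a.get_text() + "\n" + body[0].get_text() + "\n")
        time.sleep(0.5)                                # assumed polite delay between requests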
Below is the same script with the commented-out experiments left in, kept for later review.
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 14 16:42:02 2019
@author: Administrator
"""
import requests
from bs4 import BeautifulSoup
import re
import time

# The header dict is the same as in the script above.
index_url = "http://www.zbjxs.net/4/4589/"
r = requests.get(index_url, headers=header)
r.raise_for_status()
r.encoding = r.apparent_encoding
index_soup = BeautifulSoup(r.text, "html.parser")

"""
# Locator experiments against the index page:
t0=index_soup.find_all("span")                                # exact: 22 matches
t1=index_soup.find_all("a")
t2=index_soup.find_all("a",href=re.compile("html$"))          # not exact: 24 matches
t3=index_soup.find_all("a",href=re.compile(r"\d{7}\.html$"))  # exact: 22 matches
t4=index_soup.find_all(href=re.compile(r"\d{7}\.html$"))      # exact: 22 matches
t5=index_soup.find_all({'href':'re.compile("d{7}")'})         # abandoned experiment: find_all does not take a dict like this
# Regex for Chinese characters: r'[\u4e00-\u9fa5]{2,4}' matches 2 to 4 Han characters
t6=index_soup.find_all("a",string=re.compile(r'[\u4e00-\u9fa5]{2,4}'))
"""

t7 = index_soup.select("li span a")  # works great: exact, 22 matches

"""
# Each group below illustrates one lookup feature.
# Find by tag name:
c0=index_soup.select("a")    # all <a> tags
c1=index_soup.select("li")   # all <li> tags
# Find by class name:
d0=index_soup.select(".home")
d1=index_soup.select(".line")
d2=index_soup.select("[class~=line]")  # equivalent to d1; in my tests this ~= form of select only works for class, not other attributes
"""

"""
Getting tags by id:

soup.select("#link1")   # select by id
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

soup.select("a#link2")  # id plus tag name, for a more specific match
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

You can also pass a comma-separated list of selectors; a tag is captured if it matches any one of them:

soup.select("#link1,#link2")   # tags whose id is link1 or link2
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

(Excerpted from SuPhoebe on CSDN: https://blog.csdn.net/u013007900/article/details/54728408)

These are useful, but the page scraped here has no id attributes. Understandably, bs4 provides
dedicated shortcuts for class and id because they are the two most commonly used attributes.
"""

# Next, the href attribute selectors actually used in this post; these are more general.
"""
Select tags by whether an attribute exists:

soup.select('a[href]')   # all <a> tags that have an href attribute
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

Select tags by a specific attribute value:

soup.select('a[href="http://example.com/elsie"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

soup.select('a[href^="http://example.com/"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select('a[href$="tillie"]')
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select('a[href*=".com/el"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

To spell these out:
soup.select('a[href^="http://example.com/"]') finds tags whose href value starts with "http://example.com/".
soup.select('a[href$="tillie"]') finds tags whose href value ends with "tillie".
soup.select('a[href*=".com/el"]') finds tags whose href value contains the string ".com/el",
so only href="http://example.com/elsie" matches.

Query only the first tag that satisfies the condition:

soup.select_one(".sister")   # first matching tag only
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

(Excerpted from SuPhoebe on CSDN: https://blog.csdn.net/u013007900/article/details/54728408)
"""

#e=index_soup.select('a[href]')   # all <a> tags that have an href attribute

# The code below uses t7.
links = []
titles = []
for i in range(len(t7)):
    links.append("http://www.zbjxs.net" + t7[i]["href"])
    titles.append(t7[i].get_text())

f = open("d:/丝袜合集.txt", "a", encoding="utf-8")
for i in range(len(links)):
    st = time.time()
    r1 = requests.get(links[i])
    r1.encoding = r1.apparent_encoding
    html = r1.text
    soup = BeautifulSoup(html, "html.parser")
    artitle = titles[i] + " " + soup.select(".article-con")[0].get_text()
    f.write(artitle)
    en = time.time()
    runtime = en - st
    print("Wrote chapter:", i + 1, "remaining:", len(links) - 1 - i, "chapters, elapsed:", runtime)
f.close()

# Leftover scraps from development, kept for reference:
"""
titles=[]
titles.append(t7[i].get_text())
"""
"""
r1=requests.get(links[i])
r1.encoding=r1.apparent_encoding
html=r1.text
soup=BeautifulSoup(html,"html.parser")
#txt=soup.find_all(string=)
#txt=titles[i]+html
"""
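To make the excerpted selector rules concrete, here is a small self-contained demo (not from the original post). The inline HTML is invented to mirror the chapter index scraped above, and it exercises the attribute selectors from the quote plus a find_all with a compiled regex, like the t3 experiment.

from bs4 import BeautifulSoup
import re

# Hypothetical snippet shaped like the chapter index scraped above.
demo_html = """
<ul>
  <li><span><a href="/4/4589/1234567.html">Chapter One</a></span></li>
  <li><span><a href="/4/4589/1234568.html">Chapter Two</a></span></li>
  <li><a href="/about">About</a></li>
</ul>
"""
demo = BeautifulSoup(demo_html, "html.parser")

print(demo.select("a[href]"))              # every <a> that has an href at all: 3 tags
print(demo.select('a[href^="/4/4589/"]'))  # href starts with /4/4589/: the 2 chapters
print(demo.select('a[href$=".html"]'))     # href ends with .html: the 2 chapters
print(demo.select('a[href*="4589"]'))      # href contains 4589: the 2 chapters
print(demo.select_one("li span a"))        # first match only, like select_one(".sister")

# find_all with a compiled regex, as in the t3/t4 experiments above:
print(demo.find_all("a", href=re.compile(r"\d{7}\.html$")))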