zoukankan      html  css  js  c++  java
  • 爬某小说网站代码

    # -*- coding: utf-8 -*-
    """
    Created on Mon Jan 14 16:42:02 2019
    
    @author: Administrator
    """
    
    # -*- coding: utf-8 -*-
    """
    Created on Mon Jan 14 13:44:35 2019
    
    @author: Administrator
    """
    
    # Scrape every chapter of one novel from www.zbjxs.net and append the
    # chapters (title + body text) to a single UTF-8 text file.
    # NOTE(review): this copy of the script was garbled by the blog export
    # (several statements were fused onto one physical line); the code below
    # restores the original statement structure without changing behavior.
    import requests
    from bs4 import BeautifulSoup
    import re
    import time


    # Browser-like request headers so the site serves the normal HTML page.
    header = {
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Accept": " text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip,deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
    }

    # Fetch the chapter index page and fail fast on an HTTP error status.
    index_url = "http://www.zbjxs.net/4/4589/"
    r = requests.get(index_url, headers=header)
    r.raise_for_status()
    # apparent_encoding guesses the charset from the page bytes so the
    # Chinese text decodes correctly (the HTTP header alone is unreliable).
    r.encoding = r.apparent_encoding
    index_html = r.text

    index_soup = BeautifulSoup(index_html, "html.parser")

    # Per the author's notes, "li span a" matches exactly the chapter anchors.
    t7 = index_soup.select("li span a")

    # Build absolute chapter URLs and the matching chapter titles.
    links = []
    titles = []
    for i in range(len(t7)):
        links.append("http://www.zbjxs.net" + t7[i]["href"])
        titles.append(t7[i].get_text())

    # Download each chapter and append it to the output file, timing each one.
    f = open("d:/小说.txt", "a", encoding="utf-8")
    for i in range(len(links)):
        st = time.time()
        r1 = requests.get(links[i])
        r1.encoding = r1.apparent_encoding
        html = r1.text
        soup = BeautifulSoup(html, "html.parser")
        # Chapter body lives in the first element with class "article-con".
        artitle = titles[i] + " " + soup.select(".article-con")[0].get_text()
        f.write(artitle)
        en = time.time()
        runtime = en - st
        print("已经写入第:", i + 1, "章,剩余:", len(links) - i - 1, "", "用时:", runtime)
    f.close()

    然后附上一些注释掉的内容,方便复习

    # -*- coding: utf-8 -*-
    """
    Created on Mon Jan 14 16:42:02 2019
    
    @author: Administrator
    """
    
    # -*- coding: utf-8 -*-
    """
    Created on Mon Jan 14 13:44:35 2019
    
    @author: Administrator
    """
    
    import requests
    from bs4 import BeautifulSoup
    import re
    import time
    
    
    # Browser-like request headers so the site serves the normal HTML page
    # instead of rejecting a bare script client.
    header ={
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Accept":" text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip,deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
        }
        
    
    
    # Fetch the novel's chapter index page; raise_for_status() aborts the
    # script on any HTTP error response.
    index_url="http://www.zbjxs.net/4/4589/"
    r=requests.get(index_url,headers=header)
    r.raise_for_status()
    # apparent_encoding guesses the charset from the page bytes so the
    # Chinese text decodes correctly (the HTTP header alone is unreliable).
    r.encoding=r.apparent_encoding
    index_html=r.text
    
    # Parsed index page; everything below queries this soup object.
    index_soup=BeautifulSoup(index_html,"html.parser")
    
    
    
    """
    
    t0=index_soup.find_all("span")         #精确 22个
    t1=index_soup.find_all("a")
    t2=index_soup.find_all("a",href=re.compile("html$"))        #不精确24个
    t3=index_soup.find_all("a",href=re.compile("d{7}.html$"))   #精确22个
    
    t4=index_soup.find_all(href=re.compile("d{7}.html$"))     #精确22
    
    t5=index_soup.find_all({'href':'re.compile("d{7}")'})
    
    
    #中文正则   ([u4e00-u9fa5]{2,4})   2-4汉字
    
    #([u4e00-u9fa5]{2,4})  
     
    t6=index_soup.find_all("a",string=re.compile('[u4e00-u9fa5]{2-4}'))
    """
    t7=index_soup.select("li span a")      # precise and convenient: matches exactly the 22 chapter links
    
    
    
    """
    #下面每两个为一个功能注解
    
    #用标签名查找
    c0=index_soup.select("a") 
    #找到所有a标签
    
    c1=index_soup.select("li") 
    #找到所有li标签
    
    #类名查找
    d0=index_soup.select(".home") 
    
    d1=index_soup.select(".line") 
    
    
    
    d2=index_soup.select("[class~=line]") 
    #d2和d0等价,select只能接受class 我测试其他参数不可以
    
    """
    """
    通过id获得标签:
    
    soup.select("#link1") #通过设置参数为id来获取该id对应的tag
    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
    
    soup.select("a#link2")  #这里区别于上一个单纯的使用id,又增添了tag属性,使查找更加具体
    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
    1
    2
    3
    4
    5
    通过设置select函数的参数为列表,来获取tags。只要匹配列表中的任意一个则就可以捕获。
    
    soup.select(“#link1,#link2”) #捕获id为link1或link2的标签
    # [<a class=”sister” href=”http://example.com/elsie” id=”link1”>Elsie</a>, 
    # <a class=”sister” href=”http://example.com/lacie” id=”link2”>Lacie</a>]
    --------------------- 
    作者:SuPhoebe 
    来源:CSDN 
    原文:https://blog.csdn.net/u013007900/article/details/54728408 
    版权声明:本文为博主原创文章,转载请附上博文链接!
    
    这些有用但是本文没有id这个属性,
    可以理解的是bs4为了class和id这两个常用的属性,专门自订的一些功能
    
    """
    
    #下面介绍本文用到的href属性 这个比较通用
    
    """
    按照标签是否存在某个属性来获取:
    
    soup.select('a[href]') #获取a标签中具有href属性的标签
    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
    1
    2
    3
    4
    通过某个标签的具体某个属性值来查找tags:
    
    soup.select('a[href="http://example.com/elsie"]')
    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
    
    soup.select('a[href^="http://example.com/"]')
    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
    
    soup.select('a[href$="tillie"]')
    # [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
    
    soup.select('a[href*=".com/el"]')
    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    这里需要解释一下: 
    soup.select(‘a[href^=”http://example.com/”]’)意思是查找href属性值是以”http://example.com/“值为开头的标签,可以查看博客介绍。 
    soup.select(‘a[href$=”tillie”]’)意思是查找href属性值是以tillie为结尾的标签。 
    soup.select(‘a[href*=”.com/el”]’)意思是查找href属性值中存在字符串”.com/el”的标签,所以只有href=”http://example.com/elsie”一个匹配。
    
    查询符合查询条件的第一个标签:
    
    soup.select_one(".sister") #只查询符合条件的第一个tag
    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
    --------------------- 
    作者:SuPhoebe 
    来源:CSDN 
    原文:https://blog.csdn.net/u013007900/article/details/54728408 
    版权声明:本文为博主原创文章,转载请附上博文链接!
    
    
    """
    #e=index_soup.select('a[href]') #获取a标签中具有href属性的标签
    
    
    #下面采用t7
    
    # Build absolute chapter URLs and the matching chapter titles from the
    # anchors selected into t7 above.
    links = []
    titles = []
    for tag in t7:
        links.append("http://www.zbjxs.net" + tag["href"])
        titles.append(tag.get_text())


    # Download each chapter and append "title\n<body>" to the output file,
    # printing progress and per-chapter timing.
    # FIX: the title/body separator was a raw line break inside a
    # single-quoted string (a syntax error introduced by the blog export);
    # restored as the "\n" escape. Also switched to `with open` so the file
    # is closed even if a request or parse raises mid-loop.
    with open("d:/丝袜合集.txt", "a", encoding="utf-8") as f:
        for i in range(len(links)):
            st = time.time()
            r1 = requests.get(links[i])
            # Guess the charset from the page bytes so Chinese text decodes.
            r1.encoding = r1.apparent_encoding
            html = r1.text
            soup = BeautifulSoup(html, "html.parser")
            # Chapter body lives in the first element with class "article-con".
            artitle = titles[i] + "\n" + soup.select(".article-con")[0].get_text()
            f.write(artitle)
            en = time.time()
            runtime = en - st
            print("已经写入第:", i + 1, "章,剩余:", len(links) - 1 - i, "", "用时:", runtime)
    
    """
    titles=[]
    
        titles.append(t7[i].get_text())
    """
    
    
    
    
    """
    
     r1=requests.get(links[i])
        r1.encoding=r1.apparent_encoding
        html=r1.text    
        soup=BeautifulSoup(html,"html.parser")
        #txt=soup.find_all(string=)
        #txt=titles[i]+html
    
    """
    # NOTE(review): dead code — this re-assignment duplicates the `header`
    # dict defined near the top of the script, and nothing after this point
    # uses it; it can be deleted.
    header ={
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Accept":" text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip,deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
        }
        

  • 相关阅读:
    ASP.NET Core2利用MassTransit集成RabbitMQ
    ASP.NET Core2集成Office Online Server(OWAS)实现办公文档的在线预览与编辑(支持wordexcelpptpdf等格式)
    ASP.NET Core2利用Jwt技术在服务端实现对客户端的身份认证
    net core System.Drawing is not supported on this platform.
    小程序开发之维护access_token
    net core 100个案例
    msgSystem.Drawing is not supported on this platform【netcore】
    wpf,前端动画demo,鱼眼效果
    自定义控件,重写 TextBox 实例
    TextBox输入法控制,进入输入框则启用或禁用输入法(ime),禁用后只能输入英文
  • 原文地址:https://www.cnblogs.com/xinqidian/p/10267853.html
Copyright © 2011-2022 走看看