zoukankan      html  css  js  c++  java
  • [Python]网络小说爬取、爬虫

    1.源代码

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # @File  : HtmlParser.py
    # @Author: 赵路仓
    # @Date  : 2020/3/27
    # @Desc  :
    # @Contact : 398333404@qq.com 
    
    import requests
    from bs4 import BeautifulSoup
    
    # Request headers: a desktop-browser User-Agent so the site serves normal pages.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    }
    # Index page of a sample novel (referenced only by the commented-out examples in __main__).
    url = 'https://www.xsbiquge.com/74_74627/'
    # Running chapter counter, incremented by single_page() for progress output.
    count = 0
    
    
    def menu(name):
        """Fetch the chapter index for the novel *name* and write every chapter
        URL, one per line, to test.txt.

        The index URL is resolved via search(); chapter links live inside the
        page's <dl> element as <a href="..."> entries.
        """
        index_url = search(name)
        r = requests.get(index_url, headers=head)
        # The site declares an ambiguous charset; trust the detected one.
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "html.parser")
        chapter = soup.find("dl")
        with open("test.txt", "w", encoding="utf-8") as f:
            # find_all("a") skips bare text nodes, so no blanket try/except
            # is needed around the child iteration.
            for a in chapter.find_all("a"):
                f.write("https://www.xsbiquge.com" + str(a.attrs['href']) + "\n")
    
    
    def single_page(url):
        """Download one chapter page and return its plain-text content.

        Returns the chapter title (from <h1>) followed by the body text of
        <div id="content">, with the site's <br/> markup normalised to blank
        lines. Also bumps the module-level `count` for progress reporting.
        """
        global count
        r = requests.get(url, headers=head)
        # The site declares an ambiguous charset; trust the detected one.
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "html.parser")
        h1 = soup.find("h1")
        content_div = soup.find("div", {"id": "content"})
        # Strip the wrapper tags and turn runs of <br/> into paragraph breaks.
        body = (
            str(content_div)
            .replace('<div id="content">', "    ")
            .replace("<br/><br/><br/><br/>", "\n\n")
            .replace("<br/><br/>", "\n\n")
            .replace("</div>", "")
        )
        text = str(h1.string) + "\n" + body + "\n"
        count += 1
        print("当前第%d章" % (count))  # progress: current chapter number
        return text
    
    
    def content(path, name):
        """Crawl the whole novel *name* and save it as <path><name>.txt.

        menu() first writes the chapter URLs to test.txt; each URL is then
        fetched via single_page() and appended to the output file, which
        starts with the novel title as a header.
        """
        menu(name)
        # One write-mode handle replaces the original write-then-reopen-append
        # dance; both handles are closed automatically via `with`.
        with open(path + name + ".txt", "w", encoding="utf-8") as fw, \
                open("test.txt", "r", encoding="utf-8") as f:
            fw.write(name + "\n\n")
            for line in f:
                fw.write(single_page(line.rstrip("\n")))
    
    
    def search(name):
        """Search the site for *name* and return the URL of the exact-title hit.

        Prints every result's title and href as it scans. Returns None when
        no result title equals *name* exactly.
        """
        # `params=` lets requests URL-encode the (typically Chinese) keyword
        # instead of appending it raw to the query string.
        r = requests.get(
            "https://www.xsbiquge.com/search.php",
            params={"keyword": name},
            headers=head,
        )
        # Consistent with menu()/single_page(): trust the detected charset.
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "html.parser")
        for t in soup.find_all("a", {"cpos": "title"}):
            print(t.attrs['title'])
            print(t.attrs['href'])
            if t.attrs['title'] == name:
                return t.attrs['href']
        return None
    
    
    if __name__ == "__main__":
        # Example invocations kept for reference:
        # print(single_page("https://www.xsbiquge.com/74_74627/3845841.html"))
        # menu("E:/test.txt",url)
        # content("E:/", "万古大帝")
        # search("斗破苍穹")
        # Crawl the novel and save it to E:/斗破苍穹.txt.
        content("E:/", "斗破苍穹")

    爬取结果:

  • 相关阅读:
    Java线程的几种状态
    常用几种Java Web容器
    数据库触发器
    SQL优化及注意事项
    Oracle中rownum和rowid的区别
    数据库及SQL优化
    如何安装使用Impala
    Impala:新一代开源大数据分析引擎
    开源大数据查询分析引擎
    Google Dremel 原理
  • 原文地址:https://www.cnblogs.com/zlc364624/p/12874010.html
Copyright © 2011-2022 走看看