  • Using a crawler to batch-download every novel on a site and save them automatically

    The goal is to download the novels in every category of a site, automatically creating a directory for each category and saving each novel as a .txt file named after the novel.

    1. Crawling approach:

      My approach: find a novel site through Baidu, open one chapter page of a novel, and check with requests and BeautifulSoup whether the page can be downloaded and parsed correctly (a minimal test is sketched below). If that works, move on to the next step.
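
      A minimal version of that test might look like the sketch below (the chapter URL and CSS selectors are the ones used later in this post; any readable chapter page would do):

    # minimal sanity check: can we fetch and parse one chapter page?
    import requests
    from bs4 import BeautifulSoup
    
    test_url = 'http://www.fhxiaoshuo.com/read/3/3414/6127874.shtml'  # sample chapter page
    resp = requests.get(test_url, timeout=10)
    html = resp.text.encode('ISO-8859-1').decode('GB18030')  # fix the page encoding (see step 2)
    soup = BeautifulSoup(html, 'lxml')
    print(soup.select('div.zhangjieming > h1')[0].text)    # chapter title
    print(soup.select('div.zhangjieTXT')[0].text[:100])    # first 100 characters of the body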

    2. Steps:

         1. Import the modules and specify the request headers:

    from bs4 import BeautifulSoup
    import requests
    import time
    import os
    import random
    
    my_headers = [
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)"
    ]
    
    headers = {
        'User-Agent': random.choice(my_headers)  # pick a random User-Agent to mimic different browsers
    }
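
    Note that random.choice(my_headers) runs once, at import time, so every request in a run reuses the same User-Agent. If you want to rotate the User-Agent per request, a small helper (hypothetical, not part of the original script) could replace the module-level headers dict:

    def pick_headers():
        # build a fresh headers dict with a randomly chosen User-Agent
        return {'User-Agent': random.choice(my_headers)}
    
    # usage: requests.get(url, headers=pick_headers())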

      2. Fetch one chapter page of a novel and write it to the target path:

    # test with one chapter page first
    url = 'http://www.fhxiaoshuo.com/read/3/3414/6127874.shtml'
    data = requests.get(url, headers=headers)
    time.sleep(2)
    # note the .encode('ISO-8859-1').decode('GB18030') trick to recover the Chinese text
    soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'), 'lxml')
    text = soup.select('div.zhangjieTXT')[0].text
    title2 = soup.select('div.zhangjieming > h1')[0].text
    ls = []
    for i in text:
        # drop any character that appears in this filter string (whitespace, ad snippets, decorations)
        if i in "'\n','ads_wz_2();','\xa0','\t','\n','“','\t','■', '◆', 'n', '■', '◆', 'h', 'u', '■', '◆', 'b', ',', '∧', 'n', '♀', '※',":
            continue
        ls.append(i)
    text = ''.join(ls)
    print('正在下载{}'.format(title2))
    # the .\books folder must already exist; 'title1' is just a placeholder file name for this test
    with open('.\\books\\' + 'title1' + '.txt', 'ab+') as f:
        f.write((title2 + '\n').encode())  # write the chapter title
        f.write(text.encode())             # write the chapter body
        f.write('\n\n'.encode())           # write a blank line between chapters
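
    A quick note on the .encode('ISO-8859-1').decode('GB18030') step: the site apparently does not declare its charset in the HTTP headers, so requests falls back to ISO-8859-1 and data.text comes out garbled. Re-encoding that text back to the raw bytes and decoding them as GB18030 recovers the Chinese characters. An equivalent and arguably cleaner approach (assuming the pages really are GBK/GB18030) is to set the encoding on the response before touching .text:

    data = requests.get(url, headers=headers)
    data.encoding = 'GB18030'  # tell requests the real page encoding
    soup = BeautifulSoup(data.text, 'lxml')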

      3. Get all the chapter links of a novel:

    def get_urls(url,fenlei_title):
        #url = 'http://www.fhxiaoshuo.com/read/3/3414/'
        data = requests.get(url,headers=headers)
        time.sleep(2)
        soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'),'lxml')
        title1 = soup.select('div#maininfo > div > h1')[0].text
        if not os.path.exists('.\\books\\' + fenlei_title + '\\' + title1):
            os.mkdir('.\\books\\' + fenlei_title + '\\' + title1)
        links = soup.select('div#list > dl')
        print("正在下载{}".format(title1))
        #ls = []
        for i in links:
            data = i.select('dd > a')
            time.sleep(2)
            for m in data:
                url = m.get('href')
                #ls.append(ls)
                get_text(url,title1,fenlei_title)
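
    The Windows-only paths built by concatenating '.\\books\\' strings are fragile, and os.mkdir fails when the parent .\books folder does not exist yet. A more robust sketch (not the original code) uses os.path.join and os.makedirs, which also works on Linux:

    book_dir = os.path.join('books', fenlei_title, title1)  # e.g. books/武侠/<novel name>
    os.makedirs(book_dir, exist_ok=True)                    # creates missing parent folders as well
    file_path = os.path.join(book_dir, title1 + '.txt')     # get_text would then write here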

     4. Get every novel in one category (for example the wuxia category):

    def get_list(url,fenlei_title):
        #url = 'http://www.fhxiaoshuo.com/sort/5/1/'
        data = requests.get(url,headers=headers)
        time.sleep(1)
        soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'),'lxml')
        links = soup.select('div#alist')
        for i in links:
            data = i.select('div.info > div.title > h2 > a')
            for m in data:
                url = m.get('href')
                time.sleep(3)
                title = m.text
                get_urls(url,fenlei_title)

      5. Get all the category links from the home page:

    def get_fenlei():
        url = 'http://www.fhxiaoshuo.com/'
        data = requests.get(url,headers=headers)
        time.sleep(0.5)
        soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'),'lxml')
        links = soup.select('div.nav1 > ul')
        for i in links:
            data = i.select('li > a')
            for m in data:
                url = m.get('href')
                time.sleep(1)
                fenlei_title = m.text
                if not os.path.exists('.\\books\\' + fenlei_title):
                    os.mkdir('.\\books\\' + fenlei_title)
                    get_list(url, fenlei_title)  # only crawl categories whose folder does not exist yet

    3. The full code (time.sleep() is used throughout to pace the requests):

    #!/usr/bin/env python
    # -*- coding:utf-8 -*- 
    #Author: ss
    
    
    from bs4 import BeautifulSoup
    import requests
    import time
    import os
    import random
    
    my_headers = [
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0"
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)"
    
        ]
    
    headers = {
        'User-Agent':random.choice(my_headers)
    }
    
    def get_text(url,title1,fenlei_title):
        #url = 'http://www.fhxiaoshuo.com/read/3/3414/6127874.shtml'
        data = requests.get(url,headers=headers)
        time.sleep(2)
        soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'),'lxml')
        text = soup.select('div.zhangjieTXT')[0].text
        title2 = soup.select('div.zhangjieming > h1')[0].text
        ls = []
        for i in text:
            if i in "'
    ','ads_wz_2();','xa0','	','
    ','“','	','■', '◆', 'n', '■', '◆', 'h', 'u', '■', '◆', 'b', ',', '∧', 'n', '♀', '※',":
                continue
            ls.append(i)
        text =''.join(ls)
        print('正在下载{}'.format(title2))
        with open('.\\books\\' + fenlei_title + '\\' + title1 + '\\' + title1 + '.txt', 'ab+') as f:
            f.write((title2 + '\n').encode())  # write the chapter title
            f.write(text.encode())             # write the chapter body
            f.write('\n\n'.encode())           # write a blank line between chapters
    
    
    def get_urls(url,fenlei_title):
        #url = 'http://www.fhxiaoshuo.com/read/3/3414/'
        data = requests.get(url,headers=headers)
        time.sleep(2)
        soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'),'lxml')
        title1 = soup.select('div#maininfo > div > h1')[0].text
        if not os.path.exists('.\\books\\' + fenlei_title + '\\' + title1):
            os.mkdir('.\\books\\' + fenlei_title + '\\' + title1)
        links = soup.select('div#list > dl')
        print("正在下载{}".format(title1))
        #ls = []
        for i in links:
            data = i.select('dd > a')
            time.sleep(2)
            for m in data:
                url = m.get('href')
                #ls.append(ls)
                get_text(url,title1,fenlei_title)
    
    def get_list(url,fenlei_title):
        #url = 'http://www.fhxiaoshuo.com/sort/5/1/'
        data = requests.get(url,headers=headers)
        time.sleep(1)
        soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'),'lxml')
        links = soup.select('div#alist')
        for i in links:
            data = i.select('div.info > div.title > h2 > a')
            for m in data:
                url = m.get('href')
                time.sleep(3)
                title = m.text
                get_urls(url,fenlei_title)
    
    def get_fenlei():
        url = 'http://www.fhxiaoshuo.com/'
        data = requests.get(url,headers=headers)
        time.sleep(0.5)
        soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'),'lxml')
        links = soup.select('div.nav1 > ul')
        for i in links:
            data = i.select('li > a')
            for m in data:
                url = m.get('href')
                time.sleep(1)
                fenlei_title = m.text
                if not os.path.exists('.\\books\\' + fenlei_title):
                    os.mkdir('.\\books\\' + fenlei_title)
                    get_list(url, fenlei_title)  # only crawl categories whose folder does not exist yet
    
    get_fenlei()
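
    One practical caveat: as written, a single failed request (timeout, missing chapter, connection reset) raises an exception and stops the whole crawl. A simple retry wrapper around requests.get (a sketch, not part of the original script) makes long runs more robust; each data = requests.get(...) call above could then be replaced with data = fetch(url), plus a check for None before parsing:

    def fetch(url, retries=3):
        # fetch a page with a few retries; return None if it keeps failing
        for attempt in range(retries):
            try:
                resp = requests.get(url, headers=headers, timeout=10)
                resp.raise_for_status()
                return resp
            except requests.RequestException as e:
                print('request failed ({}), retrying: {}'.format(e, url))
                time.sleep(2)
        return None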

