zoukankan      html  css  js  c++  java
  • 爬虫批量自动下载小说

    下载排行榜的所有小说

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: ss
     4 
import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
     9 
    10 headers = {
    11     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
    12 }
    13 
    14 def get_text(url,title1):
    15     #url = 'https://www.xxbiquge.com/0_36/8840634.html'
    16     data = requests.get(url,headers=headers)
    17     time.sleep(0.5)
    18     soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('utf-8'),'lxml')
    19     text = soup.select('div.content_read > div > div#content')[0].text
    20     title2 = soup.select('div.content_read > div > div.bookname > h1')[0].text
    21     ls = []
    22     for i in text:
    23         if i in "' 
    ','xa0','readx();'":
    24             continue
    25         else:
    26             ls.append(i)
    27     text = ''.join(ls)
    28     with open('.\books\' + title1 + '.txt','ab+') as f:
    29         f.write((title1 + '
    ').encode())
    30         #f.write('
    '.encode())
    31         f.write(text.encode())
    32         f.write('
    
    '.encode())
    33     print('正在下载{}'.format(title2))
    34 
    35 def get_one_links(url):
    36     #url = 'https://www.xxbiquge.com/0_36/'
    37     data = requests.get(url, headers=headers)
    38     soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('utf-8'), 'lxml')
    39     links = soup.select('div#list > dl > dd')
    40     title = soup.select('div#maininfo > div#info > h1')[0].text
    41     print('开始下载{}'.format(title))
    42     for i in links:
    43         data = i.select('a')
    44         for m in data:
    45             url = 'https://www.xxbiquge.com' + m.get('href')
    46             get_text(url,title)
    47 
    48 def get_all():
    49     url = 'https://www.xxbiquge.com/xbqgph.html'
    50     data = requests.get(url,headers=headers)
    51     time.sleep(0.5)
    52     soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('utf-8'),'lxml')
    53     links = soup.select('div.novelslist2 > ul > li')
    54     for i in links:
    55         data = i.select('span.s2 > a')
    56         for m in data:
    57             url = 'https://www.xxbiquge.com' + data[0].get('href')
    58             get_one_links(url)
    59 
    60 if not os.path.exists('.\books'):
    61     os.mkdir('.\books')
    62 get_all()
  • 相关阅读:
    判断当前是否运行于Design Mode
    从Setting.settings到Resource.resx
    构造函数强制使用new
    getFullYear 方法
    前端开发中经常使用到的20个正则表达式。
    函数调用模式
    javascript中return的作用
    javascript数组遍历for与for in区别详解
    闭包
    js split 的用法和定义 js split分割字符串成数组的实例代码
  • 原文地址:https://www.cnblogs.com/ssxsy/p/9036635.html
Copyright © 2011-2022 走看看