zoukankan html css js c++ java

爬虫批量自动下载小说

下载排行榜的所有小说

 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*- 
 3 #Author: ss
 4 
 5 from bs4 import BeautifulSoup
 6 import requests
 7 import time
 8 import os
 9 
# HTTP headers sent with every request; the User-Agent string identifies the
# client as a desktop Chrome browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}
13 
def get_text(url, title1):
    """Download one chapter page and append its text to the novel's file.

    The chapter body is read from ``div#content`` and appended to
    ``books/<title1>.txt`` as: a heading line containing ``title1``, the
    cleaned chapter text, then a blank line.

    Args:
        url: Absolute URL of the chapter page.
        title1: Novel title; used for the output file name and the heading
            line written before the chapter text.

    Raises:
        IndexError: If the page has no matching content or chapter-title
            element.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    data = requests.get(url, headers=headers)
    time.sleep(0.5)  # throttle: wait half a second after each request

    # Decode the raw bytes as UTF-8 directly. Round-tripping ``data.text``
    # through ISO-8859-1 only works when requests guessed ISO-8859-1 and
    # raises UnicodeEncodeError otherwise.
    soup = BeautifulSoup(data.content.decode('utf-8'), 'lxml')
    text = soup.select('div.content_read > div > div#content')[0].text
    title2 = soup.select('div.content_read > div > div.bookname > h1')[0].text

    # Strip whole junk tokens. A per-character membership test against the
    # joined token string would also delete ordinary letters, digits and
    # punctuation that happen to appear in those tokens.
    for junk in ('\n', '\xa0', 'readx();'):
        text = text.replace(junk, '')

    # Make sure the output directory exists even when this function is used
    # on its own.
    os.makedirs('books', exist_ok=True)
    # NOTE(review): the heading written per chapter is the novel title
    # (title1), not the chapter title (title2) — confirm this is intended.
    with open(os.path.join('books', title1 + '.txt'), 'ab') as f:
        f.write((title1 + '\n').encode())
        f.write(text.encode())
        f.write('\n\n'.encode())
    print('正在下载{}'.format(title2))
34 
def get_one_links(url):
    """Download every chapter linked from a novel's index page.

    Reads the novel title from ``div#maininfo > div#info > h1`` and calls
    ``get_text`` once for each anchor found under ``div#list > dl > dd``,
    in page order.

    Args:
        url: Absolute URL of the novel's chapter-index page.

    Raises:
        IndexError: If the page has no matching title element.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    response = requests.get(url, headers=headers)
    # Decode the raw bytes as UTF-8 directly; re-encoding ``response.text``
    # as ISO-8859-1 fails whenever requests detected a different encoding.
    soup = BeautifulSoup(response.content.decode('utf-8'), 'lxml')
    entries = soup.select('div#list > dl > dd')
    title = soup.select('div#maininfo > div#info > h1')[0].text
    print('开始下载{}'.format(title))
    for entry in entries:
        for anchor in entry.select('a'):
            href = anchor.get('href')
            if not href:
                # An anchor without an href cannot be turned into a URL.
                continue
            # hrefs are concatenated onto the site root, so they are
            # expected to be site-relative paths.
            get_text('https://www.xxbiquge.com' + href, title)
47 
def get_all():
    """Download every novel listed on the site's ranking page.

    Fetches ``/xbqgph.html``, finds the novel link in each
    ``div.novelslist2 > ul > li`` row (``span.s2 > a``) and passes each
    novel's index URL to ``get_one_links``.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    ranking_url = 'https://www.xxbiquge.com/xbqgph.html'
    response = requests.get(ranking_url, headers=headers)
    time.sleep(0.5)  # throttle: wait half a second after the request
    # Decode the raw bytes as UTF-8 directly; re-encoding ``response.text``
    # as ISO-8859-1 fails whenever requests detected a different encoding.
    soup = BeautifulSoup(response.content.decode('utf-8'), 'lxml')
    for row in soup.select('div.novelslist2 > ul > li'):
        for anchor in row.select('span.s2 > a'):
            # Use the anchor being iterated; always taking the first match
            # would re-download the same novel once per anchor in the row.
            href = anchor.get('href')
            if not href:
                continue
            get_one_links('https://www.xxbiquge.com' + href)
59 
# Create the output directory (no error if it already exists), then crawl the
# whole ranking list. A plain relative name is used because '\b' inside a
# normal string literal is the backspace escape, not a path separator
# followed by "b".
os.makedirs('books', exist_ok=True)
get_all()

查看全文

相关阅读:
remote: You are not allowed to push code to this project
Ubuntu 查看本机的ip
git跟踪远程分支，查看本地分支追踪和远程分支的关系
 edgedb 基本试用
 influxdb 全家桶运行
 Introducing Outflux: a smart way out of InfluxDB
使用outflux 导入influxdb 的数据到timescaledb
edgedb 强大的对象关系数据库
 Announcing the Operate Preview Release: Monitoring and Managing Cross-Microservice Workflows
goaccess iis w3c 自定义log 格式参考

原文地址：https://www.cnblogs.com/ssxsy/p/9036635.html