Before running the script, install the third-party Python libraries it depends on: BeautifulSoup (the bs4 package) and pymysql. The code also parses pages with the lxml parser, so lxml must be installed as well.
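For example, with pip (these are the standard PyPI package names):

pip install beautifulsoup4 pymysql lxml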
The code stores the scraped data in a MySQL database.
The novel site the code scrapes is: http://www.kbiquge.com
The MySQL table definitions:
CREATE TABLE `story` (
  `id` varchar(200) NOT NULL DEFAULT '',
  `name` varchar(200) DEFAULT NULL COMMENT 'novel name',
  `start` varchar(20) DEFAULT NULL COMMENT 'status',
  `end_start` varchar(200) DEFAULT NULL COMMENT 'last update time',
  `author` varchar(200) DEFAULT NULL COMMENT 'author',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

CREATE TABLE `chapter` (
  `chapter_id` varchar(200) NOT NULL DEFAULT '0' COMMENT 'chapter ID',
  `story_id` varchar(200) DEFAULT NULL COMMENT 'novel ID',
  `chapter_name` varchar(200) DEFAULT NULL COMMENT 'chapter title',
  `chapter_content` mediumtext COMMENT 'chapter text',
  `chapter_href` varchar(2000) DEFAULT NULL COMMENT 'URL',
  PRIMARY KEY (`chapter_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
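To read the result back after a crawl, a query like the following works against this schema (a sketch; the join key chapter.story_id = story.id follows from the tables above, and the title in the WHERE clause is a placeholder):

SELECT s.name, c.chapter_name, c.chapter_href
FROM story s
JOIN chapter c ON c.story_id = s.id
WHERE s.name = 'some novel title'  -- placeholder title
ORDER BY c.chapter_id;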
The full source code:
# coding=utf-8
import time
import uuid

import pymysql
from urllib import request
from bs4 import BeautifulSoup
from bs4.element import Tag


# Batch-insert rows into the chapter table; each item in usersvalues is a tuple of
# (chapter_id, story_id, chapter_name, chapter_content, chapter_href)
def Write_info(usersvalues):
    db = pymysql.connect(host="localhost", user="root",
                         password="123456", database="python")
    cursor = db.cursor()
    try:
        sql = ("INSERT INTO chapter(chapter_id,story_id,chapter_name,chapter_content,chapter_href) "
               "VALUES(%s,%s,%s,%s,%s)")
        # Run the statement once per tuple (batch insert)
        cursor.executemany(sql, usersvalues)
        db.commit()
    except Exception as e:
        print("Error: unable to insert data:", e)
        db.rollback()
    db.close()


# Look up a novel by name in the story table; insert it if it is not there yet.
# Returns the story id either way.
def Story_name(story_name):
    db = pymysql.connect(host="localhost", user="root",
                         password="123456", database="python")
    uuids = str(uuid.uuid1()).replace('-', '')
    cursor = db.cursor()
    try:
        # Parameterized query avoids SQL injection from the scraped title
        cursor.execute("SELECT id FROM story WHERE name=%s", (story_name,))
        row = cursor.fetchone()
        if row is None:
            sql = ("INSERT INTO story(id, name, start, end_start, author) "
                   "VALUES (%s, %s, '1', '1', 'wangyh')")
            cursor.execute(sql, (uuids, story_name))
            db.commit()
            return uuids
        return row[0]
    except Exception as e:
        print("Error: unable to fetch data:", e)
        db.rollback()
    finally:
        db.close()


if __name__ == '__main__':
    # Table-of-contents page of the novel
    url_xs = 'http://www.kbiquge.com'
    url = url_xs + '/86_86683/'
    head = {'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) '
                          'AppleWebKit/535.19 (KHTML, like Gecko) '
                          'Chrome/18.0.1025.166 Safari/535.19'}
    req = request.Request(url, headers=head)
    response = request.urlopen(req)
    html = response.read()
    # Parse the table-of-contents page
    soup = BeautifulSoup(html, 'lxml')
    # The novel title is the <h1> inside <div id="info">
    story_name = soup.find('div', id='info').find('h1').text
    # Get the story id, inserting a new story row if this title is not stored yet
    story_id = Story_name(story_name)
    print("story_id:" + story_id)
    # <div id="list"> holds the chapter catalogue
    soup_texts = soup.find('div', id='list')
    usersvalues = []
    # Walk the children of the <dl>, collecting each chapter title and link
    for link in soup_texts.dl.children:
        if isinstance(link, Tag):  # skip the whitespace text nodes between tags
            print('start')
            for a in link.find_all('a'):
                # Wait 0.5 s between requests to go easy on the server
                time.sleep(0.5)
                download_url = url_xs + a.get('href')
                download_req = request.Request(download_url, headers=head)
                download_response = request.urlopen(download_req)
                download_html = download_response.read()
                download_soup = BeautifulSoup(download_html, 'lxml')
                # The chapter body lives in <div id="content">
                download_soup_texts = download_soup.find('div', id='content').text
                download_soup_texts = download_soup_texts.replace(u'\xa0', u' ')
                # Millisecond timestamp as chapter id; unique thanks to the 0.5 s sleep
                uuids = "w" + str(int(round(time.time() * 1000)))
                data = (uuids, story_id, a.text, download_soup_texts, download_url)
                usersvalues.append(data)
    # One batch insert for all chapters
    Write_info(usersvalues)
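A quick way to verify the crawl worked is to read one chapter back out of the database. This is a minimal sketch that reuses the connection settings from the script above:

import pymysql

db = pymysql.connect(host="localhost", user="root",
                     password="123456", database="python")
cursor = db.cursor()
cursor.execute("SELECT chapter_name, chapter_content FROM chapter LIMIT 1")
row = cursor.fetchone()
if row:
    # Print the chapter title and the first 200 characters of its text
    print(row[0])
    print(row[1][:200])
db.close()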