  • Fetching novel content with Python

    Before running the script, install the third-party Python libraries it depends on: BeautifulSoup (the beautifulsoup4 package), pymysql, and lxml (used as the HTML parser).
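
    To confirm the dependencies are importable before running the script (the PyPI package names are beautifulsoup4, pymysql, and lxml, assuming you install with pip):

    # Install with: pip install beautifulsoup4 pymysql lxml
    import bs4
    import pymysql
    import lxml

    print("dependencies OK, bs4", bs4.__version__)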

     The code stores its results in a MySQL database.

     The novel site the code scrapes is: http://www.kbiquge.com

     The MySQL table schemas:

    CREATE TABLE `story` (
      `id` varchar(200) NOT NULL DEFAULT '',
      `name` varchar(200) DEFAULT NULL COMMENT 'title',
      `start` varchar(20) DEFAULT NULL COMMENT 'status',
      `end_start` varchar(200) DEFAULT NULL COMMENT 'last updated',
      `author` varchar(200) DEFAULT NULL COMMENT 'author',
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    
    
    CREATE TABLE `chapter` (
      `chapter_id` varchar(200) NOT NULL DEFAULT '0' COMMENT 'chapter ID',
      `story_id` varchar(200) DEFAULT NULL COMMENT 'story ID',
      `chapter_name` varchar(200) DEFAULT NULL COMMENT 'chapter title',
      `chapter_content` mediumtext COMMENT 'content',
      `chapter_href` varchar(2000) DEFAULT NULL COMMENT 'URL',
      PRIMARY KEY (`chapter_id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
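
    The two tables are linked through chapter.story_id, which holds the id of the owning story row. As a minimal sketch of reading the data back (assuming the same connection settings the script below hard-codes: localhost, root, 123456, database python), a join looks like this:

    import pymysql

    # Sketch: list each stored story together with its chapter titles,
    # joining on chapter.story_id = story.id.
    db = pymysql.connect(host="localhost", user="root",
                         password="123456", database="python", charset="utf8")
    try:
        with db.cursor() as cursor:
            cursor.execute(
                "SELECT s.name, c.chapter_name, c.chapter_href "
                "FROM story s JOIN chapter c ON c.story_id = s.id "
                "ORDER BY s.name")
            for story, chapter, href in cursor.fetchall():
                print(story, chapter, href)
    finally:
        db.close()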

    The source code:

    #coding=utf-8
    import pymysql
    import time
    import uuid

    from urllib import request
    from bs4 import BeautifulSoup


    # Batch-insert rows into the chapter table; usersvalues is a list of
    # (chapter_id, story_id, chapter_name, chapter_content, chapter_href) tuples.
    def Write_info(usersvalues):
        db = pymysql.connect(host="localhost", user="root", password="123456", database="python")
        cursor = db.cursor()
        try:
            sql = ("INSERT INTO chapter(chapter_id,story_id,chapter_name,chapter_content,chapter_href) "
                   "VALUES(%s,%s,%s,%s,%s)")
            # Execute the SQL statement, inserting all rows in one batch
            cursor.executemany(sql, usersvalues)
            db.commit()
        except pymysql.Error:
            print("Error: unable to insert data")
            db.rollback()
        db.close()

    # Look up the story by name (story_name); insert it if missing and return its id
    def Story_name(story_name):
        db = pymysql.connect(host="localhost", user="root", password="123456", database="python")
        uuids = str(uuid.uuid1()).replace('-', '')
        cursor = db.cursor()
        try:
            cursor.execute("select id from story where name=%s", (story_name,))
            fname = ""
            results = cursor.fetchall()
            for row in results:
                fname = row[0]
            if cursor.rowcount != 1:
                sql = ("INSERT INTO STORY(id,name, start, end_start,author) "
                       "VALUES (%s,%s,'1','1','wangyh')")
                cursor.execute(sql, (uuids, story_name))
                db.commit()
                return uuids
            else:
                return fname
        except pymysql.Error:
            print("Error: unable to fetch data")
            db.rollback()
        db.close()


    if __name__ == '__main__':
        # Table-of-contents page
        url_xs = 'http://www.kbiquge.com'
        url = url_xs + '/86_86683/'
        head = {}
        head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19'
        req = request.Request(url, headers=head)
        response = request.urlopen(req)
        html = response.read()
        # Parse the table-of-contents page
        soup = BeautifulSoup(html, 'lxml')
        # The story title sits in <div id="info"><h1>
        story_name = soup.find('div', id='info').find("h1").text
        # Make sure the story exists in the story table; story_id is its id
        story_id = Story_name(story_name)
        print("story_id:" + story_id)
        # <div id="list"> holds the chapter index
        soup_texts = soup.find('div', id='list')
        usersvalues = []
        # Walk the children of the <dl>, collecting chapter titles and links
        for link in soup_texts.dl.children:
            if link != '\n':
                print('start')
                list_tmp = link.find_all('a')
                for a in list_tmp:
                    # Pause 0.5 s between requests
                    time.sleep(0.5)
                    download_url = url_xs + a.get('href')
                    download_req = request.Request(download_url, headers=head)
                    download_response = request.urlopen(download_req)
                    download_html = download_response.read()
                    download_soup = BeautifulSoup(download_html, 'lxml')
                    download_soup_texts = download_soup.find('div', id='content')
                    download_soup_texts = download_soup_texts.text
                    # Replace non-breaking spaces with ordinary spaces
                    download_soup_texts = download_soup_texts.replace(u'\xa0', u' ')
                    # Millisecond timestamp as the chapter id; the 0.5 s delay keeps it unique
                    uuids = "w" + str(int(round(time.time() * 1000)))
                    data = (uuids, story_id, a.text, download_soup_texts, download_url)
                    usersvalues.append(data)
        Write_info(usersvalues)
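
    Note that the script buffers every chapter in memory and writes them all in one executemany batch at the end, so an interruption midway loses the downloaded chapters. A minimal read-back check after a run (same assumed connection settings) to confirm the rows were stored:

    import pymysql

    # Sketch: count stored chapters per story to verify the batch insert landed.
    db = pymysql.connect(host="localhost", user="root",
                         password="123456", database="python", charset="utf8")
    try:
        with db.cursor() as cursor:
            cursor.execute("SELECT story_id, COUNT(*) FROM chapter GROUP BY story_id")
            for story_id, n in cursor.fetchall():
                print(story_id, "chapters:", n)
    finally:
        db.close()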
  • Original article: https://www.cnblogs.com/heyy520/p/9835303.html