zoukankan      html  css  js  c++  java
  • 用selenium 自动爬取某一本小说章节及其内容,并存入数据库中

     1 from selenium import webdriver
     2 import pymysql
     3 from selenium.webdriver.support.ui import WebDriverWait     # 等待
     4 from selenium.webdriver.support import expected_conditions as ec # 等待条件
     5 from selenium.webdriver.common.by import By
     6 import html
     7 import _thread
     8 from selenium.webdriver.chrome.options import Options
     9 
    10 def ceil(x, y):
    11     if x % y == 0:              # 相除后为整数
    12         return int(x / y)
    13     else:                       # 相除有小数
    14         return int(x / y) + 1
    15 
    16 
    17 
    18 
    19 # 创建一个浏览器
    20 chrome_options = Options()
    21 chrome_options.add_argument('--headless')
    22 dr = webdriver.Chrome(chrome_options=chrome_options)
    23 
    24 # 设置访问的网站
    25 dr.get('https://doupocangqiong1.com/1/list_piaotian/')
    26 
    27 # 获取所有的a标签
    28 a = dr.find_elements_by_css_selector('.dirlist > li > a')
    29 
    30 # 连接数据库
    31 db = pymysql.connect("localhost", "root", "root", "selenium", charset='utf8')
    32 # 获取游标
    33 cursor = db.cursor()
    34 
    35 for i in a:
    36     name = i.text
    37     href = i.get_attribute('href')
    38     sql = "INSERT INTO novel (name,href,content) VALUES ('%s','%s','%s')"%(name,href,'')
    39     cursor.execute(sql)         # 使用execute方法执行SQL语句
    40     db.commit()
    41 dr.close()          # 关闭浏览器
    42 
    43 
    44 
    45 def line(lineName, start, count):
    46     dr = webdriver.Chrome(chrome_options=chrome_options)         # 创建一个浏览器
    47     # 连接数据库
    48     db = pymysql.connect("localhost", "root", "root", "selenium", charset='utf8')
    49     # 获取游标
    50     cursor = db.cursor()
    51 
    52     sql = "SELECT id,href FROM novel LIMIT %s, %s"%(start, count)
    53     cursor.execute(sql)  # 使用execute方法执行SQL语句
    54     data = cursor.fetchall()  # 使用 fetchall() 方法获取所有数据
    55     for i in data:
    56         dr.get(i[1])
    57         # 放置等待
    58         WebDriverWait(dr, 5, 0.1).until_not(ec.text_to_be_present_in_element((By.CSS_SELECTOR, '#chaptercontent'),
    59                                                                             U'正在转码,请稍后......'))  # 等待dr浏览器10秒钟,每0.1秒钟问一次
    60         content = html.escape(dr.find_element_by_css_selector('#chaptercontent').text)
    61         # escape()将特殊字符转为特殊的编码格式,unescape()将编码格式转回特殊字符
    62         sql = "UPDATE novel SET content = '%s' WHERE id = %s" % (content, i[0])
    63         cursor.execute(sql)  # 使用execute方法执行SQL语句
    64         db.commit()
    65         print(lineName, '完成了', i[0], '的采集')
    66     dr.close()          # 关闭窗口
    67     dr.quit()           # 关闭浏览器
    68     cursor.close()
    69     db.close()
    70     print(lineName, '完成了采集')
    71 
    72 
    73 def productLine(func, total, lineCount):
    74     every = ceil(total[0][0], lineCount)
    75     print('every', every)
    76     for i in range(lineCount):
    77         print('-------------', i)
    78         print(_thread.start_new_thread(func, ('line-' + str(i) + '', i * every, every)))
    79 
    80 
    81 try:
    82     sql = 'SELECT COUNT(*) FROM novel'
    83     cursor.execute(sql)  # 使用execute方法执行SQL语句
    84     total = cursor.fetchall()  # 使用 fetchall() 方法获取所有数据
    85     print(total)
    86 
    87     productLine(line, total, 5)
    88 
    89 except:
    90     print ("Error: unable to start thread")
    91 
    92 
    93 while 1:
    94    pass
    View Code
  • 相关阅读:
    adb命令之adb install
    GNU make and Makefile
    Makefile经典教程(掌握这些足够)
    Android.mk简介
    PhoneFactory.getDefaultPhone must be called from Looper thread
    Android源码目录结构
    软件部通用技术类网站名录
    β测试
    α测试
    白盒测试
  • 原文地址:https://www.cnblogs.com/SakuraYuanYuan/p/11051343.html
Copyright © 2011-2022 走看看