zoukankan      html  css  js  c++  java
  • python学习之小说爬虫

     1 # coding:utf8
     2 from multiprocessing.dummy import Pool as ThreadPool
     3 import multiprocessing
     4 import requests, os, codecs, time
     5 from lxml import etree
     6 
     7 url = 'https://www.biquge5200.cc/79_79883/'  # 要下载的小说章节列表页面url
     8 
     9 
    def getsource(url, timeout=30):
        """Download ``url`` and return its body decoded as GBK.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up, so that a
                stalled connection cannot block a worker thread indefinitely.

        Returns:
            The page text, or ``None`` when the request fails. A notice is
            printed in that case and the caller has to handle the missing page.
        """
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Best-effort download: report the failure and hand back None rather
            # than aborting. Only network/HTTP errors are caught, so Ctrl-C and
            # programming errors still propagate.
            print('访问异常,跳过~!')
            return None
        # Force GBK decoding instead of relying on the encoding requests guesses.
        response.encoding = 'gbk'
        return response.text
    18 
    19 
    def getlist(url):
        """Parse the chapter index page and queue every chapter for download.

        Side effects:
            * sets the globals ``txtname`` (book title with characters that are
              illegal in file names removed) and ``txtzz`` (first ``<p>`` of the
              ``#info`` block with non-breaking spaces removed);
            * appends one ``'href|title|index'`` string per chapter to the
              module-level ``urllist`` and prints it.

        Args:
            url: Address of the chapter list page.

        Raises:
            ValueError: if the index page could not be downloaded.
            IndexError: if the page has no ``#info`` title or paragraph.
        """
        global txtname, txtzz
        html = getsource(url)
        if html is None:
            # Without the index page there is nothing to parse; fail with a clear
            # message instead of an obscure parser error.
            raise ValueError('failed to download chapter list page: ' + url)
        ehtml = etree.HTML(html)
        hrefs = ehtml.xpath('//*[@id="list"]/dl/dd/a/@href')
        titles = ehtml.xpath('//*[@id="list"]/dl/dd/a/text()')

        # Strip every character that cannot appear in a file name, because the
        # title is later used as both a directory name and a file name.
        name = ehtml.xpath('//*[@id="info"]/h1/text()')[0]
        for forbidden in '\\/:*?"<>|':
            name = name.replace(forbidden, '')
        txtname = name
        txtzz = ehtml.xpath('//*[@id="info"]/p[1]/text()')[0].replace('\xa0', '')

        # The first 9 links are skipped -- presumably a "latest chapters" teaser
        # that duplicates entries further down; TODO confirm against the site.
        for num, (href, title) in enumerate(zip(hrefs[9:], titles[9:])):
            entry = href + '|' + title + '|' + str(num)
            urllist.append(entry)
            print(entry)
    37 
    38 
    def downtxt(url, max_retries=5):
        """Download one chapter and store it as ``<savepath><index>.txt``.

        Args:
            url: A ``'href|title|index'`` string.
            max_retries: How many times to try fetching the chapter before giving
                up; the original loop retried forever.

        Side effects:
            Writes the chapter file (title followed by the text) and increments
            the global ``downcount`` on success. When every attempt fails, a file
            containing only the title is still written so that any later step
            that reads the numbered files finds one for every index.
        """
        global downcount
        # The title may itself contain '|', so peel the href off the front and the
        # index off the back instead of splitting on every separator.
        u, rest = url.split('|', 1)
        t, num = rest.rsplit('|', 1)

        target = savepath + num + '.txt'
        # Check first: there is no point downloading a chapter we already have.
        if os.path.exists(target):
            print(num + '.txt 已经存在!')
            return

        content = ''
        for attempt in range(max_retries):
            html = getsource(u)
            ehtml = etree.HTML(html) if html else None
            if ehtml is not None:
                # Runs of spaces act as paragraph breaks; the remaining replaces
                # drop characters and site markers that should not end up in the
                # saved text. NOTE(review): the '\n' replacement is reconstructed
                # from a listing whose escapes were lost -- confirm it was not
                # '\r\n' in the author's version.
                content = (ehtml.xpath('string(//*[@id="content"])')
                           .replace('    ', '\n')
                           .replace('  ', '\n')
                           .replace('\xa0', '')
                           .replace('\ufffd', '')
                           .replace('\u266a', '')
                           .replace('readx;', ''))
            if content:
                break
            if attempt + 1 < max_retries:
                # Brief pause so a struggling server is not hammered in a tight loop.
                time.sleep(1)

        with codecs.open(target, 'a') as f:
            f.write('\n' + t + '\n' + content)

        if content:
            print(t + ' 下载完成!')
            # NOTE: incremented from several pool threads without a lock; the
            # counter is only used for the final summary line.
            downcount += 1
        else:
            print(t + ' 下载失败,已重试 ' + str(max_retries) + ' 次!')
    57 
    58 
    # ---------------------------------------------------------------------------
    # Script body: collect the chapter list, download all chapters in a thread
    # pool, merge the per-chapter files into one text file and print a summary.
    # ---------------------------------------------------------------------------
    time_start = time.time()
    downcount = 0
    urllist = []
    getlist(url)

    # downtxt() builds file names by plain string concatenation, so savepath must
    # end with a path separator; joining with '' appends exactly one, portably.
    savepath = os.path.join(os.getcwd(), txtname, '')
    if not os.path.exists(savepath):
        os.makedirs(savepath)

    pool = ThreadPool(multiprocessing.cpu_count())
    try:
        results = pool.map(downtxt, urllist)
    finally:
        # Always release the worker threads, even when a download raised.
        pool.close()
        pool.join()

    print('开始合并txt...')
    with codecs.open(savepath + txtname + '.txt', 'a') as f:
        f.write(txtname)
        f.write('\n')
        f.write(txtzz)
        f.write('\n')
        for i in range(len(urllist)):
            chapter_path = savepath + str(i) + '.txt'
            with open(chapter_path, "r") as fr:
                f.write(fr.read())
            f.write('===========================')
            # Delete only after the file is closed; removing an open file fails
            # on Windows.
            os.remove(chapter_path)
    print('小说合并完成~!')

    print('')
    print('*' * 15 + ' 任务完成,结果如下:' + '*' * 15)
    print('')
    print('<' + txtname + '> 下载完成' + ',获取并下载章节页面:' + str(downcount) + '')
    print('')
    print('耗时:' + str(time.time() - time_start) + ' s')
    print('')
    print('*' * 51)
  • 相关阅读:
    javaScript中的find()方法和返回数据的内存指向
    高级函数 filter map reduce 的使用
    for ... in and for ... of 理解
    git 解决冲突问题
    H5内唤醒百度、高德APP
    HTML 5标准中最新引入的template标签介绍
    jquery选择器使用
    ajax封装函数
    常用正则表达式
    JS-----事件、image对象
  • 原文地址:https://www.cnblogs.com/hfct/p/10977974.html
Copyright © 2011-2022 走看看