zoukankan      html  css  js  c++  java
  • Python XPath抓取小说《三国演义》 《三》 多线程简单实例

    增加多线程抓取数据,增加url判断,若数据已抓取,不再重复抓取  (可参考URL管理器)

    还需要添加任务队列来限制并发线程数,否则所有章节的线程会同时启动

    from lxml import etree
    import requests
    import time
    import os
    import random
    import urllib3
    from multiprocessing import Pool
    import _thread
    import threading
    
    
    def getHeaders():
        """Build request headers with a randomly picked User-Agent.

        Rotating the UA makes successive requests look less uniform to the
        server; ``Connection: close`` keeps sockets from lingering between
        fetches.
        """
        ua_pool = ['Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
                   'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
                   'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
                   'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36',
                   'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
                   'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6',
                   'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
                   'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'
                   ]
        return {
            'User-Agent': random.choice(ua_pool),
            'Connection': 'close',
        }
    
    """
    request请求头
    """
    def getRequestHtml(target):
        """Fetch *target* and return its HTML text, or None on connection failure.

        The site serves GB2312-encoded pages, so the encoding is forced
        (requests cannot always detect it from the response).
        """
        # Configure retries and suppress the "verify=False" warnings BEFORE
        # issuing the request — in the original these ran after requests.get,
        # so they only affected later calls.
        requests.adapters.DEFAULT_RETRIES = 5
        urllib3.disable_warnings()
        try:
            req = requests.get(url=target, headers=getHeaders(), verify=False, proxies=None)
            req.encoding = "gb2312"
            return req.text
        except requests.exceptions.ConnectionError:
            # The original assigned req.status_code on the string "" here,
            # which raised AttributeError; return None so callers can tell
            # the fetch failed.
            return None
    
    
    """
    获取章节列表和地址
    """
    def getContents(target, filePath):
        """Fetch the table-of-contents page and return the chapter <a> elements.

        Each returned element carries the chapter title (``.text``) and the
        relative chapter URL (``.get('href')``).  *filePath* is unused here
        but kept for interface compatibility with the caller.
        """
        page = getRequestHtml(target)
        tree = etree.HTML(page)
        # the chapter links sit in a deeply nested table on the TOC page
        return tree.xpath('//table[9]//tr[1]//td[2]//table[4]//tr[1]//td[1]//table[1]//a')
    
    """
    获取小说内容
    """
    def getContent(filePath, title, target):
        """Download one chapter page, extract its text nodes, and save them.

        Delegates the fetch to getRequestHtml and the write to saveData.
        """
        page = getRequestHtml(target)
        tree = etree.HTML(page)
        # chapter body: all text nodes inside the 5th table's second cell
        chapter_lines = tree.xpath('//table[5]//tr[1]//td[2]//text()')
        saveData(filePath, title, chapter_lines)
    
    
    """
    将小说内容写入到文件
    """
    
    
    def saveData(filepath, name, text):
        """Write one chapter to ``<filepath><name>.txt`` as UTF-8.

        filepath: directory prefix (expected to end with a path separator);
                  created if it does not exist.
        name:     chapter title, used as the file name stem.
        text:     iterable of strings — the chapter's text fragments.
        """
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(filepath, exist_ok=True)

        url = filepath + name + ".txt"
        with open(url, mode="w", encoding="UTF-8") as f:
            f.writelines(text)
            # Original literal was garbled by the page extraction (a raw line
            # break inside single quotes — a syntax error); the intent was a
            # trailing blank line after the chapter text.
            f.write('\n\n')
    
    
    
    class myThread(threading.Thread):
        """Worker thread that downloads and saves a single chapter."""

        def __init__(self, filePath, title, url):
            super().__init__()
            # target directory, chapter title, and absolute chapter URL
            self.filePath = filePath
            self.title = title
            self.url = url

        def run(self):
            # invoked by start(); does the actual fetch + save
            getContent(self.filePath, self.title, self.url)
    
    
    if __name__ == '__main__':
        # Table-of-contents URL for "Romance of the Three Kingdoms"
        target = "https://www.kanunu8.com/files/old/2011/2447.html"
        # Original literal ended with a single backslash ("...三国演义\"),
        # which escapes the closing quote — a syntax error. Backslashes in a
        # Windows path must be doubled inside a normal string literal.
        filePath = "D:\\小说\\三国演义\\"
        # chapter <a> elements: title text + relative href
        title_list = getContents(target, filePath)

        t_start = time.time()
        threadlist = []
        for t in title_list:
            title = t.text
            url = "https://www.kanunu8.com/files/old/2011/" + t.get('href')
            print(title, url)

            # skip chapters already fetched on a previous run
            if os.path.isfile(filePath + title + ".txt"):
                print("该文件已经存在 不需要再次抓取")
                continue

            try:
                thread1 = myThread(filePath, title, url)
                # Daemon threads are killed when the main thread exits (they
                # are NOT waited for) — the explicit join() below is what
                # actually waits.  setDaemon() is deprecated since Python
                # 3.10; assign the attribute instead.
                thread1.daemon = True
                thread1.start()
                threadlist.append(thread1)
            except RuntimeError:
                # Thread.start() raises RuntimeError when the thread cannot
                # be started; the original bare except hid all other bugs.
                print("无法启动线程")

        # block until every download thread has finished
        for tt in threadlist:
            tt.join()

        t_end = time.time()

        print('抓取本书耗时= %s' % (t_end - t_start))
    

      

  • 相关阅读:
    利用python将表格中的汉字转化为拼音
    vi中批量加注释
    Xtrabackup
    mydumper下载安装
    Adobe Acrobat Pro DC破解
    InnoDB关键特性之double write
    聚集索引与非聚集索引
    has the wrong structure
    初学者如何理解网络协议
    电脑重装系统之后,删除之前的系统
  • 原文地址:https://www.cnblogs.com/dangzhengtao/p/12218897.html
Copyright © 2011-2022 走看看