zoukankan      html  css  js  c++  java
  • <爬虫>多线程爬取

    一、线程回顾

    import time
    import threading
    
    '''一、一个主线程'''
    # def sing():
    #     for i in range(1,6):
    #         print('come baby 跟我一起 嗨 嗨 嗨 !!!')
    #         time.sleep(1)
    #
    # def dance():
    #     for i in range(1,6):
    #         print('恰恰 肚皮 钢管舞 哈哈哈哈哈 ...... ')
    #         time.sleep(1)
    
    # def main():
    #     sing()
    #     dance()
    
    # if __name__ == '__main__':
    #     main()
    
    '''二、面向过程创建线程:一个主线程,两个子线程'''
    # def sing(a):
    #     for i in range(1,6):
    #         print('当前线程:%s ...come %s 跟我一起 嗨 嗨 嗨 !!!' %(threading.current_thread().name,a))
    #         time.sleep(1)
    
    # def dance(a):
    #     for i in range(1,6):
    #         print('当前线程:%s ... 恰恰 肚皮 钢管舞 %s你要哪一种 ' %(threading.current_thread().name,a))
    #         time.sleep(1)
    
    # def main():
    #     print('...联欢晚会现在开始...')
    #     #创建唱歌线程
    #     a = '悟空'
    #     t_sing = threading.Thread(target=sing,name='唱歌',args=(a,))
    #
    #     # 创建跳舞线程
    #     t_dance = threading.Thread(target=dance, name='跳舞',args=(a,))
    #
    #     #启动线程
    #     t_sing.start()
    #     t_dance.start()
    #
    #     #让主线程等待子线程执行完毕
    #     t_sing.join()
    #     t_dance.join()
    #
    #     print('晚会结束,各回各家')
    # if __name__ == '__main__':
    #     main()
    
    '''三、面向对象创建线程'''
    
    #写一个类,继承threading.Thread
    class SingThread(threading.Thread):
        """Demo worker thread that 'sings': announces its name and argument,
        then prints one line per second, five times."""

        def __init__(self, name, a):
            super().__init__()
            # Thread name plus an extra payload, both echoed by run().
            self.name = name
            self.a = a

        def run(self):
            print("线程名:%s  参数:%s" % (self.name, self.a))
            for _ in range(5):
                print('爱江山更爱美人...')
                time.sleep(1)
    
    class DanceThread(threading.Thread):
        """Demo worker thread that 'dances': announces its name and argument,
        then prints a beat once per second for five seconds."""

        def __init__(self, name, a):
            super().__init__()
            self.name = name  # thread name shown in the startup line
            self.a = a        # extra argument shown in the startup line

        def run(self):
            print("线程名:%s  参数:%s" % (self.name, self.a))
            beats = 0
            while beats < 5:
                print('蹦擦擦,蹦擦擦...')
                time.sleep(1)
                beats += 1
    
    def main():
        """Start the singing and dancing threads, then wait for both to finish."""
        performers = [SingThread('', '八戒'), DanceThread('', '悟能')]

        # Kick everyone off first, then block until all are done.
        for t in performers:
            t.start()
        for t in performers:
            t.join()

    if __name__ == '__main__':
        main()

    二、队列

    from queue import Queue
    
    # Demonstrate the thread-safe FIFO Queue: capacity checks, put and get.
    q = Queue(5)       # queue with room for 5 items
    print(q.empty())   # True: nothing stored yet
    
    # Fill every slot; insertion order is preserved (first in, first out).
    for player in ('浓眉哥', '勒布朗', '丹尼*格林', '库兹马', '麦基'):
        q.put(player)
    print(q.full())    # True: all 5 slots taken
    print(q.qsize())   # number of items currently queued
    # q.put('波普', False)    # raises queue.Full immediately when full
    # q.put('波普', True, 3)  # waits up to 3 s for a free slot, then raises
    
    # Drain the queue: items come back in the order they were put in.
    for _ in range(5):
        print(q.get())
    # q.get(False)    # raises queue.Empty immediately when empty
    # q.get(True, 3)  # waits up to 3 s for an item, then raises

    三、多线程爬虫

    import json
    import threading
    import time
    from queue import Empty, Queue
    
    import requests
    from lxml import etree
    
    # Registry of crawl (fetch) threads; populated by create_crawl_thread.
    crawl_thread_list = []
    # Registry of parse threads; populated by create_parse_thread.
    parse_thread_list = []
    
    def create_queue():
        """Build the two work queues used by the crawler.

        Returns a (page_queue, data_queue) pair: page_queue is pre-loaded with
        page numbers 1-5, data_queue starts empty and will receive raw HTML
        from the crawl threads.
        """
        page_queue = Queue()
        for page_number in range(1, 6):
            page_queue.put(page_number)

        data_queue = Queue()
        return page_queue, data_queue
    
    class CrawlThread(threading.Thread):
        """Fetch worker: pulls page numbers from page_queue, downloads each
        page, and pushes the raw HTML text into data_queue.

        :param name: human-readable thread name used in the log lines
        :param page_queue: Queue of page numbers (ints) still to crawl
        :param data_queue: output Queue that receives each page's HTML text
        """

        def __init__(self, name, page_queue, data_queue):
            super(CrawlThread, self).__init__()
            self.name = name
            self.page_queue = page_queue
            self.data_queue = data_queue
            self.url = 'http://www.fanjian.net/jiantu-{}'
            self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

        def run(self):
            print('%s启动......' % self.name)
            while 1:
                # Non-blocking get: several crawl threads share page_queue, so
                # an empty() check followed by get() is racy — another thread
                # can drain the queue in between. queue.Empty ends this worker.
                try:
                    page = self.page_queue.get(block=False)
                except Empty:
                    break
                # Build the page URL and fetch it (timeout so a dead server
                # cannot hang the thread forever).
                url = self.url.format(page)
                r = requests.get(url=url, headers=self.headers, timeout=10)
                # Hand the raw HTML to the parse threads.
                self.data_queue.put(r.text)
                # BUG FIX: the original had an unconditional `break` here, so
                # each crawl thread fetched only ONE page and the remaining
                # pages were silently dropped.
            print('%s结束......' % self.name)
    
    class ParseThread(threading.Thread):
        """Parse worker: takes raw HTML off data_queue, extracts every image's
        title and URL, and appends the records to a shared file under a lock.

        :param name: thread name used in the log lines
        :param data_queue: input Queue of raw HTML pages from the crawl threads
        :param fp: shared output file object; writes are serialized via `lock`
        :param lock: threading.Lock guarding fp against interleaved writes
        """

        def __init__(self, name, data_queue, fp, lock):
            super(ParseThread, self).__init__()
            self.name = name
            self.data_queue = data_queue
            self.fp = fp
            self.lock = lock

        def parse_content(self, data):
            """Extract {'标题': ..., '链接': ...} records from one HTML page
            and append them to the shared output file."""
            tree = etree.HTML(data)
            # Find every <li>, then query *within that li* for its own title
            # and image URL.
            li_list = tree.xpath('//ul[@class="cont-list"]/li')
            items = []
            for li in li_list:
                # BUG FIX: the original used absolute '//' XPath (and queried
                # `tree` for the URL), so every item received the FIRST
                # title/image of the whole page. './/' scopes the lookup to
                # the current li.
                titles = li.xpath('.//h2/a/text()')
                urls = li.xpath('.//div[@class="cont-list-main"]/p/img/@data-src')
                if not titles or not urls:
                    continue  # skip malformed entries instead of IndexError
                # BUG FIX: the original wrote {'标题',title,'链接',url}, which
                # builds a *set*; a dict was intended.
                items.append({'标题': titles[0], '链接': urls[0]})

            # Serialize file writes across parse threads.
            with self.lock:
                for item in items:
                    self.fp.write(json.dumps(item, ensure_ascii=False) + '\n')

        def run(self):
            print('%s启动......' % self.name)
            while 1:
                try:
                    # BUG FIX: the original get() blocked forever once the
                    # crawlers finished, so main()'s join() never returned.
                    # A timeout lets the worker drain the queue and exit.
                    data = self.data_queue.get(block=True, timeout=10)
                except Empty:
                    break
                self.parse_content(data)
            print('%s结束......' % self.name)
    
    
    def create_crawl_thread(page_queue, data_queue):
        """Instantiate the three fetch workers and register them in
        crawl_thread_list; main() starts and joins them later."""
        for worker_name in ('采集1号', '采集2号', '采集3号'):
            crawl_thread_list.append(CrawlThread(worker_name, page_queue, data_queue))
    
    def create_parse_thread(data_queue, fp, lock):
        """Instantiate the three parse workers and register them in
        parse_thread_list; main() starts and joins them later."""
        for worker_name in ('解析1号', '解析2号', '解析3号'):
            parse_thread_list.append(ParseThread(worker_name, data_queue, fp, lock))
    
    def main():
        """Wire the crawler together: build the queues, open the output file,
        create both thread pools, run them to completion."""
        # page_queue feeds the crawl threads; data_queue links them to parsers.
        page_queue, data_queue = create_queue()

        # BUG FIX: the original opened fp and closed it only on the happy
        # path; `with` guarantees the file is closed even if start/join raises.
        with open('jiantu.txt', 'a', encoding='utf8') as fp:
            # Lock serializing writes to fp across the parse threads.
            lock = threading.Lock()

            create_crawl_thread(page_queue, data_queue)
            create_parse_thread(data_queue, fp, lock)

            # Start all crawl threads first, then all parse threads.
            for worker in crawl_thread_list + parse_thread_list:
                worker.start()

            # Wait for every worker before closing the output file.
            for worker in crawl_thread_list:
                worker.join()
            for worker in parse_thread_list:
                worker.join()

        print('主线程执行完毕!')

    if __name__ == '__main__':
        main()
  • 相关阅读:
    leetcode 18 4Sum
    leetcode 71 Simplify Path
    leetcode 10 Regular Expression Matching
    leetcode 30 Substring with Concatenation of All Words
    leetcode 355 Design Twitter
    leetcode LRU Cache
    leetcode 3Sum
    leetcode Letter Combinations of a Phone Number
    leetcode Remove Nth Node From End of List
    leetcode Valid Parentheses
  • 原文地址:https://www.cnblogs.com/Finance-IT-gao/p/11146517.html
Copyright © 2011-2022 走看看