zoukankan      html  css  js  c++  java
  • 利用python进行多线程爬虫

    import json
    import threading
    import time
    from queue import Empty, Queue

    import requests
    from lxml import etree
    
    
    class CrawlThread(threading.Thread):
        """Worker thread that downloads listing pages and queues the raw HTML.

        Page numbers are taken from ``page_queue``. For every page that is
        fetched successfully, a dict ``{"index": <page as str>, "text": <body>}``
        is put on ``data_queue``.

        Args:
            name: Thread name, used in the progress messages.
            page_queue: Queue of page numbers still to be fetched.
            data_queue: Queue that receives the downloaded pages.
        """

        # Seconds to wait for the server before giving up on a single page.
        REQUEST_TIMEOUT = 10

        def __init__(self, name, page_queue, data_queue):
            super(CrawlThread, self).__init__()
            self.name = name
            self.page_queue = page_queue
            self.data_queue = data_queue
            # URL template; the placeholder is filled with the page number.
            self.url = 'http://www.ifanjian.net/latest-{}'
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            }

        def run(self):
            """Fetch pages until ``page_queue`` is drained, then return."""
            print('%s ------- 线程启动' % self.name)
            while True:
                # A single non-blocking get replaces the empty()-then-get() pair:
                # another crawler could take the last page between those two
                # calls, which left this thread blocked in get() forever.
                try:
                    page = self.page_queue.get_nowait()
                except Empty:
                    break
                print("===采集===开始第%s页数据" % page)
                # Build the URL and send the request.
                url = self.url.format(page)
                time.sleep(1)
                try:
                    r = requests.get(
                        url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
                except requests.RequestException as exc:
                    # One failed page must not take the whole worker down.
                    print("===采集===第%s页请求失败: %s" % (page, exc))
                    continue
                # Hand the response body over to the parser threads.
                self.data_queue.put({
                    "index": str(page),
                    "text": r.text
                })
                print("===采集===结束第%s页数据" % page)

            print('%s ------- 线程结束' % self.name)
    
    
    class ParserThread(threading.Thread):
        """Worker thread that parses downloaded pages and appends them to a file.

        Items are read from ``data_queue`` as dicts with the keys ``"index"``
        and ``"text"``. Each page is parsed into a list of title/image records
        and written to ``fp`` as one JSON line.

        Args:
            name: Thread name, used in the progress messages.
            data_queue: Queue delivering the downloaded pages.
            fp: Writable text file object shared by all parser threads.
            lock: Lock that serialises writes to ``fp``.
        """

        # Seconds to wait for new data before the thread stops.
        QUEUE_TIMEOUT = 10

        def __init__(self, name, data_queue, fp, lock):
            super(ParserThread, self).__init__()
            self.name = name
            self.data_queue = data_queue
            self.fp = fp
            self.lock = lock

        def run(self):
            """Parse queued pages until no data arrives within the timeout."""
            print('%s ------- 线程启动' % self.name)
            while True:
                # Only a timed-out get ends the loop; parse errors are handled
                # separately below so they are not mistaken for "queue drained".
                try:
                    data = self.data_queue.get(True, self.QUEUE_TIMEOUT)
                except Empty:
                    break
                print("===解析===开始第%s页数据" % data["index"])
                try:
                    self.parse_content(data['text'])
                except Exception as exc:
                    # Worker-loop boundary: report the bad page and keep going
                    # instead of silently terminating the thread.
                    print("===解析===第%s页解析失败: %s" % (data["index"], exc))
                    continue
                print("===解析===结束第%s页数据" % data["index"])

            print('%s ------- 线程结束' % self.name)

        def parse_content(self, data):
            """Extract title and image URLs from one page and write them out.

            Args:
                data: HTML source of a listing page.

            Every ``<li>`` under ``ul.cont-list`` becomes a dict with the keys
            ``'标题'`` (the first title text, or ``None`` when the entry has
            none) and ``'图片链接'`` (list of ``data-src`` values). The list of
            dicts is appended to ``self.fp`` as a single JSON line.
            """
            tree = etree.HTML(data)
            # NOTE(review): etree.HTML may return None for an empty document
            # in some lxml versions - treated here as a page without entries.
            li_list = (
                tree.xpath("//ul[@class='cont-list']/li")
                if tree is not None else []
            )
            items = []
            for li in li_list:
                titles = li.xpath(".//h2[@class='cont-list-title']/a/text()")
                img_url = li.xpath(
                    ".//div[contains(@class,'cont-list-main')]//img/@data-src")
                items.append({
                    # An entry without a title no longer raises IndexError.
                    '标题': titles[0] if titles else None,
                    '图片链接': img_url
                })

            # The with-block releases the lock even if the write fails.
            with self.lock:
                self.fp.write(json.dumps(items, ensure_ascii=False) + '\n')
    
    
    # Holds the crawler threads created by create_crawl_thread()
    g_crawl_list = []
    # Holds the parser threads created by create_parser_thread()
    g_parser_list = []
    
    
    def create_crawl_thread(page_queue, data_queue):
        """Create the three crawler threads and register them in g_crawl_list.

        The threads are only constructed here; they are not started.

        Args:
            page_queue: Queue of page numbers handed to every crawler.
            data_queue: Queue into which the crawlers put downloaded pages.
        """
        g_crawl_list.extend(
            CrawlThread(label, page_queue, data_queue)
            for label in ('采集线程1', '采集线程2', '采集线程3')
        )
    
    
    def create_parser_thread(data_queue, fp, lock):
        """Create the three parser threads and register them in g_parser_list.

        The threads are only constructed here; they are not started.

        Args:
            data_queue: Queue from which the parsers read downloaded pages.
            fp: Output file object passed to every parser.
            lock: Lock passed to every parser alongside ``fp``.
        """
        g_parser_list.extend(
            ParserThread(label, data_queue, fp, lock)
            for label in ('解析线程1', '解析线程2', '解析线程3')
        )
    
    
    def create_queue(first_page=1, last_page=9):
        """Build the two work queues shared by the worker threads.

        Args:
            first_page: First page number to crawl (inclusive). Defaults to 1.
            last_page: Last page number to crawl (inclusive). Defaults to 9,
                which matches the previously hard-coded ``range(1, 10)``.

        Returns:
            A ``(page_queue, data_queue)`` tuple. ``page_queue`` is pre-filled
            with the page numbers in ascending order; ``data_queue`` is empty.
        """
        page_queue = Queue()
        for page in range(first_page, last_page + 1):
            page_queue.put(page)

        data_queue = Queue()

        return page_queue, data_queue
    
    
    def main():
        """Run the crawl: build the queues, start all workers, wait for them.

        Parsed results are appended to ``jian.json`` in the current directory.
        """
        # Create the page and data queues.
        page_queue, data_queue = create_queue()
        # Lock passed to the parser threads for writing the shared file.
        lock = threading.Lock()

        # The with-block closes the file even if creating, starting or joining
        # a thread raises; the original leaked the handle in that case.
        with open('jian.json', 'a', encoding='utf-8') as fp:
            create_crawl_thread(page_queue, data_queue)
            create_parser_thread(data_queue, fp, lock)

            # Start every crawler, then every parser.
            for tcrawl in g_crawl_list:
                tcrawl.start()
            for tparser in g_parser_list:
                tparser.start()

            # The main thread waits for all workers before the file is closed.
            for tcrawl in g_crawl_list:
                tcrawl.join()
            for tparser in g_parser_list:
                tparser.join()

        print("主线程和子线程全部结束.....")
    
    
    # Run the crawler only when this file is executed as a script.
    if __name__ == "__main__":
        main()
  • 相关阅读:
    设计模式实战应用之五:工厂方法模式
    Codeforces445A_DZY Loves Chessboard(预处理)
    void f(int(&p)[3]){} 和void f(int(*p)[3]){}的差别
    《Linux 内核完全剖析》 mktime.c
    Java中对象、对象引用、堆、栈、值传递以及引用传递的详解
    android 仿ios开关控件
    ViewDragHelper实战 自己打造Drawerlayout
    [javase学习笔记]-8.5 static关键字的使用场景
    玩转图片Base64编码
    Android Studio ViewPagerIndicator的使用
  • 原文地址:https://www.cnblogs.com/KruceCoder/p/12076673.html
Copyright © 2011-2022 走看看