zoukankan      html  css  js  c++  java
  • 第10课-队列、使用多线程和队列的爬虫案例

    1、队列代码示例

    import threading
    import time
    from queue import Queue
    '''
    Queue是线程安全的队列
    '''
    def set_data(q):
        """Producer loop: enqueue 0, 1, 2, ... forever, one value every 3 s.

        Blocks on q.put() once the bounded queue is full, which is exactly
        the back-pressure behaviour the demo is meant to show.
        """
        counter = 0
        while True:
            q.put(counter)
            time.sleep(3)
            counter += 1
    
    def get_data(q):
        """Consumer loop: block on the queue and print each item forever."""
        # iter(callable, sentinel) keeps calling q.get(); the fresh object()
        # sentinel can never be produced by the queue, so this never stops.
        sentinel = object()
        for item in iter(q.get, sentinel):
            print(item)
    
    if __name__ == '__main__':
        # Demo: one producer and one consumer thread sharing a bounded,
        # thread-safe queue of capacity 4.
        q = Queue(4)
        t1 = threading.Thread(target=set_data, args=[q])
        t2 = threading.Thread(target=get_data, args=[q])
        t1.start()
        t2.start()

        # NOTE: rebinding q below does not affect the running threads --
        # they keep a reference to the first queue passed via args.
        q = Queue(1)
        q.put(1)
        q.get(timeout=1)
        print(q.empty())
        # BUG FIX: Queue.full() accepts no arguments; the original call
        # q.full(timeout=1) raised TypeError at runtime.
        print(q.full())
        print(q.qsize())
    

    2、斗图爬虫实战

    import requests
    import threading
    from queue import Queue
    from lxml import etree
    from urllib import  request
    
    # True while the link producer is still scraping pages; it flips this to
    # False when done so the downloader thread knows no more links will arrive.
    g_flag = True
    # Browser-like User-Agent so the site does not reject the requests.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
    }
    
    #爬取图片地址
    def put_picture_link(q):
        """Scrape pages 1-10 of doutula and enqueue every matching image URL.

        Clears the global g_flag once all pages have been processed, which
        signals the downloader thread that no further links will arrive.
        """
        global g_flag
        xpath_expr = ('//div[@class="col-sm-9 center-wrap"]/a'
                      '/div[@class="random_article"]/div/img [@class!="gif"]')
        for page in range(1, 11):
            page_url = "http://www.doutula.com/article/list/?page={}".format(page)
            response = requests.get(url=page_url, headers=HEADERS)
            document = etree.HTML(response.text)
            # The real image URL is lazy-loaded, hence data-original not src.
            for node in document.xpath(xpath_expr):
                q.put(node.xpath("@data-original")[0])
        g_flag = False
    
    
    #下载图片
    def download(q):
        """Download each image URL taken from the queue into c://pictures.

        Keeps consuming while the producer is still running (g_flag True) or
        while links remain queued; files are numbered sequentially, keeping
        the extension found in the URL.
        """
        from queue import Empty  # local import: file level only exposes Queue

        index = 1
        while g_flag or q.qsize() > 0:
            # BUG FIX: get(timeout=1) raises queue.Empty whenever the producer
            # is slower than this consumer; the original let that exception
            # kill the thread.  Retry instead -- the while condition decides
            # when to stop.
            try:
                img_link = q.get(timeout=1)
            except Empty:
                continue
            result = requests.get(url=img_link)
            if result.status_code == 200:
                # Extension comes from the URL itself (jpg/png/gif ...).
                append = img_link.split(".")[-1]
                with open("c://pictures/{}.{}".format(index, append), "wb") as fp:
                    fp.write(result.content)
                index += 1
    
    
    if __name__ == '__main__':
        # Wire up one link-producer thread and one downloader thread that
        # communicate through a bounded queue of at most 10 pending links.
        link_queue = Queue(10)
        producer = threading.Thread(target=put_picture_link, args=[link_queue])
        consumer = threading.Thread(target=download, args=[link_queue])
        producer.start()
        consumer.start()
        print("主线程执行完毕!!!")
    

    3、百思不得姐爬虫实战

    """百思不得姐爬虫实战"""
    import threading
    from lxml import etree
    import requests
    from queue import Queue
    import csv
    
    # Serialises writes to the shared csv writer across consumer threads.
    g_Lock = threading.Lock()
    # True while the producer is still fetching pages; consumers exit once it
    # is False and the content queue has been drained.
    g_flag = True
    
    # Site root, prepended to the relative links scraped from each post.
    DOMAIN = "http://www.budejie.com/"
    # Browser-like User-Agent so the site does not reject the requests.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
    }
    #生产者
    class Producer(threading.Thread):
        """Fetches joke pages from the URL queue and enqueues parsed jokes.

        Each joke is put on queue_content as a dict with keys "段子" (the
        joke text) and "链接" (its absolute URL).  When the URL queue is
        exhausted, the global g_flag is cleared so consumers know to drain
        the content queue and exit.
        """
        def __init__(self, queue_url, queue_content):
            super(Producer, self).__init__()
            self.__queue_url = queue_url          # page URLs still to fetch
            self.__queue_content = queue_content  # parsed jokes for consumers

        def run(self):
            global g_flag
            while self.__queue_url.qsize() > 0:
                url = self.__queue_url.get()
                text = requests.get(url=url, headers=HEADERS).text
                html = etree.HTML(text)
                contents = html.xpath('//div[@class="g-mn"]//div[@class="j-r-list"]//ul//div[@class="j-r-list-c-desc"]/a')

                for c in contents:
                    # BUG FIX: the original used replace(r"u200b", ""), which
                    # strips the literal five characters "u200b" from the joke
                    # text; the intent is to strip the zero-width-space
                    # character U+200B that the site embeds.
                    content = c.xpath("text()")[0].replace("\u200b", "")
                    link = DOMAIN + c.xpath("@href")[0]
                    self.__queue_content.put({"段子": content, "链接": link})

            g_flag = False

            print("-----------------------所有请求已完成---------------")
    
    #消费者
    class Consumer(threading.Thread):
        """Takes joke dicts off the content queue and writes them as CSV rows.

        Runs until the producer has finished (g_flag False) and the content
        queue is empty.  Writes are serialised with g_Lock because a single
        csv.DictWriter is shared by all consumer threads.
        """
        def __init__(self, queue_content, writer, i):
            super(Consumer, self).__init__()
            self.__queue_content = queue_content  # jokes produced by Producer
            self.__writer = writer                # shared csv.DictWriter
            self.__i = i                          # thread number, for logging

        def run(self):
            from queue import Empty  # local import: file level only exposes Queue

            print("----dddddddddddddd---")
            while True:
                if self.__queue_content.qsize() > 0 or g_flag:
                    try:
                        content_dict = self.__queue_content.get(timeout=1)
                        # BUG FIX: the original called g_Lock.acquire() then
                        # writerow(); if writerow raised, release() was
                        # skipped, the broad except swallowed the error, and
                        # the lock stayed held forever, deadlocking every
                        # other consumer.  "with" releases on any exit path.
                        # The except is also narrowed from Exception to the
                        # queue.Empty it was actually meant to handle.
                        with g_Lock:
                            self.__writer.writerow(content_dict)
                    except Empty as e:
                        print("队列为空{}".format(e))
                else:
                    break
                print("线程{}".format(self.__i), g_flag, self.__queue_content.qsize())
    
    
    
    if __name__ == '__main__':
        q_url = Queue(100)
        q_content = Queue(100)
        # Seed the work queue with the first 24 joke pages.
        for i in range(1, 25):
            q_url.put("http://www.budejie.com/text/{}".format(i))

        header = ["段子", "链接"]
        # BUG FIX: the original opened text.csv and never closed (or even
        # flushed) it, so buffered rows could be lost at interpreter exit.
        # The with-block plus joins keeps the file open until every thread
        # has finished writing, then closes it cleanly.
        with open("text.csv", "w", encoding="utf-8", newline="") as fp:
            writer = csv.DictWriter(fp, header)
            writer.writeheader()

            consumers = []
            for i in range(0, 1):
                c = Consumer(q_content, writer, i)
                c.start()
                consumers.append(c)
            p = Producer(q_url, q_content)
            p.start()

            # Producer clears g_flag when done; consumers then drain and exit.
            p.join()
            for c in consumers:
                c.join()
    

      

  • 相关阅读:
    BZOJ 1391: [Ceoi2008]order
    BZOJ 4504: K个串
    2019 年百度之星·程序设计大赛
    POJ 2398 Toy Storage (二分 叉积)
    POJ 2318 TOYS (二分 叉积)
    HDU 6697 Closest Pair of Segments (计算几何 暴力)
    HDU 6695 Welcome Party (贪心)
    HDU 6693 Valentine's Day (概率)
    HDU 6590 Code (判断凸包相交)
    POJ 3805 Separate Points (判断凸包相交)
  • 原文地址:https://www.cnblogs.com/win0211/p/12144549.html
Copyright © 2011-2022 走看看