zoukankan      html  css  js  c++  java
  • 90分钟掌握Python多线程爬虫(全程实战)

    https://edu.csdn.net/learn/20379?spm=1002.2001.3001.4157

    #encoding: utf-8
    
    import requests
    from bs4 import BeautifulSoup
    from urllib import request
    import os
    import threading
    
    # Send a browser-style User-Agent so the request looks like it comes from a regular browser.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }
    
    # Shared work lists: page URLs still to be scraped, and image URLs still to be downloaded.
    PAGE_URLS = []
    IMG_URLS = []
    # Lock taken by the worker threads while they pop from the shared lists.
    gLock = threading.Lock()
    
    # Producer: dedicated to collecting the URLs of meme images.
    # Consumer: dedicated to downloading the images behind those URLs.
    # Global state: a list that stores the collected meme image URLs.
    
    def producer():
        """Scrape listing pages from PAGE_URLS and queue image URLs in IMG_URLS.

        Repeatedly pops one page URL (under gLock), downloads the page, and
        collects the ``data-original`` attribute of every matching ``<img>``
        tag. Returns when PAGE_URLS is empty.

        A page that cannot be fetched is reported and skipped so a single
        network failure does not terminate the worker thread.
        """
        while True:
            # Hold the lock only while touching the shared page list.
            with gLock:
                if not PAGE_URLS:
                    break
                page_url = PAGE_URLS.pop()

            try:
                # A timeout keeps one stalled connection from blocking this worker forever.
                response = requests.get(page_url, headers=headers, timeout=10)
            except requests.RequestException:
                # Same failure banner the image downloader prints.
                print("="*30)
                print(page_url)
                print("=" * 30)
                continue

            soup = BeautifulSoup(response.text, 'lxml')
            img_list = soup.find_all("img", attrs={"class": "img-responsive lazy image_dta"})

            found = []
            for img in img_list:
                # Tags without the lazy-load attribute carry nothing to download.
                img_url = img.get('data-original')
                if not img_url:
                    continue
                # Some URLs are protocol-relative ("//host/path"); give them a scheme
                # so they can be downloaded.
                if img_url.startswith("//"):
                    img_url = "https:" + img_url
                found.append(img_url)

            # Publish the whole batch under the same lock consumers use to pop.
            with gLock:
                IMG_URLS.extend(found)
    
    def consumer():
        """Download every image URL queued in IMG_URLS into the ``images`` directory.

        Repeatedly pops one URL (under gLock) and saves it under the last path
        segment of the URL, e.g.
        ``https://ws2.sinaimg.cn/bmiddle/9150e4e5gy1g0saavmreuj20250250sh.jpg``
        is stored as ``images/9150e4e5gy1g0saavmreuj20250250sh.jpg``.
        Returns once both IMG_URLS and PAGE_URLS are empty.

        NOTE(review): the exit test can succeed while a producer is still
        parsing a page it has already popped; image URLs appended after that
        point are not downloaded by this worker.
        """
        # Imported locally so this block does not rely on a file-level import of time.
        import time

        # urlretrieve does not create missing directories.
        os.makedirs("images", exist_ok=True)

        while True:
            with gLock:
                if not IMG_URLS and not PAGE_URLS:
                    break
                img_url = IMG_URLS.pop() if IMG_URLS else ''

            if not img_url:
                # Nothing queued yet but pages remain: wait briefly instead of
                # spinning on the lock at full speed.
                time.sleep(0.1)
                continue

            # The file name is the last "/"-separated segment of the URL;
            # os.path.join picks the right separator for the current OS.
            filename = img_url.split("/")[-1]
            fullpath = os.path.join("images", filename)
            try:
                request.urlretrieve(img_url, fullpath)
            except Exception:
                # Best effort: report the failing URL and keep going. Unlike a bare
                # except, this lets KeyboardInterrupt/SystemExit propagate.
                print("="*30)
                print(img_url)
                print("=" * 30)
            else:
                print("%s下载完成" % filename)
    
    def main():
        """Queue the listing pages, then start the scraper worker threads.

        Pages 1 through 99 are appended to PAGE_URLS in ascending order, after
        which five producer threads and then five consumer threads are started.
        The threads are not joined.
        """
        # Step 1: build the URL of every listing page.
        PAGE_URLS.extend(
            "https://www.doutula.com/photo/list/?page=" + str(page_no)
            for page_no in range(1, 100)
        )

        # Step 2: five producers first, then five consumers.
        for worker in (producer,) * 5 + (consumer,) * 5:
            threading.Thread(target=worker).start()
    
    # Start the scraper only when this file is executed as a script.
    if __name__ == '__main__':
        main()

    多线程优化:

    #encoding: utf-8
    import time
    import threading
    import random
    
    # Shared balance that producer threads add to and consumer threads subtract from.
    gMoney = 0
    # Whenever a global variable is modified from multiple threads, the operation must hold a lock.
    gLock = threading.Lock()
    
    def greet(index):
        """Print ``helloworld-<index>`` and then pause for half a second.

        index: number interpolated into the message with ``%d``.
        """
        message = "helloworld-%d" % index
        print(message)
        # The pause stands in for work, so sequential and threaded runs differ visibly.
        time.sleep(0.5)
    
    
    def line_run():
        """Call greet for indices 0 through 4, one after another on the calling thread."""
        index = 0
        while index < 5:
            greet(index)
            index += 1
    
    def thread_run():
        """Start five threads, each running greet with its own index (0 through 4).

        The threads are started in index order and are not joined.
        """
        for index in range(5):
            worker = threading.Thread(target=greet, args=(index,))
            worker.start()
    
    def produter():
        """Add a random amount (0-100) to the shared balance gMoney, forever.

        Each iteration picks an amount, adds it to gMoney while holding gLock,
        prints the amount together with the resulting balance, and sleeps for
        half a second. The function never returns.
        """
        global gMoney
        while True:
            money = random.randint(0,100)
            with gLock:
                gMoney += money
                # Snapshot the balance while still holding the lock so the printed
                # value is the result of this deposit, not of a concurrent update.
                balance = gMoney
            print("%s生产者生产了%s元钱,剩余%s元钱"%(threading.current_thread(),money,balance))
            time.sleep(0.5)
    
    def consumer():
        """Try to spend a random amount (0-100) from the shared balance gMoney, forever.

        Each iteration picks an amount and, while holding gLock, either
        subtracts it from gMoney and reports the purchase, or reports that the
        balance is insufficient. It then sleeps for half a second. The function
        never returns.
        """
        global gMoney
        while True:
            money = random.randint(0,100)
            # The with-statement releases the lock even if something inside raises,
            # so a failure here cannot leave the other threads blocked.
            with gLock:
                if gMoney >= money:
                    gMoney -= money
                    print("%s消费者消费了%s元钱,剩余%s元钱"%(threading.current_thread(),money,gMoney))
                else:
                    print("%s消费者想消费%s元钱,但是余额不足!剩余%s元钱!"%(threading.current_thread(),money,gMoney))
            time.sleep(0.5)
    
    
    
    if __name__ == '__main__':
        # The sequential and threaded greeting demos are left disabled:
        # line_run()
        # thread_run()

        # Start five producer threads first, then five consumer threads.
        for target in (produter, consumer):
            for x in range(5):
                th = threading.Thread(target=target)
                th.start()
  • 相关阅读:
    python-历史
    10-函数命名空间,作用域,嵌套,闭包
    centos7 搭建dns服务器
    centos7 搭建dhcp服务器
    Nginx 启用 gzip 压缩
    Eclipse 个人手册
    Nginx 命令
    定时任务
    系统设计
    根据 xsd 生成 jaxb java 类
  • 原文地址:https://www.cnblogs.com/yszr/p/14981655.html
Copyright © 2011-2022 走看看