zoukankan      html  css  js  c++  java
  • 90分钟掌握Python多线程爬虫(全程实战)

    https://edu.csdn.net/learn/20379?spm=1002.2001.3001.4157

    #encoding: utf-8
    
    import requests
    from bs4 import BeautifulSoup
    from urllib import request
    import os
    import threading
    
    # Disguise the request as coming from a regular desktop browser.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }
    
    # Holds the URLs of all listing pages that still have to be fetched.
    PAGE_URLS = []
    # Holds the image URLs extracted from the listing pages, waiting to be downloaded.
    IMG_URLS = []
    # Lock acquired by the worker threads around their checks/pops on the two lists above.
    gLock = threading.Lock()
    
    # Producer: dedicated to collecting the URLs of the meme images.
    # Consumer: dedicated to downloading the images from those URLs.
    # Global variable: a list that stores the collected meme image links.
    
    def producer():
        """Worker that turns listing-page URLs into image URLs.

        Repeatedly pops one URL from the shared ``PAGE_URLS`` list, downloads
        that page, and queues the ``data-original`` attribute of every matching
        ``<img>`` tag on ``IMG_URLS``. Returns once ``PAGE_URLS`` is empty.

        A page that cannot be fetched is reported and skipped so that a single
        network error does not terminate the worker.
        """
        # Function-scope import so this block does not depend on a change to
        # the file-level import list.
        from urllib.parse import urljoin

        while True:
            # ``with`` releases the lock on every exit path, including errors.
            with gLock:
                if not PAGE_URLS:
                    break
                page_url = PAGE_URLS.pop()

            try:
                # A timeout keeps a stalled server from hanging this thread forever.
                response = requests.get(page_url, headers=headers, timeout=10)
            except requests.RequestException as exc:
                print("failed to fetch %s: %s" % (page_url, exc))
                continue

            soup = BeautifulSoup(response.text, 'lxml')
            img_list = soup.find_all("img", attrs={"class": "img-responsive lazy image_dta"})

            found = []
            for img in img_list:
                # Skip tags that carry no lazy-load source instead of raising KeyError.
                img_url = img.get('data-original')
                if not img_url:
                    continue
                # Some values lack the "http(s):" prefix; resolving against the
                # page URL supplies the missing scheme and leaves absolute URLs alone.
                found.append(urljoin(page_url, img_url))

            # Publish the batch under the same lock the consumers hold when popping.
            with gLock:
                IMG_URLS.extend(found)
    
    def consumer():
        """Worker that downloads queued image URLs into the ``images`` directory.

        Pops URLs from the shared ``IMG_URLS`` list and saves each one under
        ``images/<last path segment of the URL>``. Returns when both
        ``IMG_URLS`` and ``PAGE_URLS`` are empty.

        NOTE: a producer removes a page from ``PAGE_URLS`` before it has
        fetched and parsed it, so a consumer can observe both lists empty and
        exit while that last page is still in flight; its images may then be
        left undownloaded.
        """
        # Function-scope import so this block does not depend on a change to
        # the file-level import list.
        import time

        # urlretrieve does not create missing directories.
        os.makedirs("images", exist_ok=True)

        while True:
            # ``with`` releases the lock on every exit path, including errors.
            with gLock:
                if not IMG_URLS and not PAGE_URLS:
                    break
                img_url = IMG_URLS.pop() if IMG_URLS else ''

            if not img_url:
                # Pages remain but nothing is queued yet: back off briefly
                # rather than spinning on the lock.
                time.sleep(0.1)
                continue

            # Protocol-relative links ("//host/path") have no scheme and would
            # be rejected by urlretrieve.
            if img_url.startswith("//"):
                img_url = "https:" + img_url

            # e.g. https://ws2.sinaimg.cn/bmiddle/9150e4e5gy1g0saavmreuj20250250sh.jpg
            # splits into ['https:', '', 'ws2.sinaimg.cn', 'bmiddle', '9150e4e5gy1g0saavmreuj20250250sh.jpg'];
            # the last element is used as the file name. os.path.join picks the
            # path separator appropriate for the current platform.
            filename = img_url.split("/")[-1]
            fullpath = os.path.join("images", filename)
            try:
                request.urlretrieve(img_url, fullpath)
                print("%s下载完成" % filename)
            except Exception as exc:
                # Best effort: report the failing URL together with the reason
                # and keep the worker alive for the remaining images.
                print("="*30)
                print(img_url)
                print(exc)
                print("=" * 30)
    
    def main():
        """Seed the page queue and launch the worker threads.

        Appends the listing-page URLs for pages 1 through 99 to ``PAGE_URLS``,
        then starts five ``producer`` threads followed by five ``consumer``
        threads. The threads are not joined here.
        """
        # 1. Queue every listing page first.
        base_url = "https://www.doutula.com/photo/list/?page="
        PAGE_URLS.extend(base_url + str(page_no) for page_no in range(1, 100))

        # 2. Five producer threads, then five consumer threads.
        for worker in (producer, consumer):
            for _ in range(5):
                threading.Thread(target=worker).start()
    
    # Start the crawler only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == '__main__':
        main()

    多线程优化(使用 threading.Lock 保护全局变量):

    #encoding: utf-8
    import time
    import threading
    import random
    
    # Shared balance that the producer and consumer threads below add to and subtract from.
    gMoney = 0
    # Whenever a global variable is operated on from multiple threads, the operation must be done under a lock.
    gLock = threading.Lock()
    
    def greet(index):
        """Print ``helloworld-<index>`` and then pause for half a second."""
        message = "helloworld-%d" % index
        print(message)
        time.sleep(0.5)
    
    
    def line_run():
        """Call ``greet`` for indices 0 through 4, one after another, on the calling thread."""
        index = 0
        while index < 5:
            greet(index)
            index += 1
    
    def thread_run():
        """Start five threads, each running ``greet`` with its own index from 0 to 4."""
        for index in range(5):
            worker = threading.Thread(target=greet, args=(index,))
            worker.start()
    
    def produter():
        """Endlessly deposit random amounts into the shared balance.

        Each iteration picks a random integer between 0 and 100 (inclusive),
        adds it to the global ``gMoney`` while holding ``gLock``, prints the
        deposit with the resulting balance, and sleeps for half a second.
        This function never returns.
        """
        global gMoney
        while True:
            money = random.randint(0,100)
            # ``with`` releases the lock even if an exception is raised.
            with gLock:
                gMoney += money
                # Snapshot the balance while the lock is still held so the
                # message reports this deposit's result, not a value already
                # modified by another thread.
                balance = gMoney
            print("%s生产者生产了%s元钱,剩余%s元钱"%(threading.current_thread(),money,balance))
            time.sleep(0.5)
    
    def consumer():
        """Endlessly try to withdraw random amounts from the shared balance.

        Each iteration picks a random integer between 0 and 100 (inclusive).
        While holding ``gLock`` it subtracts that amount from the global
        ``gMoney`` if the balance is sufficient, otherwise it only reports the
        shortfall. It then sleeps for half a second. This function never
        returns.
        """
        global gMoney
        while True:
            money = random.randint(0,100)
            # ``with`` guarantees the lock is released even if printing raises;
            # with a bare acquire()/release() pair such an error would leave the
            # lock held and block every other thread.
            with gLock:
                if gMoney >= money:
                    gMoney -= money
                    print("%s消费者消费了%s元钱,剩余%s元钱"%(threading.current_thread(),money,gMoney))
                else:
                    print("%s消费者想消费%s元钱,但是余额不足!剩余%s元钱!"%(threading.current_thread(),money,gMoney))
            time.sleep(0.5)
    
    
    
    if __name__ == '__main__':
        # The sequential (line_run) and threaded (thread_run) greeting demos are
        # left disabled; only the producer/consumer demo is started here.
        # Five ``produter`` threads are started first, then five ``consumer`` threads.
        for worker in (produter, consumer):
            for _ in range(5):
                threading.Thread(target=worker).start()
  • 相关阅读:
    【数据库_Postgresql】实体类映射问题之不执行sql语句
    【数据库_Postgresql】数据库主键自增长之加序列和不加序列2种方法
    【明哥报错簿】之 mybatis异常invalid comparison: java.util.Date and java.lang.String
    【明哥报错簿】可以访问jsp但是访问不到controller
    【明哥报错簿】tomcat 安装时出现 Failed to install Tomcat7 service
    【Java】SVN下载maven项目到eclipse之后,项目红叉,pom.xml出现Missing artifact fakepath:dubbo:jar:2.8.5等缺少jar包情况
    Mysql学习笔记之常用数据类型 (转)
    MySQL--INFORMATION_SCHEMA COLUMNS表
    mysql int(3)与int(11)的区别
    mysql中utf8_bin、utf8_general_ci、utf8_general_cs编码区别
  • 原文地址:https://www.cnblogs.com/yszr/p/14981655.html
Copyright © 2011-2022 走看看