zoukankan      html  css  js  c++  java
  • 多线程爬虫

    threading模块

    import threading
    import time
    
    def coding():
        """Print a "writing code" message three times, pausing one second each time.

        Every message includes the thread object the call is running on, as
        returned by ``threading.current_thread()``.
        """
        rounds_left = 3
        while rounds_left > 0:
            print("正在写代码%s" % threading.current_thread())
            time.sleep(1)
            rounds_left -= 1
    
    def drawing():
        """Print a "drawing" message three times, pausing one second each time.

        Every message includes the thread object the call is running on, as
        returned by ``threading.current_thread()``.
        """
        rounds_left = 3
        while rounds_left > 0:
            print("正在画画%s" % threading.current_thread())
            time.sleep(1)
            rounds_left -= 1
    
    def main():
        """Start one child thread running ``coding`` and one running ``drawing``.

        The threads are started in that order and are not joined here, so this
        function returns as soon as both have been started.
        """
        # Create and start each child thread in turn.
        for worker in (coding, drawing):
            threading.Thread(target=worker).start()
    
    
    if __name__ == '__main__':
        main()

    condition的生产者消费者模式

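    # threading.Condition does not inherit from threading.Lock; it wraps an underlying lock (an RLock by default) and exposes acquire()/release() plus wait()/notify()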
    import threading
    import random
    import time
    
    gMoney = 1000  # shared balance: producers add to it, consumers subtract from it
    gCondition = threading.Condition()  # held around every access to the shared state below
    gTimes = 0  # number of deposits the producers have made so far
    gTotalTimes = 10  # producers stop depositing once gTimes reaches this value
    
    class Producer(threading.Thread):
        """Thread that deposits random amounts into the shared ``gMoney`` balance.

        All producers together make at most ``gTotalTimes`` deposits; the shared
        counter ``gTimes`` tracks how many have been made. Access to the shared
        state is serialized through ``gCondition``.
        """

        def run(self):
            """Deposit a random amount (100-1000) once per second until the quota is used up.

            After each deposit every thread waiting on ``gCondition`` is notified
            so it can re-check the balance.
            """
            global gMoney
            global gTimes
            while True:
                money = random.randint(100, 1000)
                # The context manager releases the condition's lock on every exit
                # path (normal, ``break`` or exception).
                with gCondition:
                    if gTimes >= gTotalTimes:
                        # Wake any waiting threads one last time so they can
                        # observe that production has finished.
                        gCondition.notify_all()
                        break

                    gMoney += money
                    gTimes += 1
                    print("%s生产了%d的钱,现在总共有%d" % (threading.current_thread(), money, gMoney))
                    gCondition.notify_all()  # notify the threads blocked in wait()
                # Sleep outside the lock so other threads can run in the meantime.
                time.sleep(1)
    
    
    class Consumer(threading.Thread):
        """Thread that repeatedly spends random amounts from the shared ``gMoney`` balance.

        Access to the shared state is serialized through ``gCondition``. When the
        balance is too low the consumer waits on the condition until it is
        notified, and exits once the producers have made all ``gTotalTimes``
        deposits and the balance still cannot cover the purchase.
        """

        def run(self):
            """Spend a random amount (100-1000) once per second until funds run out for good."""
            global gMoney
            while True:
                money = random.randint(100, 1000)
                # The context manager releases the condition's lock on every exit
                # path, including the early ``return`` below.
                with gCondition:
                    while gMoney < money:
                        # Producers stop when gTimes reaches gTotalTimes, so the
                        # comparison must be ``>=``; with ``>`` this branch could
                        # never be taken and the consumer would wait forever.
                        if gTimes >= gTotalTimes:
                            return
                        print("%s,准备消费%d,剩余金额%d,不足!!!" % (threading.current_thread(), money, gMoney))
                        # wait() releases the lock while blocked and re-acquires
                        # it before returning, then the balance is re-checked.
                        gCondition.wait()

                    gMoney -= money
                    print("消费者%s,消费了%d,剩余金额%d" % (threading.current_thread(), money, gMoney))
                # Sleep outside the lock so other threads can run in the meantime.
                time.sleep(1)
    
    def main():
        """Start two ``Producer`` threads followed by three ``Consumer`` threads.

        Each thread is given a numbered name and started immediately after it is
        created; none of them is joined here.
        """
        for index in (0, 1):
            Producer(name="生产者%d" % index).start()

        for index in (0, 1, 2):
            Consumer(name='消费者%d' % index).start()
    
    
    if __name__ == '__main__':
        main()

    lock版的生产者消费者模式

    import threading
    import random
    import time
    
    gMoney = 1000  # shared balance: producers add to it, consumers subtract from it
    gLock = threading.Lock()  # held around every access to the shared state below
    gTimes = 0  # number of deposits the producers have made so far
    gTotalTimes = 10  # producers stop depositing once gTimes reaches this value
    
    class Producer(threading.Thread):
        """Thread that deposits random amounts into the shared ``gMoney`` balance.

        All producers together make at most ``gTotalTimes`` deposits; the shared
        counter ``gTimes`` tracks how many have been made. Access to the shared
        state is serialized through ``gLock``.
        """

        def run(self):
            """Deposit a random amount (100-1000) once per second until the quota is used up."""
            global gMoney
            global gTimes
            while True:
                money = random.randint(100, 1000)
                # The context manager releases the lock on every exit path
                # (normal, ``break`` or exception).
                with gLock:
                    if gTimes >= gTotalTimes:
                        # Report the configured quota rather than a hard-coded count.
                        print("已经生产了%d次, 停止生产" % gTotalTimes)
                        break
                    gMoney += money
                    gTimes += 1
                    print("%s生产了%d的钱,现在总共有%d" % (threading.current_thread(), money, gMoney))
                # Sleep outside the lock so other threads can run in the meantime.
                time.sleep(1)
    
    
    
    class Consumer(threading.Thread):
        """Thread that repeatedly spends random amounts from the shared ``gMoney`` balance.

        Access to the shared state is serialized through ``gLock``. The consumer
        polls once per second and exits when the balance cannot cover a purchase
        and the producers have already made all ``gTotalTimes`` deposits.
        """

        def run(self):
            """Try to spend a random amount (100-1000) once per second until funds run out for good."""
            global gMoney
            while True:
                money = random.randint(100, 1000)
                # The context manager releases the lock exactly once on every
                # exit path, including the ``break`` below.
                with gLock:
                    if gMoney >= money:
                        gMoney -= money
                        print("消费者%s,消费了%d,还剩有%d" % (threading.current_thread(), money, gMoney))
                    elif gTimes >= gTotalTimes:
                        # No more deposits are coming, so this purchase can never succeed.
                        break
                    else:
                        print("余额不足,当前金额是%d, 需要消费的金额是%d" % (gMoney, money))
                # Sleep outside the lock so other threads can run in the meantime.
                time.sleep(1)
    
    def main():
        """Start two ``Producer`` threads followed by three ``Consumer`` threads.

        Each thread is given a numbered name and started immediately after it is
        created; none of them is joined here.
        """
        for index in (0, 1):
            Producer(name="生产者%d" % index).start()

        for index in (0, 1, 2):
            Consumer(name='消费者%d' % index).start()
    
    
    if __name__ == '__main__':
        main()

    queue的线程安全

    from queue import Queue
    import time
    import threading
    
    
    # q.put(2)
    # q.put(1)
    # q.put(3)
    #
    # print(q.qsize())
    # print(q.full())
    # print(q.empty())
    # print(q.get())
    
    def set_value(q):
        """Put an ever-increasing integer (0, 1, 2, ...) into ``q`` every three seconds.

        Runs forever; ``q.put`` is called without a timeout.
        """
        counter = 0
        while True:
            q.put(counter)
            counter = counter + 1
            time.sleep(3)
    
    def get_value(q):
        """Take items from ``q`` one at a time, forever, and print each one.

        ``q.get`` is called without a timeout, so the loop blocks until an item
        is available.
        """
        while True:
            item = q.get()
            print(item)
    
    def main():
        """Create a queue of capacity 4 and start a producer and a consumer thread on it.

        Both thread objects are created first and then started, ``set_value``
        before ``get_value``; neither is joined here.
        """
        q = Queue(4)
        workers = [
            threading.Thread(target=func, args=[q])
            for func in (set_value, get_value)
        ]
        for worker in workers:
            worker.start()
    
    if __name__ == '__main__':
        main()

    继承threading.Thread类实现多线程

    import threading
    import time
    
    class CodingThread(threading.Thread):
        """Thread whose ``run`` prints a "writing code" message three times."""

        def run(self):
            """Print the message, then sleep one second; repeat three times."""
            rounds_left = 3
            while rounds_left > 0:
                print("正在写代码%s" % threading.current_thread())
                time.sleep(1)
                rounds_left -= 1
    
    class DrawingThread(threading.Thread):
        """Thread whose ``run`` prints a "drawing" message three times."""

        def run(self):
            """Print the message, then sleep one second; repeat three times."""
            rounds_left = 3
            while rounds_left > 0:
                print("正在画画%s" % threading.current_thread())
                time.sleep(1)
                rounds_left -= 1
    
    
    def main():
        """Start a ``CodingThread`` and then a ``DrawingThread``.

        Each thread is started right after it is created and is not joined here.
        """
        # Create and start each child thread in turn.
        for thread_cls in (CodingThread, DrawingThread):
            thread_cls().start()
    
    
    if __name__ == '__main__':
        main()

    selenium关闭页面和浏览器

    from selenium import webdriver
    import time

    # Location of the local chromedriver binary. The directory separators are
    # required; the raw string keeps the backslashes literal.
    driver_path = r"G:\Crawler and Data\chromedriver.exe"
    driver = webdriver.Chrome(executable_path=driver_path)
    driver.get('https://www.baidu.com/')

    # Look the search box up by its id and type a query into it.
    inputTag = driver.find_element_by_id('kw')
    inputTag.send_keys('python')
    time.sleep(3)  # keep the page visible for a moment before closing it


    driver.close() # close the current page
    # driver.quit() # close the whole browser

    selenium页面等待

    from selenium import webdriver
    import time
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By

    # Location of the local chromedriver binary. The directory separators are
    # required; the raw string keeps the backslashes literal.
    driver_path = r"G:\Crawler and Data\chromedriver.exe"
    driver = webdriver.Chrome(executable_path=driver_path)
    driver.get('https://www.baidu.com/')

    # Wait up to 10 seconds for an element with the given id to be present.
    # The id used here is deliberately wrong, so the wait times out after
    # 10 seconds and raises an error.
    # With a correct id the call returns as soon as the element is found,
    # without waiting the full 10 seconds.
    WebDriverWait(driver,10).until(
        EC.presence_of_element_located((By.ID,'shjdkah'))
    )

    selenium打开多个页面和页面间的切换

    from selenium import webdriver
    import time

    # Location of the local chromedriver binary. The directory separators are
    # required; the raw string keeps the backslashes literal.
    driver_path = r"G:\Crawler and Data\chromedriver.exe"
    driver = webdriver.Chrome(executable_path=driver_path)
    driver.get('https://www.baidu.com/')

    # Open Douban in a new window.
    driver.execute_script('window.open("https://www.douban.com/")')
    # The driver itself is still attached to the Baidu page at this point.
    print(driver.current_url)
    print(driver.window_handles)  # window handles: shows which windows are currently open
    driver.switch_to.window(driver.window_handles[1]) # switch to the new window
    print(driver.current_url)
    driver.close() # close the current page
    driver.switch_to.window(driver.window_handles[0]) # switch back to the first window

    多线程共享全局变量

    import threading
    
    VALUE = 0  # shared counter incremented by every worker thread
    gLock = threading.Lock() # create the lock that serializes updates to VALUE
    
    def add_value():
        """Add 1,000,000 to the shared ``VALUE`` counter and print the resulting total.

        The whole increment loop runs while holding ``gLock``, so concurrent
        callers do not interleave their updates.
        """
        global VALUE
        # The context manager releases the lock even if the loop raises.
        with gLock:
            for _ in range(1000000):
                VALUE += 1
            # Read the total while still holding the lock; reading it after
            # release could pick up another thread's partial progress.
            total = VALUE
        print("value,%d" % total)
    
    def main():
        """Start two threads that each run ``add_value``.

        Each thread is started right after it is created and is not joined here.
        """
        for _ in (0, 1):
            threading.Thread(target=add_value).start()
    
    if __name__ == '__main__':
        main()

    selenium设置代理ip

    from selenium import webdriver


    options = webdriver.ChromeOptions()
    # Configure the proxy. Chrome expects the switch in the form
    # ``--proxy-server=<scheme>://<host>:<port>``; without the ``=`` the
    # argument is not recognized as the proxy-server switch.
    options.add_argument("--proxy-server=http://1.197.203.158:9999")

    # Location of the local chromedriver binary. The directory separators are
    # required; the raw string keeps the backslashes literal.
    driver_path = r"G:\Crawler and Data\chromedriver.exe"
    driver = webdriver.Chrome(executable_path=driver_path,chrome_options=options)
    # httpbin echoes the caller's IP, which shows whether the proxy is in use.
    driver.get('http://httpbin.org/ip')
  • 相关阅读:
    wxPython
    IT从业者职业规划
    成功开发iPhone软件的10个步骤
    开源认识:Jumony
    一个GG/MM的彩色验证码图片(C#)
    IT从业者学习规划
    为某一个对象动态添加属性
    .net 动态加载css与js
    文本框等css
    博客园配合得很好的代码插件
  • 原文地址:https://www.cnblogs.com/kenD/p/11123543.html
Copyright © 2011-2022 走看看