zoukankan      html  css  js  c++  java
  • 进程池爬取汽车之家.py

    import time
    import requests
    #线程池、进程池
    from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
    #多线程:
    from threading import Thread
    #多进程:
    from multiprocessing import Process
    #进程池:
    from multiprocessing import Pool
    from bs4 import BeautifulSoup
    #导入cpu_count查看CPU信息获取本机CPU核数:
    from multiprocessing import cpu_count

    def task(url):
        """Fetch one Autohome article-list page and print its first image link.

        Args:
            url: Value substituted into the listing URL
                ``https://www.autohome.com.cn/all/{}/#liststart``. The caller
                passes a page number here, not a full URL.

        Returns:
            None. The final response URL is printed, followed by the first
            image link found inside the ``auto-channel-lazyload-article`` div
            (if any).
        """
        # A timeout is required: requests waits indefinitely by default, which
        # would block a pool worker forever on a stalled connection.
        response = requests.get(
            "https://www.autohome.com.cn/all/{}/#liststart".format(url),
            timeout=10,
        )
        # The script decodes the page as GBK rather than trusting the
        # encoding requests detected.
        response.encoding = "gbk"
        soup = BeautifulSoup(response.text, "html.parser")

        print(response.url)

        container = soup.find(name="div", attrs={"id": "auto-channel-lazyload-article"})
        if container is None:
            # Page has no article container (e.g. error page or layout change);
            # nothing to extract, so stop instead of raising AttributeError.
            return

        for img in container.find_all(name="img"):
            src = img.get("src")
            if not src:
                # <img> without a usable src attribute; try the next one.
                continue
            # Protocol-relative links ("//host/path") need a scheme; anything
            # else is printed unchanged instead of being corrupted by a prefix.
            print("https:" + src if src.startswith("//") else src)
            # Only the first image link is wanted.
            break

    if __name__ == '__main__':
        # Rule of thumb from the author: size a process pool to the number of
        # CPU cores, and a thread pool to 2-5 times the number of cores.
        stat = time.time()
        # One worker process per CPU core. The with-block waits for every
        # submitted task and always shuts the pool down, even if submit raises.
        with ProcessPoolExecutor(max_workers=cpu_count()) as p:
            futures = [p.submit(task, i) for i in range(1, 110)]
        # Exceptions raised inside workers are stored on the futures; report
        # them here so failed pages are not lost silently.
        for page, future in enumerate(futures, start=1):
            error = future.exception()
            if error is not None:
                print("page %s failed: %r" % (page, error))
        print("耗时:%s" %(time.time() - stat))
  • 相关阅读:
    函数及习题
    数组和集合
    数组和集合实例
    普通集合和泛型集合的区别,哈希表和字典表的区别,队列和堆栈的区别以及堆和栈的区别。
    c#时间表示
    c#正则表达式
    js正则实例
    习题实例
    c#数据类型
    简单控件
  • 原文地址:https://www.cnblogs.com/zhang-da/p/12215525.html
Copyright © 2011-2022 走看看