zoukankan      html  css  js  c++  java
  • 爬天极网多线程.py

    import os
    import requests # 发送请求
    from bs4 import BeautifulSoup # 解析文本
    import re
    import threading

    # Root directory for downloads: a folder named '3' next to this script.
    base_path = os.path.dirname(os.path.abspath(__file__))
    img_path = os.path.join(base_path, '3')
    # Make sure the root exists before any worker tries to create sub-folders.
    os.makedirs(img_path, exist_ok=True)

    # Collect the list-page URLs (the original author's note: the first five
    # pages) by reading the pager links on the first list page.
    url_heard = "http://pic.yesky.com"
    url_start = "/c/6_20491_1.shtml"
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url_heard + url_start, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")
    div_obj = soup.find(name="div", attrs={"class": "flym"})
    # If the pager div is missing, fall back to crawling only the start page.
    a_list = div_obj.find_all(name='a') if div_obj is not None else []

    # The start page always comes first; pager links follow in document order.
    # Seeding the list with url_start also prevents it from being added twice
    # when the pager links back to page 1.
    href_list = [url_start]
    for a in a_list:
        href = a.get('href')
        # Skip anchors without an href (a.get returns None) and duplicates.
        if href and href not in href_list:
            href_list.append(href)

    def func(url_heard, img_path, u):
        """Download the images of every gallery linked from one list page.

        The list page at ``url_heard + u`` is fetched, and each ``<dd>`` inside
        its ``div.lb_box`` is treated as one gallery: a folder named after the
        link text is created under ``img_path``, the gallery page is fetched,
        and every ``<img>`` inside its ``div.overview`` is saved into that
        folder.

        Args:
            url_heard: Site prefix, e.g. ``"http://pic.yesky.com"``.
            img_path: Directory under which one sub-folder per gallery is made.
            u: Path of the list page, appended to ``url_heard``.

        Returns:
            None. Results are written to disk. A failed image download is
            reported with ``print`` and skipped so the rest of the gallery is
            still processed.
        """
        response1 = requests.get(url=url_heard + u)
        soup1 = BeautifulSoup(response1.text, 'html.parser')
        # The gallery entries live in this div; nothing to do if it is absent.
        div_obj1 = soup1.find(name='div', attrs={"class": "lb_box"})
        if div_obj1 is None:
            return

        for dd in div_obj1.find_all(name='dd'):  # one <dd> per gallery
            a_obj = dd.find('a')
            if a_obj is None or not a_obj.get('href'):
                continue

            # Build the folder name from the link text, replacing characters
            # that are not allowed in file names.
            title = re.sub('[/:*?"<>|]', '_', a_obj.text)
            dir_path = os.path.join(img_path, title)
            # exist_ok avoids the isdir/mkdir race between worker threads and
            # also creates img_path itself when it does not exist yet.
            os.makedirs(dir_path, exist_ok=True)

            a_response = requests.get(a_obj.get('href'))
            # Original author's note: GBK is set so Chinese text decodes.
            a_response.encoding = 'GBK'
            soup2 = BeautifulSoup(a_response.text, 'html.parser')
            div_obj2 = soup2.find(name='div', attrs={"class": "overview"})
            print(div_obj2)  # progress/debug output kept from the original
            if div_obj2 is None:
                continue

            for img in div_obj2.find_all(name='img'):
                img_src = img.get("src")
                if not img_src:
                    continue
                try:
                    # Site-specific shortcut found by the original author:
                    # swapping the thumbnail size token in the URL requests
                    # the larger rendition of the same picture.
                    img_response = requests.get(img_src.replace('113x113', '740x-'))
                    # Do not save an HTTP error page as if it were an image.
                    img_response.raise_for_status()
                    file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
                    with open(file_path, 'wb') as f:
                        f.write(img_response.content)
                except (requests.RequestException, OSError) as e:
                    # Best effort: report the failure and move to the next image.
                    print("failed to download {}: {}".format(img_src, e))
    # One worker thread per list page; threads are named "线程1", "线程2", ...
    # in the order of href_list.
    t = [
        threading.Thread(target=func, name="线程" + str(n), args=(url_heard, img_path, u))
        for n, u in enumerate(href_list, start=1)
    ]

    for i in t:
        i.start()

    # Wait explicitly for every worker so the end of the script marks the end
    # of all downloads.
    for i in t:
        i.join()

    效果如下:

  • 相关阅读:
    Maven入门
    Windows Java安装
    cdh安装spark遇到的几个BUG
    SQL Server创建存储过程——动态SQL
    IDEA搭建scala开发环境开发spark应用程序
    linux命令
    java常用 api
    缓存一致性问题
    git 命令
    nginx
  • 原文地址:https://www.cnblogs.com/zhang-da/p/12210008.html
Copyright © 2011-2022 走看看