zoukankan      html  css  js  c++  java
  • python---爬取图片数千张(分门别类)

    目标网址:http://www.netbian.com/

    代码(待优化)如下:

    #
    # date:2021/1/14
    # author:eihouwang
    # 1.获取分类标签,形成标签网址列表
    # 2.单个标签网址单页图片
    # 3.单个标签全部页图片
    # 4.获取全部标签,全部图片

    import re
    from urllib.parse import urljoin
    import requests
    from bs4 import BeautifulSoup
    import os
    import time

    # Site root; every category URL is resolved relative to it.
    url = "http://www.netbian.com"
    # Browser-like User-Agent sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 "
                      "Safari/537.36 SE 2.X MetaSr 1.0"}
    # Download folders known so far, one per category, each ending with a
    # backslash so a file name can be appended directly.
    dirnames = []
    # Seed dirnames with the sub-folders already present under the download
    # root. Backslashes are escaped so the literals are valid Python, and a
    # missing root simply yields an empty listing instead of an exception.
    dirs = os.listdir('E:\\czxt\\') if os.path.isdir('E:\\czxt\\') else []
    for x in dirs:
        # Plain files cannot serve as download folders, so only directories count.
        if os.path.isdir('E:\\czxt\\' + x):
            dirnames.append('E:\\czxt\\' + x + '\\')
    #print(dirnames)

    #请求网页
    def get_html(url):
        """Fetch *url* and return the response body as text.

        The request is sent with the module-level ``headers``. A timeout is
        set so that a stalled connection cannot block the crawl forever;
        exceptions raised by ``requests`` (including the timeout) propagate
        to the caller.

        Returns:
            The page text when the status code is 200, otherwise None.
        """
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code != 200:
            return None
        return r.text

    #获取分类标签列表
    def get_lable_url_list(url):
        """Collect the category (label) URLs from the page at *url*.

        For every category link a download folder named after the second to
        last path segment of the link is created under ``E:\\czxt\\`` and
        recorded in the module-level ``dirnames`` list.

        Returns:
            A tuple ``(label_urls, folders)``: the sorted category URLs and a
            sorted copy of ``dirnames``. Both are empty/unchanged when the
            page could not be fetched.
        """
        html = get_html(url)
        if html is None:
            # Page unavailable: there is nothing to parse.
            return [], sorted(dirnames)

        soup = BeautifulSoup(html, 'html.parser')
        list_url_lable = []
        # The original code takes anchors 3..27 of the page as the category
        # links (presumably tied to the current site layout - confirm if the
        # site changes). Slicing keeps that window without raising when the
        # page has fewer anchors.
        for anchor in soup.findAll('a')[3:28]:
            href = anchor.get('href')
            if not href:
                continue
            url_new = urljoin(url, href)
            try:
                path = "E:\\czxt\\" + url_new.split('/')[-2] + "\\"
                # makedirs also creates a missing download root and accepts
                # folders left over from an earlier run.
                os.makedirs(path, exist_ok=True)
            except (IndexError, OSError):
                # No usable folder for this link; the URL is still returned,
                # as in the original best-effort behaviour.
                pass
            else:
                if path not in dirnames:
                    dirnames.append(path)
            list_url_lable.append(url_new)
        return sorted(list_url_lable), sorted(dirnames)

    #获取单页图片
    def get_one_page(url, k=0):
        """Download the images referenced on the page at *url*.

        Files are written into the k-th entry of the sorted module-level
        ``dirnames`` list. Images that cannot be resolved, fetched or written
        are skipped so that one bad image does not stop the page.

        Args:
            url: Address of the listing page to scan for ``<img>`` tags.
            k: Index into ``sorted(dirnames)`` selecting the target folder.
        """
        html = get_html(url)
        if html is None:
            return
        try:
            target_dir = sorted(dirnames)[k]
        except IndexError:
            print('no download folder for index {}'.format(k))
            return

        soup = BeautifulSoup(html, 'html.parser')
        for img in soup.findAll('img'):
            src = img.get('src')
            if not src:
                continue
            if len(src) >= 89:
                # Long sources are treated as thumbnail URLs: the part before
                # 'small' plus the first 32 characters after it (up to the
                # first dot) form the image URL. The 89-character threshold
                # comes from the original code - presumably matched to the
                # site's thumbnail naming; confirm if downloads go missing.
                pieces = src.split('small')
                if len(pieces) < 2:
                    continue
                file_name = pieces[1].split('.')[0][:32] + ".jpg"
                pic_url = pieces[0] + file_name
            else:
                pic_url = src
                file_name = src.split('/')[-1]
            path = target_dir + file_name
            try:
                r1 = requests.get(pic_url, headers=headers, timeout=10)
                if r1.status_code != 200:
                    # Do not save an error page under an image name.
                    continue
                print(path)
                store_one_page(path, r1.content)
            except (requests.RequestException, OSError) as err:
                print('skipped {}: {}'.format(pic_url, err))


    #存储图片
    def store_one_page(path, content):
        """Write the bytes *content* to the file at *path*.

        The file is opened in binary write mode, so an existing file is
        replaced. The ``with`` block closes the file; no explicit close is
        needed.
        """
        with open(path, 'wb') as f:
            f.write(content)

    #获取单标签网页列表
    def get_onelable_url_list(url):
        """Build the list of page URLs for one category.

        The total page count is read from the first match of the pagination
        pattern in the category's first page. Page 1 is *url* itself; page
        ``i`` (i > 1) is ``url + 'index_<i>.htm'``.

        Returns:
            The page URLs in order, or an empty list when the page could not
            be fetched or no page count was found.
        """
        html = get_html(url)
        if html is None:
            return []
        # Raw string so that \d reaches the regex engine as "digit".
        match = re.search(r'/span(.*?)>(\d+)</a(.*?)class', html)
        if match is None:
            return []
        totalpages = int(match.group(2))  # total number of pages under this label
        return [url if i == 1 else url + 'index_' + str(i) + '.htm'
                for i in range(1, totalpages + 1)]


    #lable_url_list,dirnames=get_lable_url_list(url)
    # get_onelable_url_list(url)
    #get_one_page("http://www.netbian.com/rili/")
    def main():
        """Crawl every category: list its pages, then download each page's images.

        The position of a category in the sorted URL list is passed to
        get_one_page as ``k`` to select the download folder.
        """
        lable_url_list, label_dirs = get_lable_url_list(url)
        print(label_dirs)
        for k, lable_url in enumerate(lable_url_list):
            print('正方访问{}---分类标签网址{}'.format(k, lable_url))
            for j in get_onelable_url_list(lable_url):
                print('正在获({})页资源'.format(j))
                get_one_page(j, k=k)
                print('获取({})页资源完毕'.format(j))


    # Run the crawl only when executed as a script, not when imported.
    if __name__ == "__main__":
        main()
  • 相关阅读:
    基于log4net的帮助类Log
    log4Net不能成功生成日志问题(关于配置错误)
    js 时间构造函数
    启动调试IIS时,vs无法在 Web 服务器上启动调试。Web 服务器未能找到请求的资源。 有关详细信息,请单击“帮助”。
    XmlException: 名称不能以“<”字符(十六进制值 0x3C)开头
    poj 3040 Allowance
    1144 数星星 (树状数组)
    18121 排排坐看电影
    18124 N皇后问题
    18025 小明的密码
  • 原文地址:https://www.cnblogs.com/eihouwang/p/14287214.html
Copyright © 2011-2022 走看看