zoukankan      html  css  js  c++  java
  • 实现爬取图片

    from requests_html import HTMLSession


    # One HTMLSession created at module level; every request in this script goes through it.
    session = HTMLSession()

    # Prefix prepended to the href/src paths scraped from the pages.
    BASE_URL = 'https://www.ivsky.com'

    # 获取图片页码链接

    def get_page_url(start=1, stop=5):
        """Yield the URL of each listing page of the gallery.

        The total number of pages has to be known in advance; pass the
        range you want instead of editing the function body.

        Args:
            start: Number of the first listing page (inclusive).
            stop: Page number to stop at (exclusive), as with ``range``.
                The defaults reproduce the original pages 1 through 4.

        Yields:
            One ``index_<n>.html`` URL for each ``n`` in ``range(start, stop)``.
        """
        template = 'https://www.ivsky.com/tupian/ziranfengguang/index_{}.html'
        for page_number in range(start, stop):
            yield template.format(page_number)


    # From a listing page, collect each album link, then every image inside that album.

    # 测试:
    # r = session.get(url='https://www.ivsky.com/tupian/ziranfengguang/index_1.html')
    #
    # BASE_URL = 'https://www.ivsky.com'
    # element_list = r.html.find('.il_img a')
    # for element in element_list:
    # # print(element.attrs.get('href'))
    # a_url = BASE_URL + element.attrs.get('href')
    # print(a_url)
    # title = element.attrs.get('title')
    # # 进入到具体的图片内部
    # h = session.get(url=a_url)
    # element_list = h.html.find('.il_img img')
    # for element in element_list:
    # url = element.attrs.get('src')[15:]
    # url_detail = BASE_URL + url
    # print(url_detail)

    def get_url_page(url):
        """Download every image of every album linked from one listing page.

        Fetches ``url``, follows each ``.il_img a`` link found on it, and passes
        every ``.il_img img`` found on the linked page to ``save``.

        Args:
            url: URL of a listing page.
        """
        listing = session.get(url=url)
        for link in listing.html.find('.il_img a'):
            href = link.attrs.get('href')
            title = link.attrs.get('title')
            if href is None or title is None:
                # No target to follow or no folder name to save under: skip
                # this link instead of failing on None concatenation.
                continue

            # Open the linked album page to reach its individual images.
            album = session.get(url=BASE_URL + href)
            for position, image in enumerate(album.html.find('.il_img img'), start=1):
                src = image.attrs.get('src')
                if src is None:
                    continue
                # The first 15 characters of src are dropped and the rest is
                # re-rooted on BASE_URL.
                # NOTE(review): presumably this strips a host prefix from src - confirm.
                url_detail = BASE_URL + src[15:]
                # The title plus a 1-based counter keeps file names unique per album.
                name = title + '第{}张'.format(position)
                save(url_detail, title, name)

    # 文件夹下的名字有所区别 即风景图片/具体的标题/单个的图片.png,因此传name字段过去

    import os
    def save(url, title, name):
        """Download ``url`` and write it to ``风景图片/<title>/<name>.png``.

        Args:
            url: Address of the image to download.
            title: Album title, used as the sub-folder name.
            name: File name without the ``.png`` extension.
        """
        base_url = '风景图片'
        file_path = os.path.join(base_url, title)
        # exist_ok=True replaces the separate exists() check, so an already
        # present folder is not an error and there is no check-then-create gap.
        os.makedirs(file_path, exist_ok=True)

        file_path2 = os.path.join(file_path, name + '.png')
        r = session.get(url=url)

        with open(file_path2, 'wb') as f:
            f.write(r.content)
        print('{}图片保存成功'.format(name))

    if __name__ == '__main__':
        # Script entry point: walk the listing pages one by one and download
        # everything each of them links to.
        for listing_url in get_page_url():
            get_url_page(listing_url)


    # A single-line progress bar (e.g. tqdm) could be added, but it needs the total image count up front, which is hard to obtain.
    # from tqdm import tqdm
    # import time
    #
    # pbar = tqdm(total=100,desc='michael')
    # for i in range(100):
    # pbar.update(1)
    # time.sleep(0.05)
    # pbar.close()

  • 相关阅读:
    20170612测试
    vijos1453曼哈顿距离
    vijos1153 猫狗大战
    vijos1037搭建双塔
    dijkstra+priority_queue+vector
    BZOJ1507: [NOI2003]Editor
    dinic模板
    旅行-树形DP
    51nod1799-二分答案
    51nod1791-合法括号子段
  • 原文地址:https://www.cnblogs.com/changwenjun-666/p/11323262.html
Copyright © 2011-2022 走看看