zoukankan      html  css  js  c++  java
  • 实现爬取图片

    from requests_html import HTMLSession


    # Shared HTTP session; every page and image request in this script goes through it.
    session = HTMLSession()

    # Site root that gets prepended to the paths scraped out of the pages.
    BASE_URL = 'https://www.ivsky.com'

    # 获取图片页码链接

    def get_page_url(first_page=1, last_page=4):
        """Yield the URL of each listing page in the nature-scenery category.

        The number of pages has to be known in advance; pass different bounds
        instead of editing the function when the site grows or shrinks.

        Args:
            first_page: Number of the first listing page to yield (inclusive).
            last_page: Number of the last listing page to yield (inclusive).

        Yields:
            One listing-page URL per page number, in ascending order. Nothing
            is yielded when ``last_page`` is smaller than ``first_page``.
        """
        for page_number in range(first_page, last_page + 1):
            yield 'https://www.ivsky.com/tupian/ziranfengguang/index_{}.html'.format(page_number)


    # From each album link on a listing page, collect every individual image of that album.

    # 测试:
    # r = session.get(url='https://www.ivsky.com/tupian/ziranfengguang/index_1.html')
    #
    # BASE_URL = 'https://www.ivsky.com'
    # element_list = r.html.find('.il_img a')
    # for element in element_list:
    # # print(element.attrs.get('href'))
    # a_url = BASE_URL + element.attrs.get('href')
    # print(a_url)
    # title = element.attrs.get('title')
    # # 进入到具体的图片内部
    # h = session.get(url=a_url)
    # element_list = h.html.find('.il_img img')
    # for element in element_list:
    # url = element.attrs.get('src')[15:]
    # url_detail = BASE_URL + url
    # print(url_detail)

    def get_url_page(url):
        """Download every image of every album linked from one listing page.

        Fetches ``url``, follows each ``.il_img a`` link found on it, and passes
        every ``.il_img img`` found on the linked page to ``save``, numbered in
        page order under the album's title.

        Args:
            url: Address of a listing page, e.g. one yielded by ``get_page_url``.
        """
        listing = session.get(url=url)
        for anchor in listing.html.find('.il_img a'):
            href = anchor.attrs.get('href')
            title = anchor.attrs.get('title')
            # Without a link there is nothing to follow, and the title is
            # needed for both the folder name and the file names.
            if not href or not title:
                continue

            # Open the album page to reach its individual images.
            album = session.get(url=BASE_URL + href)
            for index, image in enumerate(album.html.find('.il_img img'), start=1):
                src = image.attrs.get('src')
                if not src:
                    continue
                # The first 15 characters of src are dropped and the remainder is
                # re-rooted on BASE_URL (presumably this strips a
                # protocol-relative host prefix -- TODO confirm against the site).
                url_detail = BASE_URL + src[15:]
                name = title + '第{}张'.format(index)
                save(url_detail, title, name)

    # Files are laid out as 风景图片/<album title>/<image name>.png, so the per-image name is passed in as well.

    import os
    def save(url,title,name):
        """Download one image and store it as ``风景图片/<title>/<name>.png``.

        Args:
            url: Address of the image to download.
            title: Album title; used as the sub-directory name.
            name: File name to write, without the ``.png`` extension.
        """
        file_path = os.path.join('风景图片', title)
        # exist_ok removes the window between checking for the directory and
        # creating it, and makes repeated calls for the same album a no-op.
        os.makedirs(file_path, exist_ok=True)

        r = session.get(url=url)
        # Do not write an HTTP error body to disk under a .png name.
        if not r.ok:
            print('{}图片保存失败,状态码:{}'.format(name, r.status_code))
            return

        file_path2 = os.path.join(file_path, name + '.png')
        with open(file_path2, 'wb') as f:
            f.write(r.content)
        print('{}图片保存成功'.format(name))

    if __name__ == '__main__':
        # Script entry point: walk the listing pages one by one and download
        # everything each of them links to.
        listing_pages = get_page_url()
        for listing_url in listing_pages:
            get_url_page(listing_url)


    # A single-line progress bar could be added, but it needs the total number of images up front, which is hard to obtain.
    # from tqdm import tqdm
    # import time
    #
    # pbar = tqdm(total=100,desc='michael')
    # for i in range(100):
    # pbar.update(1)
    # time.sleep(0.05)
    # pbar.close()

  • 相关阅读:
    Ext.dataGroupingStore/JsonStore/SimpleStore
    转:LinQ操作汇总(From CSharpSamples)
    XSLT教程 比较全的
    使用ASP.Net Forms模式实现WebService身份验证
    关于DataRow的RowState和RowVersion
    C#日志工具汇总
    转 Using log4net,
    js//初始话日期
    两个数据库表的连接 查询
    ExtJS入门之三 查询
  • 原文地址:https://www.cnblogs.com/changwenjun-666/p/11323262.html
Copyright © 2011-2022 走看看