zoukankan      html  css  js  c++  java
  • 爬取图片(二)

    源码:

     1 import requests
     2 from lxml import etree
     3 import os
     4 
     5 
     6 # 获取图集地址
     def get_url(page, headers, timeout=10):
         """Return the gallery links listed on one index page of the site.

         Fetches ``http://www.mzitu.com/page/<page>/`` and reads every ``<li>``
         under ``<ul id="pins">``.

         Args:
             page: Index page number substituted into the listing URL.
             headers: HTTP headers passed through to ``requests.get``.
             timeout: Seconds to wait for the server before giving up.

         Returns:
             A list of ``(url, name)`` tuples, one per list item that carries
             a ``span/a`` link: the link's ``href`` and the link's text.

         Raises:
             requests.RequestException: If the request fails, times out, or
                 the server answers with an HTTP error status.
         """
         index_url = 'http://www.mzitu.com/page/{}/'.format(page)
         # Without a timeout a stalled connection would block the run forever.
         response = requests.get(index_url, headers=headers, timeout=timeout)
         # Fail loudly instead of parsing an error page into an empty result.
         response.raise_for_status()
         html_ele = etree.HTML(response.text)

         url_tuple_list = []
         for ele in html_ele.xpath('//ul[@id="pins"]/li'):
             hrefs = ele.xpath('./span/a/@href')
             if not hrefs:
                 # List items without a gallery link are skipped rather than
                 # aborting the whole page with an IndexError.
                 continue
             name = ele.xpath('./span/a')[0].text
             url_tuple_list.append((hrefs[0], name))
         return url_tuple_list
    19 
    20 
    21 # 下载图片
    def get_pics(url, headers, name, timeout=10):
        """Download every image of one gallery into ``www.mzitu.com/<name>/``.

        The gallery page at ``url`` is fetched to read the highest page number
        from its ``pagenavi`` bar. Each sub-page ``<url>/NN`` is then fetched,
        its main image URL extracted, and the image saved under the gallery
        folder using the last path segment of the image URL as the file name.
        Files that already exist are left untouched and not downloaded again.

        Args:
            url: Gallery URL as returned by ``get_url``.
            headers: HTTP headers passed through to ``requests.get``.
            name: Gallery title, used as the folder name.
            timeout: Seconds to wait for the server on each request.

        Raises:
            requests.RequestException: If the gallery page cannot be fetched
                or a later request fails at the connection level.
        """
        # Create the target folder. Path separators and characters that are
        # not valid in Windows file names are replaced so the title always
        # maps to exactly one directory.
        safe_name = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in name)
        dirs_name = 'www.mzitu.com/' + safe_name
        os.makedirs(dirs_name, exist_ok=True)

        # Read the highest image page number from the pagination bar.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        html_ele = etree.HTML(response.text)
        page_spans = html_ele.xpath('//div[@class="pagenavi"]/a/span')
        if len(page_spans) < 2:
            # The second-to-last span is read as the page count; without it
            # the number of sub-pages is unknown.
            print('no pagination found, skipping: ' + url)
            return
        max_page = int(page_spans[-2].text)

        # Store the images.
        for page in range(1, max_page + 1):
            # Sub-page numbers below 10 are zero-padded (01, 02, ..., 10, 11).
            url_page = '{}/{:02d}'.format(url, page)
            response = requests.get(url_page, headers=headers, timeout=timeout)
            if not response.ok:
                print('page request failed, skipping: ' + url_page)
                continue
            html_ele = etree.HTML(response.text)
            pic_urls = html_ele.xpath('//div[@class="main-image"]/p/a/img/@src')
            if not pic_urls:
                print('no image found, skipping: ' + url_page)
                continue
            pic_url = pic_urls[0]

            filename = dirs_name + '/' + pic_url.split('/')[-1]
            if os.path.exists(filename):
                # Already saved on an earlier run: skip the download itself,
                # not just the write.
                continue

            pic_response = requests.get(pic_url, headers=headers, timeout=timeout)
            if not pic_response.ok:
                # Never write an error response body out as an image file.
                print('image request failed, skipping: ' + pic_url)
                continue
            with open(filename, 'wb') as f:
                f.write(pic_response.content)
            print(filename)
    51 
    if __name__ == '__main__':
        # Every request carries a Referer for the site and a desktop Chrome
        # User-Agent string.
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
        headers = {"Referer": "http://www.mzitu.com", "User-Agent": user_agent}

        # Walk index pages 1 and 2 and download each gallery listed on them.
        for page in (1, 2):
            for url, name in get_url(page, headers):
                get_pics(url, headers, name)
  • 相关阅读:
    程序活动记录&程序调试&多线程编程
    数据结构与算法
    C/C++
    Information Retrieval --- Retrieval Comment
    Information Retrieval --- Clustering
    Information Retrieval --- Classification
    Information Retrieval --- Web Search
    Information Retrieval --- Retrieval Enforce:Relevance Feedback & Query Expansion
    Information Retrieval --- Retrieval Model
    ubuntu server 安装vnc
  • 原文地址:https://www.cnblogs.com/zhxd-python/p/9501304.html
Copyright © 2011-2022 走看看