zoukankan      html  css  js  c++  java
  • 爬取4k图片网图片

    注意更改图片保存路径（代码中 `BiAnImage.__init__` 的 save_root 参数 / 原 pictures 变量）

      1 import os
      2 import requests
      3 from lxml import etree
      4 from urllib.request import urlopen, Request
      5 import time
      6 
      7 class BiAnImage():
      8     def __init__(self):
      9         self.base_url = "http://pic.netbian.com"
     10         self.header = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"}
     11     def get_html(self, url):
     12         response = requests.get(url, self.header)
     13         if response.status_code == 200:
     14             response.encoding = response.apparent_encoding
     15             return response.text
     16         return None
     17     def get_url_1_list(self, html_1):
     18         url_1_items = []
     19         title_1_items = []
     20         x_html = etree.HTML(html_1)
     21         url_list = x_html.xpath('//div[@id="main"]/div[2]/a/@href')
     22         title_list = x_html.xpath('//div[@id="main"]/div[2]/a/text()')
     23         for url, title in zip(url_list, title_list):
     24             url_1_items.append(self.base_url + url)
     25             title_1_items.append(title)
     26         return title_1_items, url_1_items
     27     def get_url_2_list(self, html_2):
     28         url_2_items = []
     29         title_2_items = []
     30         x_html = etree.HTML(html_2)
     31         url_list = x_html.xpath('//ul[@class="clearfix"]/li/a/@href')
     32         title_list = x_html.xpath('//ul[@class="clearfix"]/li/a/b/text()')
     33         last_page = x_html.xpath('//a[text()="下一页"]/preceding-sibling::a[1]/text()')  # 直接查找下一页 => 上一个元素
     34         for url, title in zip(url_list, title_list):
     35             url_2_items.append(self.base_url + url)
     36             title_2_items.append(title)
     37         return url_2_items, title_2_items, last_page
     38     def get_image_url(self, image_html):
     39         x_image_html = etree.HTML(image_html)
     40         image_url = x_image_html.xpath('//a[@id="img"]/img/@src')
     41         return self.base_url + image_url[0]
     42     def save_image(self, save_path, image_name, image_url):
     43         req = Request(url=image_url, headers=self.header)
     44 
     45         content = urlopen(req).read()
     46         img_name = image_name.replace(' ', '') + image_url[-4:]
     47         with open(save_path + img_name, 'wb') as f:
     48             f.write(content)
     49             print(img_name, "下载完成...")
     50     def run(self):
     51         # 获取所有分类标题, 链接
     52         html = self.get_html(self.base_url)
     53         title_1_items, url_1_items = self.get_url_1_list(html)
     54         for title_1, url_1 in zip(title_1_items, url_1_items):
     55             if title_1 == "4K动漫":
     56             # if title_1 == "4K风景": TODO: 这里加一个判断就可以下载指定分类下的图片
     57                 html_2 = self.get_html(url_1)
     58                 url_2_items, title_2_items, last_page = self.get_url_2_list(html_2)
     59 
     60                 # 通过拿到分类页面中的last_page, 获取该分类下所有页面链接
     61                 for page in range(1, int(last_page[0])):
     62                     if page == 1:
     63                         more_url_1 = url_1  # more_url_1 是每个分类下每一页的链接
     64                     else:
     65                         more_url_1 = url_1 + "index_{}.html".format(page)
     66                     detail_html = self.get_html(more_url_1)
     67                     url_2_items, title_2_items, last_page = self.get_url_2_list(detail_html)
     68 
     69                     # 获取当前页面中所有图片链接
     70                     for url_2, title_2 in zip(url_2_items, title_2_items):
     71 
     72                         # print(title_1, url_1, last_page[0], more_url_1, title_2, url_2)
     73                         pictures = "C:/Users/25766/AppData/Local/Programs/Python/Python38/imgs/"
     74 
     75                         time.sleep(2)
     76                         # 在这里对下载的文件进行分类, 如果文件不存在, 就直接新建一个文件夹
     77                         if os.path.exists(pictures + title_1) is False:
     78                             os.makedirs(pictures + title_1)
     79                         save_path = pictures + title_1 + "/"
     80                         image_html = self.get_html(url_2)
     81                         img_url = self.get_image_url(image_html)
     82                         self.save_image(save_path, title_2, img_url)
     83                         #print(save_path)
     84 
     85                           # 跳出一个页面中所有图片链接
     86                      # 跳出一个分类的所有页面
     87                  # 跳出所有分类
     88 
     89 bian = BiAnImage()
     90 bian.run()
     91 
  • 相关阅读:
    使用Docker在本地搭建Hadoop分布式集群
    微博推荐 第三个map 源码
    对象
    http无状态(stateless)
    理解http的无连接
    http响应报文之首部行
    http响应报文之状态行
    http响应报文
    http请求报文之首部行
    http请求之请求数据
  • 原文地址:https://www.cnblogs.com/rstz/p/12704537.html
Copyright © 2011-2022 走看看