zoukankan      html  css  js  c++  java
  • 爬取下载图片,代码写得也是难看死,有时间优化吧

    # -*-coding:utf-8-*-
    from lxml import etree
    import requests
    import sys, os, re


    class Meizitu(object):
        """Scrape one listing page of meizitu.com and download its images.

        Workflow (see ``run``): fetch a listing page, extract the detail-page
        URLs with a regular expression, collect every ``<img src>`` value found
        on those detail pages, then save each image as ``images/<n>.jpg``.

        Attributes:
            i: Sequence number used as the file name of the next saved image.
            url: Listing-page URL; holds the template until ``run`` fills in
                the page number.
            headers: HTTP headers sent with every request.
            pattern: Compiled regex matching detail-page URLs.
            img_url: Ordered, de-duplicated list of image URLs to download.
        """

        # Listing-page URL template; ``{}`` is replaced by the page number.
        URL_TEMPLATE = 'http://www.meizitu.com/a/more_{}.html'
        # Seconds to wait for a response. requests has no default timeout, so
        # without this a stalled server would block the scraper forever.
        TIMEOUT = 10
        # Directory (relative to the working directory) that receives images.
        IMAGE_DIR = 'images'

        def __init__(self):
            # Images are named by a running counter starting at 1.
            self.i = 1
            self.url = self.URL_TEMPLATE
            # Browser-like User-Agent sent with every request.
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
            }
            # Detail pages look like http://www.meizitu.com/a/<digits>.html.
            # Raw string with ``\d`` and escaped dots: the previous pattern
            # ('a/d+.html') matched the letter "d", not digits.
            self.pattern = re.compile(r'http://www\.meizitu\.com/a/\d+\.html')
            self.img_url = []

        def get_page(self, url):
            """Fetch ``url`` and return the raw response body as bytes.

            Raises:
                requests.RequestException: on connection failure, timeout,
                    an invalid URL, or an HTTP error status (4xx/5xx).
            """
            response = requests.get(url, headers=self.headers, timeout=self.TIMEOUT)
            # Fail loudly on 4xx/5xx instead of handing an error page to the
            # caller (which would otherwise be parsed or saved as an image).
            response.raise_for_status()
            return response.content

        def parse_data(self, detail_data):
            """Return every detail-page URL found in a listing page.

            Args:
                detail_data: Raw listing-page bytes; decoded as UTF-8 with
                    undecodable bytes dropped.

            Returns:
                List of matching URL strings in document order (duplicates
                are kept).
            """
            return self.pattern.findall(detail_data.decode('utf-8', 'ignore'))

        def pic_info(self, image_list):
            """Collect image URLs from the given detail pages into ``self.img_url``.

            Args:
                image_list: Iterable of detail-page URLs.

            Returns:
                List of the image URLs newly added to ``self.img_url`` by this
                call, in discovery order.
            """
            # Set mirror of self.img_url so the duplicate check is O(1).
            seen = set(self.img_url)
            new_urls = []
            # dict.fromkeys drops repeated page URLs while keeping their order,
            # so the same detail page is not fetched more than once.
            for page_url in dict.fromkeys(image_list):
                try:
                    img_page = self.get_page(page_url)
                except requests.RequestException as exc:
                    # One unreachable page should not abort the whole crawl.
                    print('skip page {}: {}'.format(page_url, exc))
                    continue
                html = etree.HTML(img_page)
                if html is None:
                    continue
                for src in html.xpath('//img/@src'):
                    if src not in seen:
                        seen.add(src)
                        self.img_url.append(src)
                        new_urls.append(src)
            return new_urls

        def download(self):
            """Download every URL in ``self.img_url`` to ``images/<n>.jpg``.

            ``self.i`` supplies ``<n>`` and is advanced once per saved file.
            URLs that cannot be fetched are reported and skipped.
            """
            os.makedirs(self.IMAGE_DIR, exist_ok=True)

            for url in self.img_url:
                print(url)
                try:
                    data = self.get_page(url)
                except requests.RequestException as exc:
                    # src values are stored verbatim, so some may not be
                    # fetchable (e.g. relative paths); skip those instead of
                    # aborting the remaining downloads.
                    print('skip image {}: {}'.format(url, exc))
                    continue
                path = os.path.join(self.IMAGE_DIR, '{}.jpg'.format(self.i))
                with open(path, 'wb') as f:
                    f.write(data)
                self.i += 1

        def run(self):
            """Prompt for a page number, then scrape and download that page."""
            # Strip stray whitespace so it does not end up inside the URL.
            page = input('输入页码').strip()
            self.url = self.URL_TEMPLATE.format(page)
            print(self.url)
            detail_data = self.get_page(self.url)
            # Extract the detail-page URLs from the listing page.
            image_list = self.parse_data(detail_data)
            self.pic_info(image_list)
            self.download()


    # Script entry point: build the scraper and run one interactive session
    # (prompts for a page number, then downloads that page's images).
    if __name__ == '__main__':
        meizi = Meizitu()
        meizi.run()
  • 相关阅读:
    堆的实现(图片演示+文字讲解)
    数据结构之堆的插入、取值、排序(细致讲解+图片演示)
    插入排序
    (简单易懂)Java的快速失败(fail-fast)与安全失败,源码分析+详细讲解
    Eclipse安装Hibernate插件快速生成配置文件
    Hibernate级联操作解密(inverse和cascade)
    http协议详解
    javaweb中的关于编码问题总结
    Hyperparameter tuning
    win10修改jupyter notebook默认路径
  • 原文地址:https://www.cnblogs.com/jianxiaoguo/p/7646907.html
Copyright © 2011-2022 走看看