zoukankan      html  css  js  c++  java
  • 爬取下载图片,代码写得也是难看死,有时间优化吧

    # -*-coding:utf-8-*-
    from lxml import etree
    import requests
    import sys, os, re


    class Meizitu(object):
        """Download the images linked from one meizitu.com listing page.

        Workflow (see ``run``): fetch the listing page, extract the detail-page
        URLs from it, collect every ``<img src>`` found on those detail pages,
        then save each image under ``images/`` as ``1.jpg``, ``2.jpg``, ...
        """

        def __init__(self):
            # Running counter; its value is the file name of the next saved image.
            self.i = 1
            # Listing-page URL template; ``run`` overwrites it with the formatted URL.
            self.url = 'http://www.meizitu.com/a/more_{}.html'
            # Request headers sent with every HTTP request.
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
            }
            # Matches detail-page URLs such as http://www.meizitu.com/a/5585.html.
            # Raw string with an explicit ``\d`` and escaped dots: without the
            # backslash, ``d+`` only matches literal "d" characters and no
            # numeric page URL is ever found.
            self.pattern = re.compile(r'http://www\.meizitu\.com/a/\d+\.html')
            # Image URLs collected so far, unique and in discovery order.
            self.img_url = []

        def get_page(self, url, timeout=10):
            """Fetch *url* and return the raw response body as bytes.

            Args:
                url: Address to request.
                timeout: Seconds to wait for the server before requests raises
                    a timeout error; prevents a stalled connection from
                    hanging the whole run.
            """
            response = requests.get(url, headers=self.headers, timeout=timeout)
            return response.content

        def parse_data(self, detail_data):
            """Return every detail-page URL found in the listing page.

            Args:
                detail_data: Listing page body as bytes.

            Returns:
                List of matched URL strings, in page order (duplicates kept).
            """
            # Undecodable bytes are dropped; the URLs themselves are plain ASCII.
            page_text = detail_data.decode('utf-8', 'ignore')
            return self.pattern.findall(page_text)

        def pic_info(self, image_list):
            """Collect the image URLs found on each detail page.

            Every ``<img src>`` value not seen before is appended to
            ``self.img_url``.

            Args:
                image_list: Iterable of detail-page URLs.
            """
            # Set mirror of self.img_url for O(1) duplicate checks.
            seen = set(self.img_url)
            # dict.fromkeys drops repeated page URLs while keeping their order,
            # so each detail page is requested only once.
            for url in dict.fromkeys(image_list):
                html = etree.HTML(self.get_page(url))
                if html is None:
                    continue
                for src in html.xpath('//img/@src'):
                    if src not in seen:
                        seen.add(src)
                        self.img_url.append(src)

        def download(self):
            """Save every URL in ``self.img_url`` to ``images/<n>.jpg``."""
            if not os.path.exists('images'):
                os.makedirs('images')

            for url in self.img_url:
                print(url)
                data = self.get_page(url)
                # Write-only binary mode: the file is never read back here.
                with open(os.path.join('images', str(self.i) + '.jpg'), 'wb') as f:
                    f.write(data)
                self.i = self.i + 1

        def run(self):
            """Prompt for a page number, then scrape and download that page."""
            page = input('输入页码')
            self.url = 'http://www.meizitu.com/a/more_{}.html'.format(page)
            print(self.url)
            detail_data = self.get_page(self.url)
            # Extract the detail-page URLs, then the image URLs they contain.
            image_list = self.parse_data(detail_data)
            self.pic_info(image_list)
            self.download()


    if __name__ == '__main__':
        # Script entry point: build the scraper and start the interactive run.
        Meizitu().run()
  • 相关阅读:
    multidownloadXkcd 多线程抓图
    51job_selenium测试2
    51job_selenium测试
    python爬虫 前程无忧网页抓取
    化工pdf下载
    Velocity写法注意
    Velocity中文乱码问题解决方法
    velcoity使用说明:foreach指令
    strults2标签s:set的用法
    struts提交action乱码
  • 原文地址:https://www.cnblogs.com/jianxiaoguo/p/7646907.html
Copyright © 2011-2022 走看看