  • Crawling Taobao MM photos with pyspider
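    The script below is a complete pyspider project: it walks the Taobao MM index pages, follows each model's detail page to her personal page, and downloads every photo (plus a text brief) into a per-model folder. To try it, assuming pyspider is installed (e.g. pip install pyspider), start it with pyspider all, open the web UI at http://localhost:5000, create a project, and paste the script in; the fetch_type='js' crawls additionally require PhantomJS on the PATH.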

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2016-12-09 15:24:54
    # Project: taobaomm
    
    import os

    from pyspider.libs.base_handler import *
     
    PAGE_START = 1
    PAGE_END = 30
    DIR_PATH = r'D:\mzitu\mm\mm'  # output root; the backslashes were lost in the original post -- adjust to your own machine
     
     
    class Handler(BaseHandler):
        crawl_config = {
        }
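
        # Crawl flow: on_start -> index_page (index lists) -> detail_page
        # (model detail) -> domain_page (her personal page) -> save_img.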
     
        def __init__(self):
            self.base_url = 'https://mm.taobao.com/json/request_top_list.htm?page='
            self.page_num = PAGE_START
            self.total_num = PAGE_END
            self.deal = Deal()
     
        def on_start(self):
            while self.page_num <= self.total_num:
                url = self.base_url + str(self.page_num)
                self.crawl(url, callback=self.index_page, validate_cert=False)
                self.page_num += 1
     
        def index_page(self, response):
            for each in response.doc('.lady-name').items():
                self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False, fetch_type='js')
     
        def detail_page(self, response):
            domain = response.doc('.mm-p-domain-info li > span').text()
            if domain:
                page_url = 'https:' + domain
                self.crawl(page_url, callback=self.domain_page, validate_cert=False)
     
        def domain_page(self, response):
            name = response.doc('.mm-p-model-info-left-top dd > a').text()  # the model's name
            dir_path = self.deal.mkDir(name)  # create a folder named after her
            brief = response.doc('.mm-aixiu-content').text()  # the visible profile text
            if dir_path:
                imgs = response.doc('.mm-aixiu-content img').items()  # every image in the profile
                count = 1  # numbers the image files
                self.deal.saveBrief(brief, dir_path, name)  # write the profile text to "<name>.txt"
                for img in imgs:
                    url = img.attr.src
                    if url:
                        extension = self.deal.getExtension(url)  # e.g. 'jpg'
                        file_name = name + str(count) + '.' + extension
                        count += 1
                        self.crawl(url, callback=self.save_img, validate_cert=False,
                                   save={'dir_path': dir_path, 'file_name': file_name})
     
        def save_img(self, response):
            content = response.content
            dir_path = response.save['dir_path']
            file_name = response.save['file_name']
            file_path = dir_path + '/' + file_name
            self.deal.saveImg(content, file_path)
     
     
    # ---- Deal: filesystem helper (directories, image files, brief text) ----
     
    class Deal:
        def __init__(self):
            self.path = DIR_PATH
            if not self.path.endswith('/'):
                self.path = self.path + '/'
            if not os.path.exists(self.path):
                os.makedirs(self.path)
     
        def mkDir(self, path):
            # Names scraped from the page go straight into a Windows path here;
            # characters such as '?' or '*' would need stripping in practice.
            path = path.strip()
            dir_path = self.path + path
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            return dir_path
     
        def saveImg(self, content, path):
            with open(path, 'wb') as f:
                f.write(content)
     
        def saveBrief(self, content, dir_path, name):
            file_name = dir_path + "/" + name + ".txt"
            # Write with an explicit encoding; the original encoded the text to
            # bytes and wrote them to a text-mode file, which fails on Python 3.
            with open(file_name, "w", encoding="utf-8") as f:
                f.write(content)
     
        def getExtension(self, url):
            extension = url.split('.')[-1]
            return extension
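
    One fragility worth noting: getExtension simply splits the URL on '.', so an image URL with a query string (e.g. photo.jpg?spm=1) yields a garbage extension. A more defensive variant, as a sketch (the get_extension name and default parameter are mine, not from the original post):

        from urllib.parse import urlparse
        import os.path

        def get_extension(url, default='jpg'):
            """Extension of the URL's path, ignoring any query string."""
            path = urlparse(url).path          # drops '?...' and '#...'
            ext = os.path.splitext(path)[1]    # '.jpg' or ''
            return ext.lstrip('.') or default  # fall back if the path has no suffix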
  • Original article: https://www.cnblogs.com/tangbinghaochi/p/6150149.html