zoukankan      html  css  js  c++  java
  • 用pyspider爬淘宝MM照片

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2016-12-09 15:24:54
    # Project: taobaomm
    
    from pyspider.libs.base_handler import *
     
    # First and last listing-page numbers to crawl (both ends are included).
    PAGE_START = 1
    PAGE_END = 30
    # Base directory under which one folder per model is created.
    # NOTE(review): the value has no path separators after 'D:' -- they may have
    # been lost (e.g. stripped backslashes) when this listing was published;
    # confirm the intended path before running.
    DIR_PATH = 'D:mzitummmm'
     
     
    class Handler(BaseHandler):
        """pyspider handler that saves model photos and profile text from mm.taobao.com.

        Crawl flow: ``on_start`` -> ``index_page`` -> ``detail_page`` ->
        ``domain_page`` -> ``save_img``.
        """

        crawl_config = {
        }

        def __init__(self):
            # Listing endpoint; the page number is appended to it.
            self.base_url = 'https://mm.taobao.com/json/request_top_list.htm?page='
            self.page_num = PAGE_START
            self.total_num = PAGE_END
            # File-system helper used to create folders and write files.
            self.deal = Deal()

        def on_start(self):
            """Queue every listing page from ``page_num`` to ``total_num`` inclusive."""
            # Iterate with a local counter instead of advancing self.page_num:
            # mutating the attribute would leave it past total_num, so a later
            # on_start call on the same handler instance would queue nothing.
            for page in range(self.page_num, self.total_num + 1):
                self.crawl(self.base_url + str(page),
                           callback=self.index_page, validate_cert=False)

        def index_page(self, response):
            """Follow every ``.lady-name`` link on a listing page."""
            for each in response.doc('.lady-name').items():
                # The detail page is fetched with fetch_type='js' (pyspider's
                # JavaScript fetcher) rather than the plain HTTP fetcher.
                self.crawl(each.attr.href, callback=self.detail_page,
                           validate_cert=False, fetch_type='js')

        def detail_page(self, response):
            """Crawl the address shown in the page's ``.mm-p-domain-info`` element."""
            domain = response.doc('.mm-p-domain-info li > span').text()
            if domain:
                # 'https:' is prepended, so the text is presumably a
                # scheme-relative URL ('//host/path') -- confirm against the page.
                page_url = 'https:' + domain
                self.crawl(page_url, callback=self.domain_page, validate_cert=False)

        def domain_page(self, response):
            """Save the page's text and queue a download for each of its images.

            Creates a folder named after the model, writes the text of
            ``.mm-aixiu-content`` to ``<name>.txt`` inside it, and schedules
            ``save_img`` for every ``<img>`` in that element that has a ``src``.
            Images are named ``<name><n>.<extension>`` with ``n`` starting at 1.
            Pages on which no model name is found are skipped.
            """
            name = response.doc('.mm-p-model-info-left-top dd > a').text()  # model name
            if not name:
                # Without a name every file would land directly in the base
                # directory under colliding names ('.txt', '1.jpg', ...).
                return

            dir_path = self.deal.mkDir(name)  # per-model folder
            brief = response.doc('.mm-aixiu-content').text()  # visible text content
            if not dir_path:
                return

            self.deal.saveBrief(brief, dir_path, name)

            # Keep only images that actually carry a src, then number them from 1.
            sources = (img.attr.src for img in response.doc('.mm-aixiu-content img').items())
            urls = [src for src in sources if src]
            for count, url in enumerate(urls, 1):
                extension = self.deal.getExtension(url)  # e.g. 'jpg'
                file_name = name + str(count) + '.' + extension
                # 'save' carries the target location through to the callback.
                self.crawl(url, callback=self.save_img, validate_cert=False,
                           save={'dir_path': dir_path, 'file_name': file_name})

        def save_img(self, response):
            """Write a downloaded image to the location chosen in ``domain_page``."""
            saved = response.save
            file_path = saved['dir_path'] + '/' + saved['file_name']
            self.deal.saveImg(response.content, file_path)
     
     
    import os
     
    class Deal:
        """File-system helper: creates folders and writes image and text files."""

        def __init__(self):
            # Keep the base directory with a trailing '/' so that folder names
            # can be appended to it directly.
            self.path = DIR_PATH
            if not self.path.endswith('/'):
                self.path = self.path + '/'
            if not os.path.exists(self.path):
                os.makedirs(self.path)

        def mkDir(self, path):
            """Ensure a folder named ``path`` exists under the base directory.

            Surrounding whitespace is stripped from ``path`` first.
            Returns the folder's path (base directory + stripped name).
            """
            dir_path = self.path + path.strip()
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            return dir_path

        def saveImg(self, content, path):
            """Write the raw bytes ``content`` to the file at ``path``."""
            # 'with' closes the file even if the write fails.
            with open(path, 'wb') as f:
                f.write(content)

        def saveBrief(self, content, dir_path, name):
            """Write the text ``content`` as UTF-8 to ``<dir_path>/<name>.txt``."""
            file_name = dir_path + "/" + name + ".txt"
            # The text is encoded explicitly, so the file is opened in binary
            # mode: writing bytes to a text-mode file raises TypeError on
            # Python 3. 'with' also closes the handle, which was left open before.
            with open(file_name, "wb") as f:
                f.write(content.encode('utf-8'))

        def getExtension(self, url):
            """Return the text after the last '.' in ``url``.

            A query string ('?...') or fragment ('#...') is dropped first so
            that it does not end up in the file extension.
            """
            without_query = url.split('?', 1)[0].split('#', 1)[0]
            return without_query.split('.')[-1]
            return extension
  • 相关阅读:
    2020牛客寒假算法基础集训营5 F 碎碎念
    性能测试过程中oracle数据库报ORA-27301 ORA-27302错
    Linux裸设备管理详解--
    GoldenGate 之 Bounded Recovery说明
    关于Oracle GoldenGate中Extract的checkpoint的理解 转载
    SMON: Parallel transaction recovery tried 引发的问题--转载
    用直接路径(direct-path)insert提升性能的两种方法
    深入理解Oracle的并行操作-转载
    oracle大表添加字段default经验分享
    Oracle Hang分析--转载
  • 原文地址:https://www.cnblogs.com/tangbinghaochi/p/6150149.html
Copyright © 2011-2022 走看看