  • Taobao MM album spider (written in Python)

    # -*- coding: utf-8 -*-
    
    __author__ = 'YS'
    
    import urllib2
    import urllib
    import re
    import json
    import os
    import time
    
    # Scrape photos of Taobao models (淘女郎); listing page: https://mm.taobao.com/search_tstar_model.htm?spm=5679.126488.640745.2.22495f9f1lYEAb
    class MMSpider:
        def __init__(self, timeout=3, albumLimit=200, picLimit=500, sleepPicCount=100, savePath='pythonspider/'):
            self.__headers = {
                'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
            }
            # request timeout in seconds
            self.timeout = timeout
            # limit on how many album-list pages are fetched per model
            self.albumLimit = albumLimit
            # endpoint that returns the model (MM) list
            self.__mmListUrl = 'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'
            # endpoint that returns a model's album list
            self.__albumListUrl = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20=:userId&page=:page'
            # endpoint that returns the photos inside one album
            self.__albumDetailUrl = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=:userId&album_id=:albumId&page=:page'
            # URL of a model's profile page
            self.__personUrl = 'https://mm.taobao.com/self/aiShow.htm?userId=:userId'
            # directory where downloaded files are saved
            self.savePath = savePath
            # maximum number of photos to download per model
            self.picLimit = picLimit
            # after this many downloaded photos, sleep for one second
            self.sleepPicCount = sleepPicCount
    
            self.__mkdir(self.savePath)
    
        # Fetch page contents. Python exception hierarchy: https://docs.python.org/3/library/exceptions.html#exception-hierarchy
        def __getContents(self, url, data=None, encoding=None, isjson=None):
            try:
                request = urllib2.Request(url, data, self.__headers)
                response = urllib2.urlopen(request, timeout=self.timeout)
                if encoding:
                    contents =  response.read().decode(encoding).encode('utf-8')
                else:
                    contents = response.read()
    
                return json.loads(contents,encoding='utf-8') if isjson else contents
            except urllib2.URLError,e:
                print 'Request failed: ' + str(e.reason)
                return None
            except BaseException,e:
                print 'Unexpected error:'
                print e.args
                return None
    
        # Fetch one page of the model (MM) list
        def __getMMList(self, pageIndex):
            url = self.__mmListUrl
            data = urllib.urlencode({
                'currentPage':pageIndex,
                'pageSize':50
            })
            result = self.__getContents(url, data, encoding='gbk', isjson=True)

            if result is None:
                return None
            elif result['status'] != 1:
                return None
            return result['data']['searchDOList']
    
        # Fetch the list of album ids for one model
        def __getAlbumList(self, mm):
            albumList = []
            baseUrl = self.__albumListUrl.replace(':userId',str(mm['userId']))
            indexUrl = baseUrl.replace(':page','1')
            pageCount = self.__getAlbumListPage(indexUrl)
            if pageCount is None:
                return albumList
            pageCount = int(pageCount)
            pageCount = pageCount if pageCount <= self.albumLimit else self.albumLimit
            
            for i in range(1, pageCount+1):
                listUrl = baseUrl.replace(':page', str(i))
                contents = self.__getContents(listUrl)
    
                if (contents is None):
                    continue
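                # each album entry in the returned HTML carries a link like <h4><a href="...album_id=xxx&album_flag...">,
                # so the regex below captures the album_id of every album on this page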
                pattern = re.compile('<h4><a href=".*?album_id=(.*?)&album_flag', re.S)
                items = re.findall(pattern, contents)
            
                for item in items:
                    albumList.append(item)
    
            return albumList
    
        # Fetch the photo URLs of a single album
        def __getPicList(self, album, mm):
            lists = []
            baseUrl = self.__albumDetailUrl.replace(':userId', str(mm['userId'])).replace(':albumId',str(album))
            indexUrl = baseUrl.replace(':page','1')
            totalPage = self.__getPicPage(indexUrl)
            if totalPage is None:
                return None
            pages = range(1, int(totalPage)+1)
            for page in pages:
                url = baseUrl.replace(':page', str(page))
                res = self.__getContents(url, isjson=True)
    
                if res is not None and res['isError']=='0':
                    for pic in res['picList']:
                        lists.append('http:'+pic['picUrl'])
                else:
                    print "获取结果失败,地址:"+url
    
            return lists
    
        # Get the total number of pages in one album's photo list
        def __getPicPage(self, indexUrl):
            albuminfo = self.__getContents(indexUrl, encoding='gbk', isjson=True)
            if albuminfo is None:
                print 'Failed to fetch album photos (no response), url: '+indexUrl
                return None
    
            if albuminfo['isError'] != '0':
                print 'Failed to fetch album photos (isError != 0), url: '+indexUrl
                return None
            totalPage = int(albuminfo['totalPage'])
    
            return totalPage
    
        # Download and save the photos of one album; album is the album id
        def __savePics(self, album, mm):
            print "正在保存"+mm['realName'].encode('utf-8')+'的相册,相册id为:'+album.encode('utf-8')
            pics =  self.__getPicList(album, mm)
            if pics is None:
                return
            index = 1
            for pic in pics:
                print "正在保存"+mm['realName'].encode('utf-8')+'的相册,相片地址为:'+pic.encode('utf-8')
                if index % self.sleepPicCount == 0:
                    print "休息一秒"
                    time.sleep(1)
                if index >= self.picLimit:
                    print mm["realName"].encode('utf-8') + ":已经保存"+str(self.picLimit)+"张辣"
                    return
                    
                saveDir = self.savePath + mm['realName'].encode('utf-8') + '/img'
                self.__mkdir(saveDir)
                fileName = saveDir + '/' + album + '_' + str(index) + '.jpg'  # prefix with the album id so photos from different albums do not overwrite each other
                self.__saveImg(pic, fileName)
    
                index +=1
    
        # Get the total number of pages of a model's album list
        def __getAlbumListPage(self, url):
            contents = self.__getContents(url)
            if contents:
                pattern = re.compile('id="J_Totalpage" value="(.*?)"', re.S)
                return re.search(pattern, contents).group(1)
            else:
                return None
    
        # Save the model's basic info into the local text folder
        def __saveMM(self, mm):
            print 'Saving info for '+mm['realName'].encode('utf-8')
            saveDir = self.savePath + mm['realName'] + '/text'
            self.__mkdir(saveDir)
            fileName = saveDir + '/info.txt'
            personUrl = self.__personUrl.replace(':userId', str(mm['userId']))
            contents = "姓名:%s
    城市:%s
    体重:%s
    身高:%s
    喜欢:%s
    个人主页:%s
    "%(mm['realName'].encode('utf-8'),mm['city'].encode('utf-8'),str(mm['weight']).encode('utf-8'),str(mm['height']).encode('utf-8'),str(mm['totalFavorNum']).encode('utf-8'),personUrl.encode('utf-8'))
            self.__saveTxtFile(contents, fileName)
    
        # Save the model's avatar into the local img folder
        def __saveMMAvatar(self, mm):
            print 'Saving avatar for '+mm['realName'].encode('utf-8')
            saveDir = self.savePath + mm['realName'] + '/img'
            self.__mkdir(saveDir)
            fileName = saveDir + '/avatar.jpg'
            imgUrl = 'http:'+mm['avatarUrl']+'_240x240xz.jpg'  # fetch the small 240x240 version
            self.__saveImg(imgUrl, fileName)
    
        # Write a text file to disk
        def __saveTxtFile(self, contents, fileName):
            handler = open(fileName, 'w')
            handler.write(contents)
            handler.close()
    
        # Download and write an image file to disk
        def __saveImg(self, imgUrl, fileName):
            contents = self.__getContents(imgUrl)
            if contents:
                handler = open(fileName, 'wb')
                handler.write(contents)
                handler.close()
            else:
                print 'Failed to fetch image, url: '+imgUrl.encode('utf-8')
    
        # Create the directory that holds the images or text files
        def __mkdir(self, saveDir):
            if os.path.exists(saveDir):
                return False
            else:
                os.makedirs(saveDir)
                return True
    
        # Main entry point
        def start(self, startPage, endPage):
            pages = range(startPage, endPage+1)
            for i in pages:
                mmlist = self.__getMMList(i)
    
                if not mmlist:
                    print "第%s页无数据
    "%(str(i))
                    break
                for mm in mmlist:
                    self.__saveMM(mm)
                    self.__saveMMAvatar(mm)
                    albumList = self.__getAlbumList(mm)
    
                    for album in albumList:
                        self.__savePics(album, mm)
    
    if __name__ == '__main__':
        mmspider = MMSpider()
        mmspider.start(2, 3)
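
    The constructor exposes the tunables documented above (timeout, albumLimit, picLimit, sleepPicCount, savePath), so the entry point can be adjusted without touching the class. A small usage sketch follows; the parameter values are arbitrary examples, not part of the original script:

    # hypothetical example values; only the class and method names come from the listing above
    spider = MMSpider(timeout=5, albumLimit=10, picLimit=100, sleepPicCount=50, savePath='mmpics/')
    spider.start(1, 2)  # scrape model-list pages 1 through 2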

    Result: screenshots of the saved images and of the saved text file (omitted).
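
    Judging from the save paths in __saveMMAvatar, __savePics and __saveMM, the on-disk layout under the default savePath should look roughly like the sketch below (the model folder and file names are illustrative):

        pythonspider/
            <realName>/
                img/
                    avatar.jpg              (240x240 profile thumbnail)
                    <albumId>_1.jpg, ...    (numbered album photos)
                text/
                    info.txt                (name, city, weight, height, likes, profile URL)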

    Source code: https://github.com/yunshu2009/pythonspiders/blob/master/Taobaomm/Taobaomm.py
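
    The listing above targets Python 2 (urllib2, print statements, comma-style except clauses). For readers on Python 3, here is a minimal, hedged sketch of how the fetch-and-decode helper could be expressed with urllib.request; the header, timeout and gbk handling mirror __getContents above, while the function name and parameters are assumptions rather than part of the original code:

    # Python 3 sketch of the __getContents logic (an assumption, not taken from the original post)
    import json
    import urllib.error
    import urllib.request

    HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}

    def get_contents(url, data=None, encoding=None, isjson=False, timeout=3):
        """Fetch a URL; return bytes, decoded text, or parsed JSON, or None on failure."""
        try:
            # the spider passes urlencoded strings as POST data; urlopen on Python 3 needs bytes
            body = data.encode('utf-8') if isinstance(data, str) else data
            request = urllib.request.Request(url, body, HEADERS)
            response = urllib.request.urlopen(request, timeout=timeout)
            contents = response.read()
            if encoding:
                contents = contents.decode(encoding)  # e.g. 'gbk' for the model-list endpoint
            return json.loads(contents) if isjson else contents
        except urllib.error.URLError as e:
            print('Request failed:', e.reason)
            return None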

  • Original article: https://www.cnblogs.com/luoyunshu/p/7566027.html