  • 爬取淘宝模特信息并自动保存图片

    环境:Ubuntu 16.04

    工具:Python 3.5+, Scrapy 1.1, PyCharm

    import scrapy, re, os, lxml, urllib.request
    from scrapy.http import Request
    from bs4 import BeautifulSoup
    class TaobaoMMSpider(scrapy.Spider):
        name = 'TaobaoMM'
        start_urls = ['https://mm.taobao.com/json/request_top_list.htm?page=1']
        mainposition = '/media/liuyu/0009F608000B7B40/TaobaoMM/'
        # 处理第一个网页,获取总页数
        def parse(self, response):
            """Read the total page count from the first list page and request every page.

            The count is taken from the ``value`` attribute of the ``input``
            element with id ``J_Totalpage``. If that element is missing or its
            value is not an integer, only page 1 is requested.

            Yields one ``Request`` per list page, handled by ``everypage``.
            """
            content = BeautifulSoup(response.text, "lxml")
            total_input = content.find('input', id="J_Totalpage")
            try:
                totalpage = max(1, int(total_input.get('value')))
            except (AttributeError, TypeError, ValueError):
                # No usable page count in the response: still crawl the first page.
                totalpage = 1
            url = 'https://mm.taobao.com/json/request_top_list.htm?page='
            # Pages are 1-based in the query string.
            for page in range(1, totalpage + 1):
                yield Request(url + str(page), callback=self.everypage)
        # 对每一页的网页进行处理,获取每位model的网页
        def everypage(self, response):
            """Create a directory per model on a list page and request each model's page.

            For every ``div.personal-info`` entry this reads the model's name and
            link from the ``a.lady-name`` element and the age from the first
            ``strong`` element, creates ``mainposition + name`` on disk, and
            yields a ``Request`` handled by ``infocard``. The age and the
            directory path travel along in ``meta``.
            """
            content = BeautifulSoup(response.text, "lxml")
            for entry in content.find_all('div', class_="personal-info"):
                link = entry.find('a', class_="lady-name")
                name = link.string
                seconddir = self.mainposition + name
                # exist_ok: a second run of the spider must not crash on
                # directories left by the first; makedirs also creates the
                # base directory when it does not exist yet.
                os.makedirs(seconddir, exist_ok=True)
                age = entry.find('strong').string
                # The href has no scheme in the page, so one is prepended.
                modelurl = 'https:' + link.get('href')
                yield Request(modelurl, callback=self.infocard, meta={'age': age, 'seconddir': seconddir})
        # 处理模特卡界面,获取模特id,构造获取model信息的json链接
        def infocard(self, response):
            """Extract the model id from her card page and request her info page.

            The id comes from the ``value`` of the ``input`` with id
            ``J_MmuserId``; the album link is the first ``a`` inside
            ``ul.mm-p-menu``. ``seconddir`` and ``age`` from the incoming
            ``meta`` are forwarded, together with the album URL, to
            ``infoprocess``.
            """
            soup = BeautifulSoup(response.text, "lxml")
            user_id = soup.find('input', id="J_MmuserId").get('value')
            info_link = 'https://mm.taobao.com/self/info/model_info_show.htm?user_id=' + user_id
            menu = soup.find('ul', class_="mm-p-menu")
            album_link = 'https:' + menu.find('a').get('href')
            carried = {
                'seconddir': response.meta['seconddir'],
                'albumurl': album_link,
                'age': response.meta['age'],
            }
            yield Request(info_link, callback=self.infoprocess, meta=carried)
        # 处理model的json网页信息,获取名字等信息,然后跳转至相册界面
        def infoprocess(self, response):
            """Write the model's profile to a text file, then request her album page.

            Reads ``seconddir``, ``albumurl`` and ``age`` from ``response.meta``.
            Profile fields are taken positionally from the ``li`` items of the
            ``ul`` with class ``mm-p-info-cell clearfix``: items 0-5 via their
            ``span``, items 7-11 via their ``p``. The file is written to
            ``<seconddir>/<name>.txt``, one field per line.

            Yields a ``Request`` for the album page, handled by ``album``.
            """
            seconddir = response.meta['seconddir']
            albumurl = response.meta['albumurl']
            age = response.meta['age']
            content = BeautifulSoup(response.text, "lxml")
            modelinfo = content.find('ul', class_="mm-p-info-cell clearfix")
            info = modelinfo.find_all('li')
            name = info[0].find('span').string

            # Build every line before touching the disk so that a parsing
            # failure does not leave an empty or half-written file behind.
            lines = ['age' + age + '\n']
            for i in range(6):
                # Strip non-breaking spaces from the span text.
                lines.append(info[i].find('span').string.replace("\xa0", "") + '\n')
            for i in range(2):
                lines.append(info[i + 7].find('p').string + '\n')
            lines.append('BWH:  ' + info[9].find('p').string + '\n')
            lines.append('cup_size:  ' + info[10].find('p').string + '\n')
            lines.append('shoe_size:  ' + info[11].find('p').string + '\n')

            # Explicit encoding: the text is non-ASCII and must not depend on
            # the locale's default codec.
            with open(seconddir + '/' + name + '.txt', 'w', encoding='utf-8') as file:
                file.writelines(lines)
            yield Request(albumurl, callback=self.album, meta={'seconddir': seconddir})
        # 处理相册框架界面,获取model的ID,构造相册列表的json请求链接
        def album(self, response):
            """Read the model id from the album frame page and request her album list.

            The id is the ``value`` of the ``input`` with id ``J_userID``. The
            album-list URL is passed on in ``meta`` under ``url`` so the next
            callback can append a page parameter to it; ``seconddir`` is
            forwarded unchanged.
            """
            soup = BeautifulSoup(response.text, "lxml")
            user_id = soup.find('input', id="J_userID").get('value')
            list_url = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20=' + user_id
            carried = {'url': list_url, 'seconddir': response.meta['seconddir']}
            yield Request(list_url, callback=self.allimage, meta=carried)
        # 处理相册信息页面,获取总页数
        def allimage(self, response):
            """Request every page of the model's album list.

            The number of pages is the ``value`` of the first ``input`` element
            in the response. Each page URL is the base URL from ``meta['url']``
            with ``&page=<n>`` appended, n starting at 1. Responses are handled
            by ``image``.
            """
            base_url = response.meta['url']
            soup = BeautifulSoup(response.text, "lxml")
            page_count = int(soup.find('input').get('value'))
            for page_no in range(1, page_count + 1):
                page_url = base_url + '&page=' + str(page_no)
                # A fresh meta dict per request, as in the original.
                yield Request(page_url, callback=self.image, meta={'seconddir': response.meta['seconddir']})
        # 对相册每一页进行处理,获取相册名,对每一个相册进行访问
        def image(self, response):
            """Request the photo list of every album on one album-list page.

            For each ``div.mm-photo-cell-middle`` the album name (spaces
            removed) and link are read from the ``a`` inside its ``h4``. The
            user id and album id are extracted from the link and used to build
            the photo-list URL, which ends in ``&page=`` so the next callback
            can append a page number. ``thirddir`` (``<seconddir>/<album>``)
            is passed along in ``meta``.

            Albums whose link does not contain both ids are skipped with a
            warning instead of reusing ids from a previous album.
            """
            seconddir = response.meta['seconddir']
            content = BeautifulSoup(response.text, "lxml")
            # Compiled once; it does not change between albums.
            pattern = re.compile('.*?user_id=(.*?)&album_id=(.*?)&album_flag')
            for cell in content.find_all('div', class_="mm-photo-cell-middle"):
                link = cell.find('h4').a
                albumname = link.string.replace(" ", "")
                thirddir = seconddir + '/' + albumname
                url = link.get('href')
                matches = pattern.findall(url)
                if not matches:
                    self.logger.warning('Skipping album with unexpected link: %s', url)
                    continue
                # Use the last match, like the loop this replaces.
                modelid, albumid = matches[-1]
                imageurl = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=' + modelid + '&album_id=' + albumid + '&page='
                yield Request(imageurl, callback=self.imageprocess, meta={'url': imageurl, 'thirddir': thirddir})
        # 对相册页面进行处理,获取相册总页数
        def imageprocess(self, response):
            """Request every page of one album's photo list.

            The page count is the first ``"totalPage":"..."`` value found in
            the response text. Page URLs are ``meta['url']`` with the 1-based
            page number appended; responses are handled by ``saveimage`` and
            carry ``thirddir`` forward.
            """
            base_url = response.meta['url']
            body = response.text
            total_page_re = re.compile('.*?"totalPage":"(.*?)"')
            found = total_page_re.findall(body)
            page_total = found[0]
            for page_no in range(1, int(page_total) + 1):
                page_url = base_url + str(page_no)
                yield Request(page_url, callback=self.saveimage, meta={'thirddir': response.meta['thirddir']})
        # 处理相册页面,获得每一个照片的链接
        def saveimage(self, response):
            thirddir = response.meta['thirddir']
            content = response.text
            pattern = re.compile('.*?"picUrl":"(.*?)"')
            pattern_2 = re.compile('.*?imgextra/.*?/(.*?)/')
            imageurls = re.findall(pattern, content)
            for imageurl in imageurls:
                url = 'https:' + imageurl
                u = urllib.request.urlopen(url).read()
                with open(thirddir + '/' + imagename + '.jpg', 'wb')as file:


