zoukankan      html  css  js  c++  java
  • Python爬虫(三)爬淘宝MM图片

    直接上代码:

    # python2
    # -*- coding: utf-8 -*-
    
    import urllib2
    import re
    import string
    import os
    import shutil
    
    def crawl_taobaoMM(baseUrl, start, end):
        """Crawl listing pages and save each matched profile's text and photo.

        For every page number from ``start`` to ``end`` (both inclusive) the
        page ``baseUrl + '?page=<n>'`` is fetched and decoded as GBK. Each
        profile matched by the regular expression is appended as one line to
        ``mm.txt`` (UTF-8 encoded) and its picture is downloaded into the
        ``mm_img`` directory as a zero-padded, sequentially numbered ``.jpg``.

        The ``mm_img`` directory is emptied at the start of each call;
        ``mm.txt`` is opened in append mode and is never truncated.

        :param baseUrl: listing URL without the ``page`` query parameter.
        :param start: first page number to fetch.
        :param end: last page number to fetch (inclusive).
        """
        imgDir = 'mm_img'
        # Start from an empty image directory. The directory must be recreated
        # after removal, otherwise the image writes below fail with IOError.
        if os.path.exists(imgDir):
            shutil.rmtree(imgDir)
        os.makedirs(imgDir)

        fileName = 'mm.txt'

        # Request headers and the compiled pattern do not depend on the page
        # number, so build them once instead of once per page.
        userAgent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)'
                     ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')
        headers = {'user-agent': userAgent}
        # Capture groups: image url, name, age, city, occupation
        searchPattern = (r'<div class="personal-info">.*?<img src="//(.*?)".*?<a class="lady-name".*?>(.*?)'
                         r'</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>.*?<em>(.*?)</em>')
        # re.S lets '.' cross line breaks, since one profile spans many lines.
        searchObj = re.compile(searchPattern, re.S)

        picNumber = 0
        with open(fileName, 'a') as f:
            for i in range(start, end + 1):
                url = baseUrl + '?page=' + str(i)
                req = urllib2.Request(url, headers=headers)
                response = urllib2.urlopen(req).read().decode('gbk')
                results = searchObj.findall(response)

                print(str(i) + '页...')
                for result in results:
                    message = '%s %s %s %s %s\n' % (result[0], result[1], result[2], result[3], result[4])
                    print(picNumber)
                    print(message)
                    f.write(message.encode('utf-8'))
                    # The pattern captures the image URL without its scheme.
                    pic = urllib2.urlopen('https://' + result[0]).read()
                    picName = imgDir + '/' + ('%05d' % picNumber) + '.jpg'
                    with open(picName, 'wb') as pf:
                        pf.write(pic)
                    picNumber += 1
    
    # Script entry: crawl listing pages 1 through 10.
    crawl_taobaoMM(
        baseUrl='https://mm.taobao.com/json/request_top_list.htm',
        start=1,
        end=10
    )

    爬下来的图片:

    参考资料:

    Python爬虫实战四之抓取淘宝MM照片

  • 相关阅读:
    unity-TextAsset
    unity-热更-InjectFix
    进程和线程的区别
    StringBuffer 和 StringBuilder 的区别
    List、Set、Map 三者区别
    竞态条件是什么?
    多线程安全(synchronized、三大特性、生命周期以及优缺点)
    集合的同步与非同步
    List、Set、Map的了解及区别
    java面试题
  • 原文地址:https://www.cnblogs.com/gattaca/p/6930592.html
Copyright © 2011-2022 走看看