  • Python crawler: scraping image galleries from mzitu.com

    The code is as follows:

    #coding:utf-8
    import requests
    import os
    import sys
    from lxml import etree
    
    class Spider:
        def __init__(self):
            self.headers = {}
            self.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'
            self.headers['Referer'] = 'http://www.mzitu.com/all/'
    
        def crawl(self, root_url):
            html_text = requests.get(root_url,headers=self.headers).text
            html_tree = etree.HTML(html_text)
            groups = html_tree.xpath("//div[@class='main-content']//ul[@class='archives']//a")
            count = 0
            print "开始抓取:"
            for group in groups:
                title = group.text
                groupUrl = group.get('href')
                print "正在抓取组图:"+title
                dirpath = self.makDir(title)   #获取标题,并以标题为名字创建文件夹
                self.getGroup(groupUrl,dirpath)    #
                count = count+1
                if count >= 5:    # stop after the first 5 galleries
                    print "Crawling finished."
                    break
    
        def makDir(self,dirname):
            dirpath = os.path.join(u'E:学习资料', dirname)    # root folder where galleries are saved
            if not os.path.exists(dirpath):
                os.makedirs(dirpath)
            return dirpath
    
        def getGroup(self,groupUrl,dirpath):
            self.headers['Referer'] = groupUrl
            html_text = requests.get(groupUrl, headers=self.headers).text
            html_tree = etree.HTML(html_text)
            maxPage = html_tree.xpath("//div[@class='pagenavi']//span")[-2].text    # number of images in this gallery
            for page in range(1, int(maxPage)+1):    # visit the page that holds each image
                pageUrl = groupUrl + '/' + str(page)    # build the page URL
                self.getPage(pageUrl, page, dirpath)    # fetch that page
    
        def getPage(self, pageUrl,page,dirpath):
            self.headers['Referer'] = pageUrl
            page_text = requests.get(pageUrl, headers=self.headers).text    # request the page that contains the image
            page_tree = etree.HTML(page_text)
            imageurl = page_tree.xpath("//div[@class='main-image']//img")[0].get('src')    # extract the image URL
            image = requests.get(imageurl, headers=self.headers).content    # download the image bytes
            self.saveImage(image,page,dirpath)
    
        def saveImage(self,image,page,dirpath):
            imagepath = os.path.join(dirpath, str(page) + u'.jpg')
            with open(imagepath, 'wb') as f:    # write the raw image bytes to disk
                f.write(image)
    
    if __name__ == '__main__':
        reload(sys)                        # Python 2 only: required before setdefaultencoding
        sys.setdefaultencoding('utf-8')    # avoid UnicodeDecodeError with the Chinese gallery titles
        Mzitu = Spider()
        Mzitu.crawl('http://www.mzitu.com/all')
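
    The script depends only on requests and lxml. The site appears to reject image requests that
    arrive without a matching Referer header, which is why the spider sets Referer to the page
    that embeds each picture before downloading it. Below is a minimal, self-contained sketch of
    just that download step; the two URLs are hypothetical placeholders, not taken from the
    original post:

    import requests

    page_url = 'http://www.mzitu.com/12345/1'                 # hypothetical picture page
    image_url = 'http://i.meizitu.net/2018/06/example.jpg'    # hypothetical image URL

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
        'Referer': page_url,    # without this the server may return 403 or a placeholder image
    }
    response = requests.get(image_url, headers=headers, timeout=10)
    with open('1.jpg', 'wb') as f:
        f.write(response.content)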
  • Original post: https://www.cnblogs.com/DOLFAMINGO/p/9166435.html