  • pyspider: crawling and downloading images

    Using the 1ppt template site as an example: http://www.1ppt.com/

    from pyspider.libs.base_handler import *
    import urllib2,HTMLParser,re
    
    #root URL of the site
    host = "http://www.1ppt.com/"
    #local directory for downloaded images
    localSavePath = '/data/girls/'
    #HTML URL of the page the crawl started from
    startHtmlUrl = ''
    #URLs of the image pages
    htmlUrlList = []
    #URLs of the images themselves
    imageUrlList = []
    #image file names look like 12345.jpg
    patter = r'[0-9]+\.jpg'
    #download the image at the given URL and save it locally
    def downloadImage(url):
        print url
        cont = urllib2.urlopen(url).read()
        match = re.search(patter,url)
        if match:
            print 'downloading file:',match.group()
            filename = localSavePath+match.group()
            #open in binary mode so the image bytes are written unchanged
            f = open(filename,'wb')
            f.write(cont)
            f.close()
        else:
            print 'no match'
    
    #fetch an image-set page found on the index page and extract its image URLs
    def getImageUrlByHtmlUrl(htmlUrl):
        parser = MyHtmlParse(False)
        request = urllib2.Request(htmlUrl)
        try:
            response = urllib2.urlopen(request)
            content = response.read()
            parser.feed(content)
        except urllib2.URLError,e:
            print e.reason
            return
    
    class MyHtmlParse(HTMLParser.HTMLParser):
        def __init__(self,isIndex):
            #isIndex is True when parsing the index page, False for an image-set page
            self.isIndex = isIndex
            HTMLParser.HTMLParser.__init__(self)
    
        def handle_starttag(self,tag,attrs):
            if self.isIndex:
                #on the index page, follow <a> tags that carry a title attribute
                if tag == 'a':
                    if len(attrs) == 3:
                        if attrs[1][0] == 'title':
                            newUrl = host + attrs[0][1]
                            #print 'found an image page link:',newUrl
                            global startHtmlUrl
                            startHtmlUrl = newUrl
                            getImageUrlByHtmlUrl(newUrl)
            else:
                #on an image-set page, download every <img> whose src matches the pattern
                if tag == 'img':
                    if attrs[0][0] == 'src' and attrs[1][0] == 'alt' and attrs[0][1]:
                        imageUrl = attrs[0][1]
                        match = re.search(patter,imageUrl)
                        if match:
                            print 'found an image:',imageUrl
                            downloadImage(imageUrl)
                            imageUrlList.append(imageUrl)
                #disabled "next page" logic kept from the original post:
                #if tag == 'a':
                #    if len(attrs) == 4:
                #        #if attrs[1] == ('class','next'):
                #        nextUrl = host + attrs[2][1]
                #        print 'found an image page link:',nextUrl
                #        global startHtmlUrl
                #        if startHtmlUrl != nextUrl:
                #            getImageUrlByHtmlUrl(nextUrl)
    
    
    #parse the index page and collect the link of every image set
    def parse_url_picture(indexUrl):
        m = urllib2.urlopen(indexUrl).read()
        #print m
        parserIndex = MyHtmlParse(True)
        parserIndex.feed(m)
    picture_website = r'http://www.1ppt.com/'
    class Handler(BaseHandler):
        crawl_config = {
        }
    
        @every(minutes=24 * 60)
        def on_start(self):
            #re-crawl the entry page once a day
            self.crawl(picture_website, callback=self.index_page)
    
        @config(age=10 * 24 * 60 * 60)
        def index_page(self, response):
            #follow every absolute link on the entry page
            for each in response.doc('a[href^="http"]').items():
                print each.attr.href
                parse_url_picture(each.attr.href)
                self.crawl(each.attr.href, callback=self.detail_page)
    
        @config(priority=2)
        def detail_page(self, response):
            return {
            }
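
    Note that the Handler above only uses pyspider to discover links; the actual downloading is done by the urllib2 helpers. An alternative is to let pyspider fetch the image itself and write the bytes out in a callback. The following is only a minimal sketch, assuming pyspider's response exposes the raw body as response.content; the class name, callback name, and the placeholder URL are made up for illustration:

    from pyspider.libs.base_handler import *
    import os, re
    
    class ImageHandler(BaseHandler):
        def on_start(self):
            #hypothetical entry point: crawl one known image URL directly
            image_url = 'http://www.1ppt.com/example/123.jpg'  #placeholder URL for illustration
            self.crawl(image_url, callback=self.save_image)
    
        @config(priority=3)
        def save_image(self, response):
            #response.content holds the raw bytes of the fetched file
            name = re.search(r'[0-9]+\.jpg', response.url)
            if name:
                with open(os.path.join('/data/girls/', name.group()), 'wb') as f:
                    f.write(response.content)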

    The script below can be run directly (it does not need to be deployed on the pyspider platform):

    #!/usr/bin/python
    #coding: utf-8
    #########################################################################
    # File Name: girls.py
    # Author: mylonly
    # mail: mylonly@gmail.com
    # Created Time: Mon 09 Jun 2014 09:23:18 PM CST
    #########################################################################
    
    import urllib2,HTMLParser,re
    
    #root URL of the site
    host = "http://1ppt.com"
    #local directory for downloaded images
    localSavePath = '/data/girls/'
    #HTML URL of the page the crawl started from
    startHtmlUrl = ''
    #URLs of the image pages
    htmlUrlList = []
    #URLs of the images themselves
    imageUrlList = []
    #image file names look like 12345.jpg
    patter = r'[0-9]+\.jpg'
    #download the image at the given URL and save it locally
    def downloadImage(url):
        print url
        cont = urllib2.urlopen(url).read()
        match = re.search(patter,url)
        if match:
            print 'downloading file:',match.group()
            filename = localSavePath+match.group()
            #open in binary mode so the image bytes are written unchanged
            f = open(filename,'wb')
            f.write(cont)
            f.close()
        else:
            print 'no match'
    
    #fetch an image-set page found on the index page and extract its image URLs
    def getImageUrlByHtmlUrl(htmlUrl):
        parser = MyHtmlParse(False)
        request = urllib2.Request(htmlUrl)
        try:
            response = urllib2.urlopen(request)
            content = response.read()
            parser.feed(content)
        except urllib2.URLError,e:
            print e.reason
    
    class MyHtmlParse(HTMLParser.HTMLParser):
        def __init__(self,isIndex):
            #isIndex is True when parsing the index page, False for an image-set page
            self.isIndex = isIndex
            HTMLParser.HTMLParser.__init__(self)
    
        def handle_starttag(self,tag,attrs):
            if self.isIndex:
                #on the index page, follow <a> tags that carry a title attribute
                if tag == 'a':
                    if len(attrs) == 3:
                        if attrs[1][0] == 'title':
                            newUrl = host + attrs[0][1]
                            #print 'found an image page link:',newUrl
                            global startHtmlUrl
                            startHtmlUrl = newUrl
                            getImageUrlByHtmlUrl(newUrl)
            else:
                #on an image-set page, download every <img> whose src matches the pattern
                if tag == 'img':
                    #print attrs[0][0]
                    #print attrs[1][0]
                    if attrs[0][0] == 'src' and attrs[1][0] == 'alt' and attrs[0][1]:
                        imageUrl = attrs[0][1]
                        match = re.search(patter,imageUrl)
                        if match:
                            print 'found an image:',imageUrl
                            downloadImage(imageUrl)
                            imageUrlList.append(imageUrl)
                #disabled "next page" logic kept from the original post:
                #if tag == 'a':
                #    if len(attrs) == 4:
                #        #if attrs[1] == ('class','next'):
                #        nextUrl = host + attrs[2][1]
                #        print 'found an image page link:',nextUrl
                #        global startHtmlUrl
                #        if startHtmlUrl != nextUrl:
                #            getImageUrlByHtmlUrl(nextUrl)
    #parse the index page and collect the link of every image set
    indexUrl = 'http://www.1ppt.com'
    m = urllib2.urlopen(indexUrl).read()
    #print m
    parserIndex = MyHtmlParse(True)
    parserIndex.feed(m)
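
    To run the standalone version, save it (for example as girls.py), make sure the save directory /data/girls/ exists, and start it with a Python 2 interpreter, e.g. python2 girls.py. Since urllib2 and the old HTMLParser module exist only in Python 2, the download step would need porting for Python 3; the sketch below shows one possible equivalent using the standard library, with illustrative function and variable names:

    #Python 3 sketch of downloadImage, using urllib.request instead of urllib2
    import os
    import re
    import urllib.request
    
    SAVE_DIR = '/data/girls/'              #assumed local save directory
    PATTERN = re.compile(r'[0-9]+\.jpg')   #image file names such as 12345.jpg
    
    def download_image(url):
        data = urllib.request.urlopen(url).read()
        match = PATTERN.search(url)
        if match:
            path = os.path.join(SAVE_DIR, match.group())
            with open(path, 'wb') as f:    #binary mode keeps the image intact
                f.write(data)
            print('downloaded', path)
        else:
            print('no match for', url)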
  • Original post: https://www.cnblogs.com/panliu/p/4849212.html