zoukankan      html  css  js  c++  java
  • python爬某个网站的图片

    # _*_ coding: gbk _*_
    import urllib
    import urllib2
    import re
    class Spider:
        """Minimal scraper that downloads the JPEG images referenced by a page."""

        def getImage(self, html, limit=2):
            """Fetch the page at ``html`` and download the JPEG images it links.

            Every ``src="http:...jpg`` attribute found in the page body is
            treated as an image URL. Each URL is printed and then saved under
            the hard-coded ``E:`` images directory as ``<index>.jpg``, where
            the index starts at 0. 'the end' is printed when the loop finishes.

            Args:
                html: URL of the page to scan.
                limit: Maximum number of images to download (non-negative).
                    ``None`` downloads every match. Defaults to 2, which was
                    the cap hard-coded in the earlier version.
            """
            page = urllib2.urlopen(html)
            try:
                content = page.read()
            finally:
                # The response wraps a socket; release it even if read() fails.
                page.close()

            # Stop at the attribute's closing quote so that several images on
            # one line are not merged into a single bogus URL, and escape the
            # dot so that only a literal ".jpg" suffix is accepted.
            pattern = r'src="(http:[^"]*\.jpg)'
            imglist = re.findall(pattern, content)
            if limit is not None:
                imglist = imglist[:limit]

            for cnt, url in enumerate(imglist):
                print(url)
                # Raw string: the backslashes are path separators, not escapes.
                urllib.urlretrieve(url, r'E:\images\%s.jpg' % cnt)

            print('the end')
            
    
    if __name__ == "__main__":
        # Script entry point: print a greeting, then scrape one Baidu Baike page.
        print('hello')
        # Alternative page, kept commented out:
        # http://baike.baidu.com/link?url=pj6QaA2Zyrxx2WcD4f7vN50LWVIZjJUKYdnnLGMOWnmInlALGH4dXmU86hE3Ar-jmaiahjf2MiEZ3n_0WCOUlFuKwVfYZNKnBwxidD1cC3i
        html = r"http://baike.baidu.com/link?url=rHaKx7RPBWuR4MxzY0BPhwbLKH4DEdwKPN8EYH-78Zzm7IMUuFTYM0eUZw-j27lHxDxyyNiqkjUg4JG8FvyjNUsuqiTzLixsNSXUtTWiOpQqrtxbf4hkj-n6gF1Nyn4D"
        Spider().getImage(html)
    

      

    python从某个网站上面爬很多图片的url,主要是从百度风云榜上面爬的,男演员,女演员,男歌手,女歌手,总共200张

    # _*_ coding: gbk _*_
    import urllib
    import urllib2
    import re
    import os
    class Spider:
        """Minimal scraper that collects link URLs from a page into a text file."""

        def getImage(self, html):
            """Fetch the page at ``html`` and save the links it finds, one per line.

            Every match of the ``href="http:...`` pattern below is trimmed to
            the bare URL, printed, and written to ``paths.txt`` in the
            hard-coded ``e:`` images directory. The number of matches and
            'the end' are printed afterwards.

            Args:
                html: URL of the page to scan.
            """
            page = urllib2.urlopen(html)
            try:
                content = page.read()
            finally:
                # The response wraps a socket; release it even if read() fails.
                page.close()

            pattern = r'href="http:.*简介'
            imglist = re.findall(pattern, content)
            with open(r'e:\images\paths.txt', 'w+') as f:
                for match in imglist:
                    # Drop the 6-character 'href="' prefix and the last 6 bytes.
                    # NOTE(review): the tail is presumably the closing quote,
                    # '>' and the 4-byte GBK label; verify against the page.
                    link = match[6:-6]
                    print(link)
                    f.write(link)
                    # One URL per line; the newline must be an escape sequence,
                    # not a literal line break inside the string.
                    f.write('\n')
            print(len(imglist))
            print('the end')
    
    if __name__ == "__main__":
        # Script entry point: print a greeting, then scan the Baidu ranking page.
        print('hello')
        # Alternative page, kept commented out:
        # http://baike.baidu.com/link?url=pj6QaA2Zyrxx2WcD4f7vN50LWVIZjJUKYdnnLGMOWnmInlALGH4dXmU86hE3Ar-jmaiahjf2MiEZ3n_0WCOUlFuKwVfYZNKnBwxidD1cC3i
        html = r'http://top.baidu.com/buzz?b=18&qq-pf-to=pcqq.group'
        Spider().getImage(html)
       
    

      

  • 相关阅读:
    Ubuntu上64位adv无法创建问题
    Java 数据结构与算法分析学习
    博客第一天——新的梦幻之旅
    android开发第一天
    Ubuntu下OpenGL开发环境的搭建
    [ZZ]WindowsForm应用程序调用WebService
    WindowsForm应用程序调用WebService
    Hello World
    Apache Ant编写build.xml的自动提示 ANT DTD
    Serial Interface之I2C:关于DS1624 2线通信SDA保持时间的说明
  • 原文地址:https://www.cnblogs.com/wuxiangli/p/6099334.html
Copyright © 2011-2022 走看看