zoukankan      html  css  js  c++  java
  • 豆瓣用户相册批量下载

    很简单的 Python 代码,用 BeautifulSoup 会简洁一点,不过就当练习一下正则表达式吧

    # -*- coding:utf-8 -*- 
    #!/usr/bin/env python
    
    __revision__ = '0.1'
    import re
    import os
    import urllib
    import string,time
    def get_html_data(url):
        """Fetch ``url`` and return the raw response body.

        This is a best-effort helper: a failure while opening or reading the
        URL yields an empty string instead of an exception, so callers cannot
        tell "empty page" from "fetch failed".

        :param url: address to fetch; passed unchanged to ``urllib.urlopen``.
        :return: the response body, or ``''`` when the fetch fails.
        """
        # ``except Exception`` instead of a bare ``except`` keeps the
        # best-effort contract while letting KeyboardInterrupt and SystemExit
        # propagate, so the script can still be interrupted with Ctrl-C.
        try:
            html = urllib.urlopen(url)
        except Exception:
            return ''
        try:
            return html.read()
        except Exception:
            return ''
        finally:
            # Always release the connection, even when the read fails.
            html.close()
    def download_all(urls,dir):
        """Download every photo listed in ``urls`` into the directory ``dir``.

        :param urls: iterable of photo id strings. Each id is inserted into the
            fixed image URL template used below and also becomes the local
            file name (``<id>.jpg``).
        :param dir: destination directory. It is created, including missing
            parent directories, when it does not exist yet.
        """
        if not os.path.isdir(dir):
            os.makedirs(dir)
        for url in urls:
            lo = "http://img3.douban.com/view/photo/photo/public/p" + url + ".jpg"
            print(lo)
            # Join with the path separator so the file is written *inside*
            # ``dir`` whether or not the caller supplied a trailing slash.
            base_name = os.path.join(dir, url + ".jpg")
            print(base_name)
            urllib.urlretrieve(lo, base_name)
            # One-second pause between downloads (presumably to avoid
            # hammering the image server).
            time.sleep(1)
    def get_every_pages(url):
        """Return the photo ids linked from the page at ``url``.

        The page is fetched with ``get_html_data`` and scanned for links of
        the form ``http://www.douban.com/photos/photo/<digits>/"``; only the
        digit part of each link is returned.

        :param url: address of one album listing page.
        :return: list of id strings in document order (may contain repeats).
        """
        page = get_html_data(url)
        # Look-behind / look-ahead keep the surrounding URL text out of the
        # match, so ``findall`` yields just the numeric ids.
        pattern = r"(?<=http://www.douban.com/photos/photo/)[0-9]*(?=/\")"
        return re.findall(pattern, page, re.IGNORECASE)
    def find_last_page(data):
        """Return the largest ``start=<n>"`` offset that appears in ``data``.

        :param data: HTML text of an album page.
        :return: the highest offset found as an int, or ``0`` when the text
            contains no ``start=<digits>"`` occurrence.
        """
        # ``[0-9]+`` (not ``*``) so that a bare ``start="`` is not reported as
        # an empty match, which would make the integer conversion fail.
        offsets = re.findall(r"(?<=start=)[0-9]+(?=\")", data)
        # The ``or [0]`` fallback keeps the "no pagination links -> 0" result.
        return max([int(offset) for offset in offsets] or [0])
    def check_next_page(data,i):
        """Tell whether ``data`` contains the text ``start=<i>``.

        The comparison ignores case and is a plain prefix match, so looking
        for ``18`` also succeeds on ``start=180``.

        :param data: HTML text of an album page.
        :param i: offset to look for; converted with ``str``.
        :return: ``True`` when the marker occurs in ``data``, else ``False``.
        """
        marker = "start=" + str(i)
        return re.search(marker, data, re.S | re.IGNORECASE) is not None
    def get_photo(url,dir):
        """Collect the photo ids of a whole album and download them.

        The first page at ``url`` is fetched to find the highest ``start=``
        offset, then every listing page ``url?start=18``, ``url?start=36``, ...
        up to that offset is scanned for photo ids. The combined ids are
        handed to ``download_all``.

        :param url: address of the album's first listing page.
        :param dir: destination directory, forwarded to ``download_all``.
        """
        # Offset step between listing pages as used by this script
        # (presumably the number of photos the site shows per page).
        page_size = 18

        data = get_html_data(url)
        last = find_last_page(data)
        pages = [get_every_pages(url)]

        i = 1
        while page_size * i <= last:
            page_url = url + "?start=" + str(page_size * i)
            print(page_url)
            pages.append(get_every_pages(page_url))
            print(last)
            i = i + 1

        # Flatten every page unconditionally: a page with zero or one id must
        # contribute its *elements*, never the list object itself, otherwise
        # the downloader receives a list where it expects an id string.
        photos = []
        seen = set()
        for page in pages:
            for photo_id in page:
                # Skip empty matches and ids already queued so each photo is
                # downloaded once.
                if photo_id and photo_id not in seen:
                    seen.add(photo_id)
                    photos.append(photo_id)

        download_all(photos, dir)
    if __name__ == "__main__":
        # Interactive entry point: ask for the album address and the target
        # directory on separate lines, then start the download.
        print("input the url:")
        url = raw_input()
        print("input the dir name:")
        dir = raw_input()
        get_photo(url, dir)
  • 相关阅读:
    CentOS7 FTP安装与配置
    linux CentOS 安装 nginx
    linux CentOS YUM 安装 nginx+tomcat+java+mysql运行环境
    Node.js 开发
    Nginx 负载均衡
    BtxCMS.Net 项目
    不得不看!史上最全的三十多张架构师图谱!
    高危群体:开发者的自白,躲坑,迷茫,和下一步
    p2p-如何拯救k8s镜像分发的阿喀琉斯之踵
    Tower与DevCloud对比分析报告
  • 原文地址:https://www.cnblogs.com/long0x0/p/2850905.html
Copyright © 2011-2022 走看看