zoukankan      html  css  js  c++  java
  • batch_get_real_image_urls.py 博源

    #!/usr/bin/env python
    #encoding=utf-8
    """
    2010.10.09
    v0.1
    Fetch the real URLs of images (the source site uses a Referer check and a
    302 redirect in front of them).
    """
    import socket as original_socket
    # Default timeout, in seconds, for sockets created through the stdlib socket module.
    original_socket.setdefaulttimeout(10)
    import sys
    # Python 2 idiom: reload(sys) to get access to sys.setdefaultencoding, then
    # set the implicit str/unicode codec to whatever encoding stdout reports.
    reload(sys)
    sys.setdefaultencoding(sys.stdout.encoding)
    from functools import wraps
    from pyquery import PyQuery as pq
    import os
    import time
    import glob
    import eventlet
    from eventlet import pools
    from eventlet.timeout import with_timeout
    from eventlet.green import urllib2
    # Output folder: XML files carrying the resolved image URL are written here.
    g_data_folder           = os.path.join("./","images")
    # Bookkeeping folder that holds the two log files below.
    g_infos_folder          = os.path.join("./","images_infos")
    # Append-only log of URLs / XML paths that failed, one per line.
    g_error_file            = os.path.join("./","images_infos/error.txt")
    # Append-only log of XML paths that were processed successfully, one per line.
    g_success_file          = os.path.join("./","images_infos/success.txt")
    # Input folder containing the source XML documents.
    g_xmls_folder           = os.path.join("./","xmls/")
    # Glob pattern matching every input XML document.
    g_filter                = os.path.join(g_xmls_folder,"*.xml")
    # Raw request headers captured from a browser session, one "Name: value"
    # pair per line.  They are sent with every image request (Referer and
    # Cookie included).  An earlier capture that used to be assigned first was
    # overwritten before use and has been dropped; the builtin ``str`` is no
    # longer shadowed by this text.
    _raw_headers = """
    Host: www.215588.com
    User-Agent: Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.2.12pre) Gecko/20101005 Ubuntu/10.04 (lucid) Namoroka/3.6.12pre
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
    Accept-Language: zh-cn,zh;q=0.5
    Accept-Encoding: gzip,deflate
    Accept-Charset: GB2312,utf-8;q=0.7,*;q=0.7
    Keep-Alive: 115
    Connection: keep-alive
    Referer: http://www.215588.com/gouwu/showroom.asp
    Cookie: ftwww215588com=0; AJSTAT_ok_times=8; RecentlyGoods=592%2C; ASPSESSIONIDACCQSQQC=BBJGIEFDJEBFDGJLGEOPFIDF; AJSTAT_ok_pages=3
    """
    # Header name -> header value, both stripped of surrounding whitespace.
    headers = {}
    for _line in _raw_headers.strip().splitlines():
        # Split on the first colon only: values such as URLs contain colons too.
        _name, _sep, _value = _line.partition(":")
        if not _sep:
            # Blank or malformed line: skip it instead of crashing at import.
            continue
        headers[_name.strip()] = _value.strip()
    def init():
        """Create the bookkeeping and output folders when they do not exist yet."""
        for folder in (g_infos_folder, g_data_folder):
            if os.path.exists(folder):
                continue
            os.makedirs(folder)
    def clear():
        "清理生成的数据"
        # The docstring above ("clean up generated data") is printed verbatim as
        # command-line help by the __main__ block, so it is kept unchanged.
        for generated in (g_data_folder, g_infos_folder):
            delete(generated)
        # Runtime message, roughly: "restored to the initial state".
        print("还原为初始")
    def error(url, ex=None):
        """Append one failure record to the error log (``g_error_file``).

        url -- the URL or file path that failed; written at the start of the line.
        ex  -- optional exception that caused the failure.  When given, its repr
               is appended after a tab so the cause is kept with the record.

        The optional second argument makes the existing two-argument call
        ``error(url, ex)`` valid; one-argument calls write the same line as before.
        """
        if ex is None:
            record = "%s\n" % (url,)
        else:
            record = "%s\t%r\n" % (url, ex)
        # ``with`` guarantees the handle is closed even if the write fails.
        with open(g_error_file, "a") as log:
            log.write(record)
    def success(url):
        """Append ``url`` as one line to the success log (``g_success_file``)."""
        line = "%s\n" % url
        with open(g_success_file, "a") as log:
            log.write(line)
    def delete(src):
        """Recursively remove a file or a directory tree, best-effort.

        src -- path of the file or directory to remove.

        Filesystem errors (OSError) while removing an entry are ignored so that
        one locked or protected entry does not abort the clean-up; any other
        exception propagates.  Symbolic links are removed as links and never
        followed, so data outside ``src`` is not touched.  A path that does not
        exist is a no-op.
        """
        if os.path.islink(src) or os.path.isfile(src):
            try:
                os.remove(src)
            except OSError:
                pass
        elif os.path.isdir(src):
            for item in os.listdir(src):
                delete(os.path.join(src, item))
            try:
                # Fails (and is ignored) when some child could not be removed.
                os.rmdir(src)
            except OSError:
                pass
    def statistics(f):
        """Decorator that prints progress counters before and after calling ``f``.

        The counters are: number of input XML files matching ``g_filter``,
        number of distinct entries in the success log, and their difference.
        The wrapper pauses three seconds after the first report, then calls
        ``f`` and returns its result unchanged.
        """
        def tongji():
            # "tongji" is pinyin for "statistics".
            total, successed = 0, 0
            if os.path.exists(g_xmls_folder):
                total = len(set(glob.glob(g_filter)))
                print("total lines:%s" % total)
            if os.path.exists(g_success_file):
                with open(g_success_file, "r") as log:
                    entries = log.read().strip()
                # An empty log must count as 0: "".split("\n") yields [""],
                # which would otherwise be reported as one success.
                successed = len(set(entries.split("\n"))) if entries else 0
                print("successed lines:%s" % successed)
            print("left lines:%s" % (total - successed))
        @wraps(f)
        def wrapper(*args, **args2):
            tongji()
            time.sleep(3)
            back = f(*args, **args2)
            tongji()
            return back
        return wrapper
    def cost_time(f):
        """Decorator that prints start/end timestamps and the elapsed seconds of ``f``."""
        def _clock():
            # Locale-formatted wall-clock time used in the start/end messages.
            return time.strftime("%X", time.localtime())
        @wraps(f)
        def wrapper(*args, **args2):
            started = time.time()
            name = f.__name__
            print("@%s, {%s} start" % (_clock(), name))
            result = f(*args, **args2)
            print("@%s, {%s} end" % (_clock(), name))
            print("@%.3fs taken for {%s}" % (time.time() - started, name))
            return result
        return wrapper
    def get_real_image_url(file):
        """Resolve the final URL of the "bigImage" field of one XML document.

        file -- path of the input XML document.

        The URL found in the ``bigImage`` field is requested with the
        module-level ``headers``; the URL reported by the response object is
        written back into the field.  The updated document is saved under
        ``g_data_folder`` with the same base name and ``file`` is appended to the
        success log.  When the request fails or takes longer than 10 seconds,
        ``file`` is appended to the error log instead.  Returns None.
        """
        with open(file, "r") as src:
            s = pq(src.read())
        url = s("field[@name='%s']" % "bigImage").text()
        def do(url):
            try:
                req = urllib2.Request(url=url, headers=headers)
                img = urllib2.urlopen(req)
                try:
                    return img.url
                finally:
                    # Only the final URL is needed; release the connection.
                    img.close()
            except Exception as ex:
                # error() is called with a single argument so this works with
                # its one-parameter signature; the cause is folded into the
                # logged text rather than passed separately.
                error("%s\t%r" % (url, ex))
                return None
        rr = with_timeout(10, do, url, timeout_value=None)
        if rr is not None:
            s("field[@name='%s']" % "bigImage").text(rr)
            r = '<?xml version="1.0" encoding="utf-8"?>' + s.wrap("<add></add>").html()
            new_file = os.path.join(g_data_folder, os.path.basename(file))
            with open(new_file, "w") as out:
                out.write(r)
            success(file)
            print("success %s" % (url,))
        else:
            error(file)
            print("error %s" % (url,))
    @cost_time
    @statistics
    def batch_get_real_image_urls(files=None):
        """获取图片真实链接"""
        # The docstring above ("get the real image links") is printed verbatim as
        # command-line help by the __main__ block, so it is kept unchanged.
        #
        # files -- iterable of XML paths to process.  When omitted, every file
        #          matching g_filter at call time is used; the list is globbed
        #          here rather than in the signature so it is not frozen at
        #          import time.
        if files is None:
            files = glob.glob(g_filter)
        # At most 20 documents are processed concurrently.
        pool = eventlet.GreenPool(20)
        for file in files:
            pool.spawn_n(get_real_image_url, file)
        pool.waitall()
    @cost_time
    @statistics
    def process_continue():
        "接着success抓取剩下的部分"
        # The docstring above ("continue after the success log and fetch the
        # remaining part") is printed verbatim as command-line help, so it is
        # kept unchanged.
        #
        # Remaining work = all input XML files minus those already listed in
        # the success log.
        finished = set()
        if os.path.exists(g_success_file):
            with open(g_success_file, "r") as log:
                finished = set(log.read().strip().split("\n"))
        candidates = set()
        if os.path.exists(g_xmls_folder):
            candidates = set(glob.glob(g_filter))
        batch_get_real_image_urls(candidates.difference(finished))
    def count():
        """Print and return the progress counters.

        Returns a tuple ``(total, successed)``: ``total`` is the number of input
        XML files matching ``g_filter`` (0 when the input folder is missing),
        ``successed`` is the number of distinct entries in the success log
        (0 when the log is missing or empty).
        """
        total, successed = 0, 0
        if os.path.exists(g_xmls_folder):
            total = len(set(glob.glob(g_filter)))
            print("total lines:%s" % total)
        if os.path.exists(g_success_file):
            with open(g_success_file, "r") as log:
                entries = log.read().strip()
            # An empty log must count as 0: "".split("\n") yields [""], which
            # would otherwise be reported as one success.
            successed = len(set(entries.split("\n"))) if entries else 0
            print("successed lines:%s" % successed)
        print("left lines:%s" % (total - successed))
        return total, successed
        
    def process_forever():
        "循环处理,直到全部完成"
        # The docstring above ("process in a loop until everything is done") is
        # printed verbatim as command-line help, so it is kept unchanged.
        while True:
            total, successed = count()
            if total - successed <= 0:
                break
            process_continue()
            
    if __name__=="__main__":
        # Command-line entry point: ``script.py <command>``.
        init()
        # Command name -> handler.  Named ``commands`` so the builtin ``dict``
        # is not shadowed.
        commands = {
                "batch"     :batch_get_real_image_urls,
                "continue"  :process_continue,
                "clear"     :clear,
                "loop"      :process_forever,
               }
        command = sys.argv[1].strip() if len(sys.argv) == 2 else None
        if command in commands:
            commands[command]()
        else:
            # No command, too many arguments, or an unknown command: list the
            # available commands with their docstrings instead of silently
            # doing nothing.
            for name, handler in commands.items():
                print("%s %s" % (name, handler.__doc__))
        print("done!")
  • 相关阅读:
    电脑能ping127.0.0.1但是ping不通本机ip
    用iis调试源代码
    pl登录提示服务不存在
    sqlserver保留一位小数(不是四舍五入)
    web应用程序与web网站发布时区别
    java的覆盖重写隐藏和C#中的不同
    导出word
    点击登录提交两次的问题
    oracle通过plsql代码倒库
    apply方法自解
  • 原文地址:https://www.cnblogs.com/lexus/p/1846562.html
Copyright © 2011-2022 走看看