#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.09
v0.1
获取图片真实的URL,之前做了Referer和302跳转
"""
import socket as original_socket
original_socket.setdefaulttimeout(10)
import sys
reload(sys)
sys.setdefaultencoding(sys.stdout.encoding)
from functools import wraps
from pyquery import PyQuery as pq
import os
import time
import glob
import eventlet
from eventlet import pools
from eventlet.timeout import with_timeout
from eventlet.green import urllib2
# Output folders: rewritten XML files land in images/, run logs in images_infos/.
g_data_folder = os.path.join("./","images")
g_infos_folder = os.path.join("./","images_infos")
# Append-only logs of failed / successfully processed source files.
g_error_file = os.path.join("./","images_infos/error.txt")
g_success_file = os.path.join("./","images_infos/success.txt")
# Input: product XML files matching xmls/*.xml.
g_xmls_folder = os.path.join("./","xmls/")
g_filter = os.path.join(g_xmls_folder,"*.xml")
str="""
Host: www.215588.com
User-Agent: Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.10) Gecko/20100914 Firefox/3.6.10
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: en-us,en;q=0.5
Accept-Encoding: gzip,deflate
Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
Keep-Alive: 115
Connection: keep-alive
Referer: http://www.215588.com/gouwu/showproduct.asp?id=592
Cookie: RecentlyGoods=508%2C184%2C592%2C; ASPSESSIONIDACBTTQQD=KHGEPHICJFEOEEPIGAJJNKHI; AJSTAT_ok_times=2; ASPSESSIONIDACCQSQQC=CBKBIEFDAJDHKEMIDIKFMPNM; AJSTAT_ok_pages=1
"""
str="""
Host: www.215588.com
User-Agent: Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.2.12pre) Gecko/20101005 Ubuntu/10.04 (lucid) Namoroka/3.6.12pre
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: zh-cn,zh;q=0.5
Accept-Encoding: gzip,deflate
Accept-Charset: GB2312,utf-8;q=0.7,*;q=0.7
Keep-Alive: 115
Connection: keep-alive
Referer: http://www.215588.com/gouwu/showroom.asp
Cookie: ftwww215588com=0; AJSTAT_ok_times=8; RecentlyGoods=592%2C; ASPSESSIONIDACCQSQQC=BBJGIEFDJEBFDGJLGEOPFIDF; AJSTAT_ok_pages=3
"""
str=str.strip().replace("\r\n","\n")
headers={}
for elem in str.split("\n"):
a,b=elem.split(":",1)
a=a.strip()
b=b.strip()
headers[a]=b
#print a,b
#print headers
def init():
    """Create the output folders (images/ and images_infos/) if missing.

    Uses EAFP instead of an exists() pre-check: the original had a
    check-then-create race where a folder appearing between the two calls
    would crash makedirs.  An already-existing folder is fine either way.
    """
    for folder in (g_infos_folder, g_data_folder):
        try:
            os.makedirs(folder)
        except OSError:
            # Folder already exists -- same tolerance as the original code.
            pass
def clear():
    "清理生成的数据"
    # Remove everything this script generated, restoring a clean slate.
    # (Docstring kept verbatim: it is printed as the command's usage text.)
    for folder in (g_data_folder, g_infos_folder):
        delete(folder)
    print("还原为初始")
def error(url, ex=None):
    """Append a failed URL (optionally with its exception) to the error log.

    BUG FIX: get_real_image_url calls error(url, ex) with two positional
    arguments, which raised TypeError against the original one-parameter
    signature.  The new *ex* parameter defaults to None, so existing
    one-argument callers are unaffected.
    """
    f = open(g_error_file, "a")
    if ex is None:
        f.write("%s\n" % (url,))
    else:
        f.write("%s: %s\n" % (url, ex))
    f.close()
def success(url):
    """Record *url* in the success log, one path per line (append mode)."""
    log = open(g_success_file, "a")
    log.write("%s\n" % url)
    log.close()
def delete(src):
    """Recursively delete a file or a directory tree, tolerating failures.

    Files are removed directly; directories are emptied depth-first and
    then removed.  Only OSError (permission denied, already gone, ...) is
    swallowed -- the original used bare ``except:``, which also silenced
    KeyboardInterrupt and SystemExit.
    """
    if os.path.isfile(src):
        try:
            os.remove(src)
        except OSError:
            pass
    elif os.path.isdir(src):
        for item in os.listdir(src):
            delete(os.path.join(src, item))
        try:
            os.rmdir(src)
        except OSError:
            pass
def statistics(f):
    """Decorator: print total/successed/left progress before and after *f*.

    The original embedded a local ``tongji()`` helper that duplicated the
    module-level count() line for line (same format strings, same output);
    it now delegates to count() to keep the two in sync.
    """
    @wraps(f)
    def wrapper(*args, **kwargs):
        count()            # progress snapshot before the batch
        time.sleep(3)      # short pause so the operator can read/cancel
        result = f(*args, **kwargs)
        count()            # progress snapshot after the batch
        return result
    return wrapper
def cost_time(f):
    """Decorator that logs start/end wall-clock times and elapsed seconds."""
    @wraps(f)
    def timed(*args, **kwargs):
        started = time.time()
        print("@%s, {%s} start" % (time.strftime("%X", time.localtime()), f.__name__))
        result = f(*args, **kwargs)
        print("@%s, {%s} end" % (time.strftime("%X", time.localtime()), f.__name__))
        print("@%.3fs taken for {%s}" % (time.time() - started, f.__name__))
        return result
    return timed
def get_real_image_url(file):
    """Resolve the real image URL inside one product XML file.

    Reads the 'bigImage' field, follows the site's Referer-protected 302
    redirect to get the final URL, writes the rewritten XML into
    g_data_folder, and records the outcome in the success/error logs.
    """
    # Read the source XML and close the handle (the original leaked it).
    fh = open(file, "r")
    raw = fh.read()
    fh.close()
    doc = pq(raw)
    url = doc("field[@name='%s']" % "bigImage").text()
    def do(url):
        # Follow the redirect chain; the final .url is the real image URL.
        try:
            req = urllib2.Request(url=url, headers=headers)
            img = urllib2.urlopen(req)
            return img.url
        except Exception:
            # BUG FIX: the original called error(url, ex), but error() only
            # accepted one argument, so every network failure raised
            # TypeError inside the green thread instead of being logged.
            error(url)
            return None
    rr = with_timeout(10, do, url, timeout_value=None)
    if rr is not None:
        doc("field[@name='%s']" % "bigImage").text(rr)
        r = '<?xml version="1.0" encoding="utf-8"?>' + doc.wrap("<add></add>").html()
        new_file = os.path.join(g_data_folder, os.path.basename(file))
        out = open(new_file, "w")
        out.write(r)
        out.close()
        success(file)
        print("success %s" % url)
    else:
        error(file)
        print("error %s" % url)
@cost_time
@statistics
def batch_get_real_image_urls(files=None):
    """获取图片真实链接"""
    # BUG FIX: the original default was files=glob.glob(g_filter), which is
    # evaluated once at import time -- XML files created after import were
    # never seen.  A None sentinel resolves the glob at call time instead;
    # explicit callers (process_continue) are unaffected.
    if files is None:
        files = glob.glob(g_filter)
    # Fan the downloads out over 20 green threads and wait for all of them.
    pool = eventlet.GreenPool(20)
    for file in files:
        pool.spawn_n(get_real_image_url, file)
    pool.waitall()
@cost_time
@statistics
def process_continue():
    "接着success抓取剩下的部分"
    # Resume after an interruption: fetch only the XML files that are not
    # yet recorded in the success log (remaining = all - finished).
    finished = set()
    if os.path.exists(g_success_file):
        finished = set(open(g_success_file, "r").read().strip().split("\n"))
    pending = set()
    if os.path.exists(g_xmls_folder):
        pending = set(glob.glob(g_filter))
    batch_get_real_image_urls(pending - finished)
def count():
    """Print progress counters and return them as (total, successed)."""
    total = 0
    successed = 0
    if os.path.exists(g_xmls_folder):
        total = len(set(glob.glob(g_filter)))
    print("total lines:%s" % total)
    if os.path.exists(g_success_file):
        done_lines = open(g_success_file, "r").read().strip().split('\n')
        successed = len(set(done_lines))
    print("successed lines:%s" % successed)
    print("left lines:%s" % (total - successed))
    return total, successed
def process_forever():
    "循环处理,直到全部完成"
    # Keep retrying the unfinished files until none remain.
    total, successed = count()
    while total > successed:
        process_continue()
        total, successed = count()
if __name__=="__main__":
init()
l=len(sys.argv)
dict = {
"batch" :batch_get_real_image_urls,
"continue" :process_continue,
"clear" :clear,
"loop" :process_forever,
}
if l==2:
argv=sys.argv[1].strip()
if argv in dict:
dict[argv]()
else:
for k,v in dict.iteritems():
print k,v.__doc__
print "done!"