#!/usr/bin/env python
#encoding=utf-8
import redis
import urllib2
import time
import StringIO
import gzip
import httplib
import cookielib
# Switch on httplib's debug output for every connection by setting the
# class-level default.
httplib.HTTPConnection.debuglevel = 1

# Names of the Redis hashes that the driver loop at the bottom of this
# script walks through.
files = [
    "12148",
    "12510",
    "15362",
    "11593",
    "11750",
]
class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    """Redirect handler that records the redirect code on the response.

    301 and 302 responses are followed by the base class as usual; the
    response object that comes back is additionally given a ``status``
    attribute holding the original redirect code (301 or 302).
    """

    @staticmethod
    def _with_status(result, code):
        """Set ``result.status = code`` and return *result*.

        The base handler may return ``None`` instead of a response (for
        example when the redirect carries no target header); in that case
        there is nothing to annotate and ``None`` is passed through.
        """
        if result is not None:
            result.status = code
        return result

    def http_error_301(self, req, fp, code, msg, headers):
        """Follow a 301 redirect and tag the resulting response with *code*."""
        result = urllib2.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        return self._with_status(result, code)

    def http_error_302(self, req, fp, code, msg, headers):
        """Follow a 302 redirect and tag the resulting response with *code*."""
        result = urllib2.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        return self._with_status(result, code)
# Running total of pages stored; fetch() increments it through ``global``.
count = 0

# A single cookie jar and cookie processor shared by every opener that
# fetch() builds, so cookies carry over from one request to the next.
ckjar = cookielib.MozillaCookieJar()
ckproc = urllib2.HTTPCookieProcessor(cookiejar=ckjar)
def _download(url):
    """Return the raw response body of *url*.

    The request goes through the shared cookie processor ``ckproc`` and a
    ``SmartRedirectHandler``.  The response is always closed, even when
    reading it fails.
    """
    request = urllib2.Request(url)
    # A User-Agent header has to be sent with the request.
    request.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:10.0.1) Gecko/20100101 Firefox/10.0.1')
    opener = urllib2.build_opener(ckproc, SmartRedirectHandler())
    response = opener.open(request)
    try:
        return response.read()
    finally:
        response.close()


def fetch(k, r1, max_retries=3, retry_delay=2):
    """Download URL *k* and store the page in Redis as UTF-8.

    The body is decoded as GB18030 (undecodable bytes are dropped) and
    re-encoded as UTF-8.  If the result is not blank it is written with
    ``r1.hset(file, k, html)`` and the module-level ``count`` is bumped.

    NOTE: ``file`` is not a parameter.  It is the module-level loop variable
    of the driver loop and must be bound before this function is called.

    On ``urllib2.HTTPError`` the URL is pushed onto the Redis list
    ``"errors"`` and the download is retried, at most *max_retries* times,
    instead of recursing without limit.

    Args:
        k: URL to download; also used as the hash field name.
        r1: Redis client; ``hset`` and ``rpush`` are called on it.
        max_retries: Extra attempts allowed after an HTTP error.
        retry_delay: Seconds to wait before each retry.

    Returns:
        True if a request completed without an HTTP error, False if every
        attempt failed.
    """
    global count

    attempts = max(0, max_retries) + 1
    for attempt in range(attempts):
        try:
            context = _download(k)
        except urllib2.HTTPError as e:
            print("error->" + k)
            # One "errors" entry is recorded per failed attempt.
            r1.rpush("errors", k)
            print(str(e))
            print(e.getcode())
            if attempt + 1 < attempts:
                print("rework")
                time.sleep(retry_delay)
            continue

        html = context.decode("gb18030", "ignore").encode("utf-8")
        if len(html.strip()) > 0:
            r1.hset(file, k, html)
            count += 1
            print("save %s" % count)
        # Pause between requests to throttle the crawl.
        time.sleep(2)
        return True

    return False
# Driver: for every hash named in ``files``, download each field (a URL)
# whose stored value is still the empty string.
r1 = redis.Redis(db=1)
count = 0

# NOTE: the loop variable has to stay named ``file``; fetch() reads it as a
# module-level global to decide which hash to write into.
for file in files:
    # hgetall returns a snapshot, so fetch() writing to the same hash does
    # not disturb this iteration.
    pages = r1.hgetall(file)
    for k, v in pages.items():
        if v == "":
            print(k)
            fetch(k, r1)

print("done!")