urllib2.HTTPError: HTTP Error 403: Forbidden
# coding:utf8
__author__ = 'xgqfrms'
__editor__ = 'vscode'
__version__ = '1.0.1'
__copyright__ = """
Copyright (c) 2012-2050, xgqfrms; mailto:xgqfrms@xgqfrms.xyz
"""
# Python 2.7 version
import urllib2
import cookielib
# url = "https://rollbar.com/docs/"
# url = "https://cdn.xgqfrms.xyz/json/"
# Endpoint requested by each of the approaches in this script.
url = "http://cdn.xgqfrms.xyz/json/"
# NOTE(review): the author recorded a "403: Forbidden" urllib2.HTTPError here,
# presumably seen with one of the https:// URLs commented out above -- confirm.

# Approach 1: hand the URL string straight to urlopen().
print('第一种方法')
response1 = urllib2.urlopen(url)
status1 = response1.getcode()
print(status1)
# Only the size of the body is reported for this approach.
payload1 = response1.read()
print(len(payload1))
# Approach 2: wrap the URL in a Request so a User-Agent header is sent.
print('第二种方法')
request = urllib2.Request(url, headers={"user-agent": "Mozilla/5.0"})
response2 = urllib2.urlopen(request)
print(response2.getcode())
# This time the full response body is printed.
print(response2.read())
# Approach 3: install a process-wide opener that stores cookies in a jar.
print('第三种方法')
# Fixed: the class is cookielib.CookieJar ("CookiJar" raised AttributeError).
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# After install_opener(), every later urllib2.urlopen() call uses this opener.
urllib2.install_opener(opener)
response3 = urllib2.urlopen(url)
print(response3.getcode())
# Show whatever cookies the server set during the request.
print(cj)
print(response3.read())
HTTP Header & UA bug
反爬虫策略问题,识别,防御
import urllib2,cookielib
# Request a page while sending a full set of browser-like headers.
site = "http://www.nseindia.com/live_market/dynaContent/live_watch/get_quote/getHistoricalData.jsp?symbol=JPASSOCIAT&fromDate=1-JAN-2012&toDate=1-AUG-2012&datePeriod=unselected&hiddDwnld=true"
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}
req = urllib2.Request(site, headers=hdr)
try:
    page = urllib2.urlopen(req)
except urllib2.HTTPError as e:
    # Print the error body returned by the server to help diagnose the failure.
    print(e.fp.read())
else:
    # Only read the page when the request succeeded; previously `page` was
    # unbound after an HTTPError and the read raised NameError.
    content = page.read()
    print(content)
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
https://stackoverflow.com/questions/13303449/urllib2-httperror-http-error-403-forbidden
Bot policy
机器人策略
# Python 2
# Build the request first, then attach the custom User-Agent header.
req = urllib2.Request(url)
req.add_header('User-Agent', "Magic Browser")
con = urllib2.urlopen(req)
print(con.read())
# Python 3
# Import the submodule explicitly: a bare `import urllib` does not load
# `urllib.request`, so `urllib.request.Request` would raise AttributeError.
import urllib.request
req = urllib.request.Request(url, headers={'User-Agent': "Magic Browser"})
# The response object is a context manager; `with` closes the connection.
with urllib.request.urlopen(req) as con:
    print(con.read())
# Fetch a Wikipedia page; on an HTTP error, print the body the server sent back.
try:
    f = urllib2.urlopen('http://en.wikipedia.org/wiki/OpenCola_(drink)')
except urllib2.HTTPError as e:
    print(e.fp.read())
PyCharm 报错提示 too broad exception clauses 的完美解决方案!
except Exception as e:
logging.exception(e)
https://meta.wikimedia.org/wiki/Bot_policy#Unacceptable_usage
https://download.wikimedia.org/
refs
https://www.imooc.com/qadetail/347586
©xgqfrms 2012-2020
www.cnblogs.com 发布文章使用:只允许注册用户才可以访问!