import urllib2

def downloadHtml(url, user_agent=None, num_retries=2):
    # Fetch a URL, retrying up to num_retries times on 5xx server errors
    print 'Downloading:', url
    # Only send a User-agent header when one was actually supplied
    headers = {'User-agent': user_agent} if user_agent else {}
    req = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            # Retry only on 5xx server errors; 4xx client errors
            # will not succeed on a second attempt
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return downloadHtml(url, user_agent, num_retries - 1)
    return html
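A minimal usage sketch for the downloader; the URL and the user-agent string below are illustrative placeholders, not values from the original:

# Hypothetical call: both the URL and the agent string are made up
# for illustration.
html = downloadHtml('http://example.com', user_agent='my-crawler')
if html is not None:
    print 'Fetched %d bytes' % len(html)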
import itertools
import re
import urlparse

def download_id():
    # Download pages by consecutive page number; stop after 5 consecutive errors
    max_count = 5
    error_count = 0
    for i in itertools.count(1):
        url = 'http://xxxx/%s' % i
        html = downloadHtml(url)
        if html is None:
            error_count += 1
            if error_count == max_count:
                break
        else:
            # Reset the counter on any successful download
            error_count = 0

def get_links(html):
    # Extract href values from anchor tags; the original pattern was left
    # empty, so this link-extraction regex is an assumption
    reg = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.S)
    return reg.findall(html)

def link_crawler(seed_url, link_regex):
    # Crawl outward from seed_url, following only links that match link_regex
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = downloadHtml(url)
        # Skip pages that failed to download instead of crashing on None
        if html is None:
            continue
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
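A hedged usage sketch for link_crawler; both the seed URL and the link pattern are placeholders, not from the original:

# Hypothetical usage: crawl every page matching the pattern that is
# reachable from the seed page. URL and regex are illustrative only.
link_crawler('http://example.com/index', r'/(index|view)')

Note that crawl_queue.pop() removes from the end of the list, so the crawl proceeds depth-first; popping from the front (for example with collections.deque and popleft) would make it breadth-first instead.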