# -*- coding: utf-8 -*-
"""Simple sitemap crawler.

Created on 2019-05-06 (originally: 2019年5月6日)
@author: Xue Weiwei (薛卫卫)
"""
import urllib.request
import re


def download(url, user_agent="wswp", num_retries=2):
    """Download *url* and return the raw response body as bytes.

    Sends a request with the given ``User-agent`` header.  On failure the
    error is printed and ``None`` is returned; server-side (5xx) errors are
    retried up to *num_retries* times.

    :param url: absolute URL to fetch
    :param user_agent: value for the ``User-agent`` request header
    :param num_retries: remaining retry attempts for 5xx responses
    :return: response bytes, or ``None`` if the download failed
    """
    print("Downloading: ", url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.request.URLError as e:
        print('Download error:', e.reason)
        html = None
        # Retry only on 5xx server errors -- client errors (4xx) and
        # unreachable hosts will not recover by retrying.
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html


def crawl_sitemap(url):
    """Fetch the sitemap at *url* and download every ``<loc>`` link in it."""
    # download the sitemap file
    sitemap = download(url)
    # Bug fix: the original called .decode() unconditionally and crashed
    # with AttributeError when the sitemap download failed (returned None).
    if sitemap is None:
        return
    # urlopen().read() returns bytes; decode before regex matching
    sitemap = sitemap.decode('utf-8')
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
        # ...


if __name__ == "__main__":
    # Guarded so importing this module does not trigger a crawl.
    crawl_sitemap("http://example.webscraping.com/sitemap.xml")