# -*- coding: utf-8 -*-
"""Simple sitemap crawler.

Created on 2019-05-06 (originally: 2019年5月6日)
@author: Xue Weiwei (薛卫卫)
"""
import urllib.request
import re


def download(url, user_agent="wswp", num_retries=2):
    """Download *url* and return the raw response body as bytes.

    Sends a request with the given ``User-agent`` header.  On failure the
    error is printed and ``None`` is returned; server-side (5xx) errors are
    retried up to *num_retries* times.

    :param url: absolute URL to fetch
    :param user_agent: value for the ``User-agent`` request header
    :param num_retries: remaining retry attempts for 5xx responses
    :return: response bytes, or ``None`` if the download failed
    """
    print("Downloading: ", url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.request.URLError as e:
        print('Download error:', e.reason)
        html = None
        # Retry only on 5xx server errors -- client errors (4xx) and
        # unreachable hosts will not recover by retrying.
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html


def crawl_sitemap(url):
    """Fetch the sitemap at *url* and download every ``<loc>`` link in it."""
    # download the sitemap file
    sitemap = download(url)
    # Bug fix: the original called .decode() unconditionally and crashed
    # with AttributeError when the sitemap download failed (returned None).
    if sitemap is None:
        return
    # urlopen().read() returns bytes; decode before regex matching
    sitemap = sitemap.decode('utf-8')
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
        # ...


if __name__ == "__main__":
    # Guarded so importing this module does not trigger a crawl.
    crawl_sitemap("http://example.webscraping.com/sitemap.xml")