zoukankan      html  css  js  c++  java
  • python3 爬虫入门

    import urllib.request;
    import urllib.parse;
    
    # Page requested by this first attempt.
    url = "http://www.iciba.com/publish"

    # Browser-like request headers. No Accept-Encoding header is sent here;
    # that entry is kept below only as a disabled comment.
    headers = {
        "Host": "www.iciba.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        # "Accept-Encoding": "gzip, deflate"  (disabled)
    }

    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)

    # Read the raw bytes, then decode them with the default codec (UTF-8)
    # without any decompression step, and print the resulting text.
    raw_body = response.read()
    print(raw_body.decode())
    

    报错:

    UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

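    【原因与解决之道】服务器返回的响应体经过了 gzip 压缩(gzip 数据以字节 0x1f 0x8b 开头,所以报错位置 1 处的字节正是 0x8b),而上面的代码没有进行解压缩处理就直接 decode。解决办法:先根据响应头 Content-Encoding 判断是否为 gzip,用 gzip.decompress 解压后再 decode: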

    import urllib.request;
    import urllib.parse;
    import gzip;
    
    
    
    
    # Fetch a page and print its text, transparently undoing gzip compression.
    url = "https://www.baidu.com"
    headers = {
        "Host": "www.baidu.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        # Advertise only the encoding this script can actually decode;
        # offering "deflate" without handling it would yield undecodable bytes.
        "Accept-Encoding": "gzip",
    }

    request = urllib.request.Request(url=url, headers=headers)

    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request) as response:
        content = response.read()
        # Response headers: how the body is compressed and which charset it uses.
        response_headers = response.info()
        encoding = response_headers.get("Content-Encoding")
        charset = response_headers.get_content_charset() or "utf-8"

    # A server may ignore Accept-Encoding and reply uncompressed, so only
    # decompress when the response explicitly says it is gzip-encoded.
    if encoding is not None and encoding.strip().lower() in ("gzip", "x-gzip"):
        content = gzip.decompress(content)

    # Always print the page, whether or not it was compressed.
    print(content.decode(charset))
    
  • 相关阅读:
    Java 对文件的操作
    快速排序算法
    Java 时间和字符串的处理
    Redis 数据结构之Keys
    [转] Redis系统性介绍
    【转】JAVA 接口
    [转] Python 代码性能优化技巧
    几道关于面试的题目
    随手笔记2
    随手笔记
  • 原文地址:https://www.cnblogs.com/liwuming/p/10851045.html
Copyright © 2011-2022 走看看