zoukankan      html  css  js  c++  java
  • python3 爬虫入门

    import urllib.request
    import urllib.parse

    # Page to download.
    target = "http://www.iciba.com/publish"

    # Browser-like request headers. "Accept-Encoding" ("gzip, deflate") is
    # intentionally left out of this first attempt.
    browser_headers = {
        "Host": "www.iciba.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    }

    # Build the request first, then attach each header individually.
    req = urllib.request.Request(target)
    for header_name, header_value in browser_headers.items():
        req.add_header(header_name, header_value)

    reply = urllib.request.urlopen(req)

    # The raw bytes are decoded with the default codec (UTF-8) without any
    # decompression step; the article reports a UnicodeDecodeError here.
    body = reply.read()
    print(body.decode())
    

    报错:

    UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

    【原因与解决之道】响应体是经过 gzip 压缩的数据(0x1f 0x8b 是 gzip 的文件头),上面的代码没有解压缩就直接 decode,所以报错;先解压缩再解码即可:

    import gzip
    import urllib.parse
    import urllib.request
    import zlib


    def decode_body(raw, content_encoding=None, charset="utf-8"):
        """Undo the HTTP Content-Encoding of a response body and decode it to text.

        Args:
            raw: Response body bytes exactly as read from the socket.
            content_encoding: Value of the ``Content-Encoding`` response header,
                or ``None`` when the header is absent.
            charset: Text encoding used to turn the decompressed bytes into ``str``.

        Returns:
            The decoded body text.

        Raises:
            ValueError: If ``content_encoding`` names a scheme that is not handled.
        """
        # Header values are case-insensitive and may carry stray whitespace.
        scheme = (content_encoding or "").strip().lower()
        if scheme in ("gzip", "x-gzip"):
            raw = gzip.decompress(raw)
        elif scheme == "deflate":
            try:
                # Standard form: a zlib-wrapped deflate stream.
                raw = zlib.decompress(raw)
            except zlib.error:
                # Some servers send a raw deflate stream without the zlib header.
                raw = zlib.decompress(raw, -zlib.MAX_WBITS)
        elif scheme not in ("", "identity"):
            raise ValueError("unsupported Content-Encoding: %r" % content_encoding)
        return raw.decode(charset)


    url = "https://www.baidu.com"
    headers = {
        "Host": "www.baidu.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        # Every scheme advertised here must be handled by decode_body().
        "Accept-Encoding": "gzip, deflate",
    }


    if __name__ == "__main__":
        request = urllib.request.Request(url=url, headers=headers)

        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(request) as response:
            content = response.read()
            # Response metadata: how the body was compressed and which charset
            # the server declared (fall back to UTF-8 when none is given).
            encoding = response.info().get("Content-Encoding")
            charset = response.info().get_content_charset() or "utf-8"

        # Always print the page, whether or not the server compressed it.
        print(decode_body(content, encoding, charset))
    
  • 相关阅读:
    《老男孩》
    java中四种阶乘的计算
    DataOutputStream的乱码问题
    java中飞clone方法
    类反射的简单例子
    PHP: 如何连接oracle数据库进行数据读取
    .NET : 如何读取图片中的元数据信息
    重新审视REST
    PHP 的历史
    Image File Format Specifications {转载}
  • 原文地址:https://www.cnblogs.com/liwuming/p/10851045.html
Copyright © 2011-2022 走看看