zoukankan      html  css  js  c++  java
  • python3 爬虫入门

    import urllib.request;
    import urllib.parse;
    
    # First example: fetch the page and print the body decoded as text.
    url = "http://www.iciba.com/publish"

    # Browser-like request headers sent with the request.
    headers = {
        "Host": "www.iciba.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        # "Accept-Encoding": "gzip, deflate"  (left disabled in this example)
    }

    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)

    # The raw bytes are decoded with the default codec (UTF-8) exactly as
    # received; nothing here inspects the Content-Encoding response header.
    raw_body = response.read()
    print(raw_body.decode())
    

    报错:

    UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

    【解决之道】服务器返回的是经过 gzip 压缩的内容(0x8b 正是 gzip 文件头的第二个字节),而上面的代码没有进行解压缩处理就直接解码,所以报错。应先根据响应头 Content-Encoding 解压缩,再解码:

    import gzip
    import urllib.parse
    import urllib.request
    import zlib

    # Second example: fetch a page while advertising compressed transfer, then
    # undo whatever compression the server actually applied before decoding
    # and printing the body.
    url = "https://www.baidu.com"
    headers = {
        "Host": "www.baidu.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Accept-Encoding": "gzip, deflate",
    }

    request = urllib.request.Request(url=url, headers=headers)

    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request) as response:
        content = response.read()
        info = response.info()
        # Header values are compared case-insensitively; a missing header
        # means the body was sent uncompressed.
        encoding = (info.get("Content-Encoding") or "").strip().lower()
        # Use the charset declared in Content-Type, falling back to UTF-8.
        charset = info.get_content_charset() or "utf-8"

    if encoding in ("gzip", "x-gzip"):
        content = gzip.decompress(content)
    elif encoding == "deflate":
        # "deflate" should be a zlib-wrapped stream, but some servers send a
        # raw deflate stream instead, so fall back to that on failure.
        try:
            content = zlib.decompress(content)
        except zlib.error:
            content = zlib.decompress(content, -zlib.MAX_WBITS)

    # Always print the body: an uncompressed response is printed as-is rather
    # than being silently dropped. Undecodable bytes are replaced so that one
    # bad byte does not abort the whole output.
    print(content.decode(charset, errors="replace"))
    
  • 相关阅读:
    HTML5程序设计--SVG
    visual studio 2012 Github
    排序算法--鸡尾酒排序
    排序算法--归并排序
    排序算法--冒泡排序
    排序算法---插入排序
    外语学习的真实方法及误区
    学习新东西的唯一方法
    如何做好一个面试官——之学习篇
    求职者和面试官如何做好电话面试
  • 原文地址:https://www.cnblogs.com/liwuming/p/10851045.html
Copyright © 2011-2022 走看看