  • A browser-impersonating ("highly disguised") wrapper class

    1. The browser-impersonating wrapper class: HeadersHelper.py

    import http.cookiejar
    import urllib.parse
    import urllib.request


    class HeadersHelper:
        def __init__(self, url, path=None):
            # Percent-encode the URL while keeping the characters that structure it.
            self.url = urllib.request.quote(url, safe='/:?=', encoding='utf-8')
            self.path = path

        # Set request headers that closely imitate a real browser.
        def set_Headers(self):
            # Note: "Accept-Encoding": "gb2312, utf-8" deliberately omits gzip/deflate,
            # so the response comes back uncompressed and can be decoded without garbling.
            # The Host header is derived from the target URL rather than hard-coded.
            headers = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                       "Accept-Encoding": "gb2312, utf-8",
                       "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                       "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
                       "Connection": "keep-alive",
                       "Host": urllib.parse.urlparse(self.url).netloc}
            # Attach a cookie jar so cookies set by the site are sent back automatically.
            cjar = http.cookiejar.CookieJar()
            opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
            opener.addheaders = list(headers.items())
            urllib.request.install_opener(opener)

        # Return the page content as text.
        def feedbak_info(self):
            self.set_Headers()
            # Some pages are utf-8 and others gbk
            # (http://fjrs168.blog.hexun.com, for example, needs gbk);
            # see the charset sketch after this class for a header-based alternative.
            try:
                info = urllib.request.urlopen(self.url).read().decode('utf-8')
            except UnicodeDecodeError:
                info = urllib.request.urlopen(self.url).read().decode('gbk')
            return str(info)

        # Save the raw page content to self.path.
        def save_InFile(self):
            self.set_Headers()
            info = urllib.request.urlopen(self.url).read()
            with open(self.path, 'wb') as file:
                file.write(info)
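
    As a side note (not part of the original post), the utf-8/gbk guesswork in feedbak_info can also be replaced by reading the charset the server declares in its Content-Type header. A minimal sketch, assuming the server actually declares one (fetch_text is a hypothetical helper, not part of HeadersHelper):

    import urllib.request

    def fetch_text(url):
        # Fetch a page and decode it with the charset the server declares,
        # falling back to utf-8 and then gbk when none is declared.
        with urllib.request.urlopen(url) as resp:
            raw = resp.read()
            charset = resp.headers.get_content_charset()
        if charset:
            return raw.decode(charset, errors='replace')
        for enc in ('utf-8', 'gbk'):
            try:
                return raw.decode(enc)
            except UnicodeDecodeError:
                continue
        return raw.decode('utf-8', errors='replace')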

    2. Test script: headershelper_test.py

    from HeadersHelper import HeadersHelper

    url = "https://www.zhibo8.cc"
    # ==============================
    # Print the page text instead of saving it:
    # hh = HeadersHelper(url)
    # print(hh.feedbak_info())
    # ==============================
    path = "E:/workspace/PyCharm/codeSpace/books/python_web_crawler_book/chapter6/demo5/headershelper.html"
    hh = HeadersHelper(url, path=path)
    hh.save_InFile()
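
    For comparison, here is a minimal sketch (not from the original post) of the same browser impersonation done per request with urllib.request.Request, which avoids installing a global opener; the URL and headers are simply the ones used in the examples above:

    import urllib.request

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    # The spoofed headers apply only to this request; no global opener state is touched.
    req = urllib.request.Request("https://www.zhibo8.cc", headers=headers)
    with urllib.request.urlopen(req) as resp:
        print(resp.status, len(resp.read()))

    install_opener, by contrast, is the convenient choice when every subsequent urlopen call in the program should carry the same headers and cookie jar, which is what HeadersHelper relies on.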
  • Original post: https://www.cnblogs.com/xiaomingzaixian/p/7134738.html