zoukankan      html  css  js  c++  java
  • Python 爬虫 (二)

    cookiejar模块:

    • 管理、存储cookie,并为发出的http请求自动添加cookie
    • cookie存储在内存中,CookieJar实例被回收后cookie将自动消失

    实例:用cookiejar访问人人网主页

     """Log in to renren.com with a cookie-aware opener, then fetch the home page.

     The login response is JSON; its ``homeUrl`` entry is requested with the
     same opener so that the cookies set during login are sent along.
     """
     import json
     from urllib import request, parse
     from http import cookiejar

     # In-memory cookie store; the processor attaches it to every request made
     # through the opener, so the opener is used in place of urlopen below.
     cookiejar_object = cookiejar.CookieJar()
     cookie_handler = request.HTTPCookieProcessor(cookiejar_object)
     opener = request.build_opener(cookie_handler)

     # Login endpoint of the site.
     url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018721532875'

     # Fields of the login form ('email' / 'password' are placeholders).
     form = {
         'email': 'xxx',
         'icode': '',
         'origURL': 'http://www.renren.com/home',
         'domain': 'renren.com',
         'key_id': '1',
         'captcha_type': 'web_login',
         'password': 'xxx',
         'rkey': '79d8184f25d678248262a91caf3e7ea8',
         'f': 'http%3A%2F%2Fzhibo.renren.com%2Ftop',
     }

     # The request body must be bytes; supplying it makes the request a POST.
     form_b = parse.urlencode(form).encode('utf-8')

     # Close each response as soon as its body has been read.
     with opener.open(url, form_b) as response:
         html_b = response.read()

     res_dict = json.loads(html_b.decode('utf-8'))

     # URL of the personal home page reported by the login response.
     home_url = res_dict.get('homeUrl')
     if not home_url:
         # Without this check a failed login shows up as a bare KeyError.
         raise SystemExit('login failed, no homeUrl in response: %r' % (res_dict,))

     # Visit the personal home page with the cookies collected during login.
     with opener.open(home_url) as response:
         html_bytes = response.read()
     print(html_bytes.decode('utf-8'))

    python Proxy代理

     """Fetch a Baidu search page through an HTTP proxy and save it to disk."""
     from urllib import request

     # Proxy mapping keyed by URL scheme; only 'http' is configured here.
     proxy = {
         'http': 'http://219.141.153.41:80'
     }

     url = 'http://www.baidu.com/s?wd=ip'

     # Build an opener that routes requests through the proxy; opener.open is
     # then used in place of urlopen.
     handler = request.ProxyHandler(proxy)
     opener = request.build_opener(handler)

     # Read the whole body first and close the response, so a failed or
     # timed-out request never leaves a truncated baidu.html behind.
     with opener.open(url, timeout=5) as response:
         page = response.read()

     # Store the page.
     with open('baidu.html', 'wb') as f:
         f.write(page)

    有道词典翻译接口 

     1 import time
     2 import random
     3 import json
     4 from Day1.tuozhan_all import post
     5 
     def md5_my(need_str, encoding='utf-8'):
         """Return the hexadecimal MD5 digest of ``need_str``.

         Args:
             need_str: Text to hash. ``bytes`` / ``bytearray`` are also accepted
                 and hashed as-is.
             encoding: Encoding used to turn a ``str`` into bytes before
                 hashing (default ``'utf-8'``).

         Returns:
             The digest as a 32-character lowercase hex string.
         """
         import hashlib

         # hashlib works on bytes only, so text has to be encoded first.
         if isinstance(need_str, (bytes, bytearray)):
             sign_bytes = bytes(need_str)
         else:
             sign_bytes = need_str.encode(encoding)
         return hashlib.md5(sign_bytes).hexdigest()
    18 
    19 # url
    20 
    def translate(kw):
        """Translate ``kw`` through the Youdao web translation endpoint.

        Builds the signed form the web client sends (``salt`` + ``sign``),
        posts it with the ``post`` helper and returns the first translated
        segment of the JSON reply.

        Args:
            kw: Text to translate (str).

        Returns:
            The ``tgt`` text of the first entry in ``translateResult``.

        Raises:
            KeyError: If the reply has no ``translateResult`` entry.
        """
        url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'

        # NOTE(review): the Cookie value looks like a captured browser session;
        # presumably it expires - confirm before relying on it.
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': 'OUTFOX_SEARCH_USER_ID=-493176930@10.168.8.63; OUTFOX_SEARCH_USER_ID_NCOO=38624120.26076847; SESSION_FROM_COOKIE=unknown; JSESSIONID=aaabYcV4ZOU-JbQUha2uw; ___rl__test__cookies=1534210912076',
            'Host': 'fanyi.youdao.com',
            'Origin': 'http://fanyi.youdao.com',
            'Referer': 'http://fanyi.youdao.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
        }

        # The form needs three computed fields: i (the text), salt and sign.
        key = kw

        # salt mirrors the client-side JS:
        #   (new Date).getTime() + parseInt(10 * Math.random(), 10)
        # i.e. epoch milliseconds plus an integer in 0..9, hence randint(0, 9).
        salt = int(time.time() * 1000 + random.randint(0, 9))
        print('salt:', salt)
        salt_str = str(salt)

        # sign mirrors the client-side JS: u.md5(S + n + r + D), where
        # S is the client name, n the text, r the salt and D a fixed secret.
        client = "fanyideskweb"
        secret = "ebSeFb%=XZ%T[KZ)c(sy!"
        sign_md5_str = md5_my(client + key + salt_str + secret)

        form = {
            'i': key,
            'from': 'AUTO',
            'to': 'AUTO',
            'smartresult': 'dict',
            'client': 'fanyideskweb',
            'salt': salt_str,
            'sign': sign_md5_str,
            'doctype': 'json',
            'version': '2.1',
            'keyfrom': 'fanyi.web',
            'action': 'FY_BY_REALTIME',
            'typoResult': 'false',
        }

        # post() is expected to hand back the raw response body as bytes.
        html_bytes = post(url, form, headers=headers)

        # Parse the JSON text into a dict and pick the first translated segment.
        res_dict = json.loads(html_bytes.decode('utf-8'))
        translate_res = res_dict['translateResult'][0][0]['tgt']
        return translate_res
    80 
    if __name__ == '__main__':
        # Demo run: translate a sample word and print the outcome.
        translated = translate('中国')
        print('翻译的结果:' + translated)
  • 相关阅读:
    怎么制作html5网站页面让它适应电脑和手机的尺寸
    js面向对象 下
    认识面向对象及代码示例
    Math 对象
    js事件驱动函数
    模拟js中注册表单验证
    敏感词过滤 简单 模仿
    模仿随机验证码-简单效果
    字符串方法(函数)
    js中字符串概念
  • 原文地址:https://www.cnblogs.com/pantom0122/p/9478091.html
Copyright © 2011-2022 走看看