zoukankan      html  css  js  c++  java
  • requests模块高级.ipynb、获取cookie、代理操作、代理池、爬西刺免费代理IP、爬雪球网、模拟登陆古诗文网、验证码的识别、进程(multiprocessing)中的线程(dummy)、协程、多任务、flask_server、单线程+多任务异步协程在爬虫中的应用、

    - HTTPConnectionPool:
    - 原因:
    - 1.短时间内发起了高频的请求导致ip被禁
    - 2.http连接池中的连接资源被耗尽
    - 解决:
    - 1.代理
    - 2.headers中加入Connection:"close"
    - 代理:代理服务器,可以接受请求然后将其转发。
    - 匿名度
    - 高匿:啥也不知道
    - 匿名:知道你使用了代理,但是不知道你的真实ip
    - 透明:知道你使用了代理并且知道你的真实ip
    - 类型:
    - http
    - https
    - 免费代理:
    - www.goubanjia.com
    - 快代理
    - 西祠代理
    - http://http.zhiliandaili.cn/

    获取cookie.py

    import os
    import sqlite3
    import win32crypt

    username = os.environ.get('USERNAME')
    cookie_file = 'C:/Users/{UserName}/AppData/Local/Google/Chrome/User Data/Default/Cookies'.format(UserName=username)
    con = sqlite3.connect(cookie_file)
    cursor = con.cursor()
    sql = 'SELECT host_key, name, value, encrypted_value FROM cookies WHERE name = "xxxxx"' and 'host_key="xxxxx";'
    try:
    if cursor.execute(sql):
    for en_value in cursor:
    pwdHash = en_value[3]
    if pwdHash:
    ret = win32crypt.CryptUnprotectData(pwdHash, None, None, None, 0)
    a = bytes.decode(ret[1])
    except Exception as e:
    print(e)

    代理操作.py

    import requests

    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
    }
    url = "https://www.baidu.com/s?wd=ip"
    page_text = requests.get(url, headers=headers, proxies={"https": "111.231.94.44:8888"}).text
    with open("ip1.html", "w", encoding="utf-8") as fp:
    fp.write(page_text)

    代理池.py

    import random
    import requests

    proxy_list = [
    {'https': '111.231.94.44:8888'},
    {'https': '121.231.94.44:8888'},
    {'https': '131.231.94.44:8888'},
    ]
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
    }
    url = "https://www.baidu.com/s?wd=ip"
    page_text = requests.get(url, headers=headers, proxies=random.choice(proxy_list)).text
    with open("ip1.html", "w", encoding="utf-8") as fp:
    fp.write(page_text)

    爬西刺免费代理IP.py

    import requests
    import random
    from lxml import etree

    # 伪造请求头的连接状态:
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    "Connection": "close",
    }

    # 要爬取的网站:
    url = "https://www.xicidaili.com/nn/%d"

    # 获取代理ip的URL并放入代理池:
    ip_url = "http://ip.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=1&fa=0&fetch_key=&groupid=0&qty=50&time=1&pro=&city=&port=1&format=html&ss=5&css=&dt=1&specialTxt=3&specialJson=&usertype=2"
    page_text = requests.get(ip_url, headers=headers).text
    tree = etree.HTML(page_text)
    ip_list = tree.xpath("//body//text()")
    print("代理池中的ip有:", ip_list)

    # HTTP和HTTPS池中的ip
    proxy_list_http = []
    proxy_list_https = []

    # 拿到前20页的ip信息:ip地址+端口+协议
    for page in range(1, 20):
    new_url = format(url % page)
    ip_port = random.choice(ip_list)
    page_text = requests.get(new_url, headers=headers,
    proxies={'https': ip_port}).text # verity=False是忽略信息操作、proxies是加代理的ip地址和端口
    tree = etree.HTML(page_text)
    tr_list = tree.xpath('//*[@id="ip_list"]//tr')[1:]
    for tr in tr_list:
    ip = tr.xpath('./td[2]/text()')[0]
    port = tr.xpath('./td[3]/text()')[0]
    t_type = tr.xpath('./td[6]/text()')[0]
    ips = ip + ":" + "port"
    if t_type == "HTTP":
    dic = {
    t_type: ips
    }
    proxy_list_http.append(dic)
    else:
    dic = {
    t_type: ips
    }
    proxy_list_https.append(dic)
    print(len(proxy_list_http), len(proxy_list_https)) # HTTP和HTTPS池中的ip数量

    # 检测:
    for ip in proxy_list_http:
    response = requests.get("https://www.sougou.com", headers=headers, proxies={"https": ip})
    if response.status_code == "200":
    print('检测到了可用ip')
    - cookie的处理
    - 手动处理:将cookie封装到headers中
    - 自动处理:session对象。可以创建一个session对象,该对象可以像requests一样进行请求发送。不同之处在于如果在使用session进行请求发送的过程中产生了cookie,则cookie会被自动存储在session对象中。

    爬雪球网.py

    import requests
    from lxml import etree

    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    }
    session = requests.Session()
    session.get("https://xueqiu.com", headers=headers) # 用session发请求
    url = "https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=20365831&count=15&category=-1"
    page_text = session.get(url=url, headers=headers).json()
    print(page_text)

    模拟登陆古诗文网.py

    import requests
    from lxml import etree
    from hashlib import md5


    class Chaojiying_Client(object):
    def __init__(self, username, password, soft_id):
    """用户名、密码、软件id"""
    self.username = username
    password = password.encode('utf8')
    self.password = md5(password).hexdigest()
    self.soft_id = soft_id
    self.base_params = {
    'user': self.username,
    'pass2': self.password,
    'softid': self.soft_id,
    }
    self.headers = {
    'Connection': 'Keep-Alive',
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
    }

    def PostPic(self, im, codetype):
    """
    im: 图片字节
    codetype: 题目类型 参考 http://www.chaojiying.com/price.html
    """
    params = {
    'codetype': codetype,
    }
    params.update(self.base_params)
    files = {'userfile': ('ccc.jpg', im)}
    r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
    headers=self.headers)
    return r.json()

    def ReportError(self, im_id):
    """
    im_id:报错题目的图片ID
    """
    params = {
    'id': im_id,
    }
    params.update(self.base_params)
    r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
    return r.json()


    def tranformImgData(imgPath, t_type):
    """识别功能"""
    chaojiying = Chaojiying_Client('17338132275', '17338132275', '903523') # 用户中心>>软件ID 生成一个替换 96001
    im = open(imgPath, 'rb').read() # 本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
    return chaojiying.PostPic(im, t_type)["pic_str"] # 1902 验证码类型 官方网站>>价格体系 3.4+版 print 后要加()


    # 伪造请求头:
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    }

    # session请求:
    s = requests.Session()

    # 要识别的url:
    url = "https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx"
    page_text = s.get(url, headers=headers).text
    tree = etree.HTML(page_text)

    # 图片的url:
    img_src = 'https://so.gushiwen.org/' + tree.xpath('//*[@id="imgCode"]/@src')[0]

    # 识别到的图片数据:
    img_data = s.get(img_src, headers=headers).content

    # 储存:
    with open("./code.jpg", "wb") as f:
    f.write(img_data)

    print("识别到的验证码为:", tranformImgData("./code.jpg", 1004))

    # 动态获取变化的请求参数
    __VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
    __VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]

    # 获取到的验证码为:
    code_text = tranformImgData("./code.jpg", 1004)
    print("获取到的验证码为", code_text)

    # 登陆的url
    login_url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx'

    # post请求验证的参数:
    data = {
    "__VIEWSTATE": __VIEWSTATE,
    "__VIEWSTATEGENERATOR": __VIEWSTATEGENERATOR,
    "from": "http://so.gushiwen.org/user/collect.aspx",
    "email": "www.zhangbowudi@qq.com",
    "pwd": "bobo328410948",
    "code": code_text,
    "denglu": "登录",
    }
    page_text1 = s.post(url=login_url, headers=headers, data=data).text
    with open("login.html", "w", encoding="utf-8") as fp:
    fp.write(page_text1)
    - 验证码的识别
    - 超级鹰:http://www.chaojiying.com/about.html
    - 注册:(用户中心身份)
    - 登陆:
    - 创建一个软件:899370
    - 下载示例代码
    - 打码兔
    - 云打码
    - 动态变化的请求参数
    - 通常情况下动态变化的请求参数都会被隐藏在前台页面源码中
    ### 单线程+多任务异步协程
    - 协程
    - 在函数(特殊的函数)定义的时候,如果使用了async修饰的话,则该函数调用后会返回一个协程对象,并且函数内部的实现语句不会被立即执行
    - 任务对象
    - 任务对象就是对协程对象的进一步封装。任务对象==高级的协程对象==特殊的函数
    - 任务对象是必须要注册到事件循环对象中
    - 给任务对象绑定回调:爬虫的数据解析中
    - 事件循环
    - 当做是一个容器,容器中必须存放任务对象。
    - 当启动事件循环对象后,则事件循环对象会对其内部存储任务对象进行异步的执行。
    - aiohttp:支持异步网络请求的模块

    进程(multiprocessing)中的线程(dummy).py

    import time
    import requests
    # 导入进程(multiprocessing)中的线程(dummy)
    from multiprocessing.dummy import Pool
    from time import sleep

    # 开始时间:
    start = time.time()

    # urls列表:
    urls = [
    "http://127.0.0.1:5000/index",
    "http://127.0.0.1:5000/index",
    "http://127.0.0.1:5000/index",
    ]


    def get_request(url):
    """获取请求的url"""
    pass


    # 实例化3个线程对象处理异步操作:
    pool = Pool(3)
    pool.map(get_request, urls) # 自定义函数get_request处理每个列表的元素

    print("总耗时", time.time() - start)
    结果: 正在下载: www.1.com
    正在下载: www.2.com
    正在下载: www.3.com
    下载结束
    下载结束
    下载结束
    总耗时 2.011126756668091

    server1.py

    import time
    import requests
    from flask import Flask
    from time import sleep

    # 实例化:
    app = Flask(__name__)


    # 增加路由:
    @app.route("/index")
    def index():
    """主页功能"""
    sleep(2)
    return "hello"

    @app.route("/index1")
    def index1():
    """"""
    sleep(2)
    return "hello1"


    if __name__ == '__main__':
    app.run()

    server2.py

    import time
    import requests
    # 导入进程(multiprocessing)中的线程(dummy)
    from multiprocessing.dummy import Pool
    from time import sleep

    # 开始时间:
    start = time.time()

    # urls列表:
    urls = [
    "http://127.0.0.1:5000/index1",
    "http://127.0.0.1:5000/index",
    ]


    def get_request(url):
    """获取请求的url"""
    page_text = requests.get(url).text
    print(page_text)


    # 实例化3个线程对象处理异步操作:
    pool = Pool(5)
    pool.map(get_request, urls) # 自定义函数get_request处理每个列表的元素

    print("总耗时", time.time() - start)

    协程.py

    import asyncio


    def callback(task):
    """作为任务对象的回调函数"""
    print("i am callback and ", task.result())


    # 函数前面加async修饰就是协程:
    async def test():
    print("i am test()")
    return "bobo"


    c = test() # c是返回的协程对象

    # 封装一个任务对象:
    task = asyncio.ensure_future(c)

    # 给任务对象绑定回调:
    task.add_done_callback(callback)

    # 创建一个事件循环的对象:
    loop = asyncio.get_event_loop()

    # 任务对象放入到事件循环对象中:
    loop.run_until_complete(task)

    多任务.py

    import asyncio
    import time

    start = time.time()


    # 函数前面加async修饰就是协程、在特殊函数内部的实现中不可以出现不支持异步的模块代码
    async def get_request(url):
    asyncio.sleep(2)
    print("下载成功:", url)


    urls = [
    "www.1.com",
    "www.2.com",
    ]

    tasks = [] # 多任务列表
    for url in urls:
    c = get_request(url)
    task = asyncio.ensure_future(c) # 生成了任务对象
    tasks.append(task)

    loop = asyncio.get_event_loop() # 创建事件循环对象
    loop.run_until_complete(asyncio.wait(tasks)) # 多任务对象放入到事件循环对象中并挂起、
    print(time.time() - start)

    flask_server.py

    from flask import Flask
    import time

    app = Flask(__name__)


    @app.route('/bobo')
    def index_bobo():
    time.sleep(2)
    return 'Hello bobo'


    @app.route('/jay')
    def index_jay():
    time.sleep(2)
    return 'Hello jay'


    @app.route('/tom')
    def index_tom():
    time.sleep(2)
    return 'Hello tom'


    if __name__ == '__main__':
    app.run(threaded=True)

    单线程+多任务异步协程在爬虫中的应用.py

    import requests
    import aiohttp
    import time
    import asyncio

    s = time.time()
    urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay'
    ]
    # 特殊的函数:请求发送和响应数据的捕获
    # 细节:在每一个with前加上async,在每一个阻塞操作的前边加上await
    async def get_request(url):
    async with aiohttp.ClientSession() as s:
    async with await s.get(url=url) as response:
    page_text = await response.text()  read()返回的是byte类型的数据
                print(page_text)
    return page_text


    tasks = []
    for url in urls:
    c = get_request(url)
    task = asyncio.ensure_future(c)
    tasks.append(task)

    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))

    print(time.time() - s)
  • 相关阅读:
    Android 使用ViewPager结合PhotoView开源组件实现网络图片在线浏览功能
    Android教程 -06 Activity的生命周期
    Android教程 -05 Android6.0权限的管理
    Android 设置ImageView宽度固定,其高度按比例缩放适应
    一技压身,天下行走
    解析P2P金融的业务安全
    Android Listview中Button按钮点击事件冲突解决办法
    Android 动态设置TextView的drawableLeft等属性
    Android教程 -04 启动其它Activity,静态工厂设计模式传递数据
    瞬息之间与时间之门
  • 原文地址:https://www.cnblogs.com/zhang-da/p/12323932.html
Copyright © 2011-2022 走看看