zoukankan      html  css  js  c++  java
  • 爬虫学习04.

    对加密数据的爬取
    # Scrape jandan.net's "ooxx" board, where image URLs are base64-obfuscated
    # in the page markup, decode them and download each image to ./dandan/.
    import requests
    from lxml import etree
    import base64
    import os
    from urllib import request

    url = 'http://jandan.net/ooxx/page-46'
    headers = {
        # BUG FIX: original said 'ozilla/5.0' (missing 'M'), which defeats the
        # purpose of spoofing a real browser User-Agent.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    # The site hides each image's path as base64 inside <span class="img-hash">
    # to deter naive scrapers.
    code_list = tree.xpath('//div[@class="text"]/p/span[@class="img-hash"]/text()')
    if not os.path.exists('dandan'):
        os.mkdir('dandan')
    for code in code_list:
        # Decode to recover the protocol-relative image path (e.g. //wx1.sinaimg.cn/...).
        src = base64.b64decode(code).decode()
        img_url = 'https:' + src
        # Name the local file after the last path segment of the image URL.
        filepath = 'dandan/' + src.split('/')[-1]
        request.urlretrieve(url=img_url, filename=filepath)
        print(filepath + '下载成功')
     

    二.验证码处理及模拟登陆

    1.相关的门户网站在进行登录的时候,如果用户连续登录的次数超过3次或者5次的时候,就会在登录页中动态生成验证码。通过验证码达到分流和反爬的效果。
    2.验证码的处理流程:
    - 1.对携带验证码的页面数据进行抓取
    - 2.可以将页面数据中验证码进行解析,验证码图片下载到本地
    - 3.可以将验证码图片提交给三方平台进行识别,返回验证码图片上的数据值
        - 云打码平台:
            - 1.在官网中进行注册(普通用户和开发者用户)
            - 2.登录开发者用户:
                - 1.实例代码的下载(开发文档-》调用实例及最新的DLL-》PythonHTTP实例下载)
                - 2.创建一个软件:我的软件-》添加新的软件
            - 3.使用示例代码中的源码文件中的代码进行修改,让其识别验证码图片中的数据值
    3.账号:1355144989@qq.com  密码1355144989
     
    模拟登陆爬取人人网
     
    # Simulated login to renren.com: download the captcha image, solve it via a
    # third-party captcha service, POST the login form inside a requests.Session
    # so the auth cookies persist, then fetch a logged-in profile page.
    import requests
    from lxml import etree
    from urllib import request

    url = 'http://www.renren.com/SysHome.do'
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    # Captcha image URL embedded in the login page.
    src = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]
    print(src)
    img = request.urlretrieve(url=src, filename='./renren.jpg')
    # NOTE(review): get_code() comes from the 云打码 (cloud captcha) sample code
    # referenced in the notes above; 2004 is presumably the captcha-type id —
    # it must be defined/imported before this script runs.
    code = get_code(2004, './renren.jpg')
    # Session keeps the login cookies for the follow-up request below.
    session = requests.Session()
    login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=201903160368'
    data = {
        "email": "15516092050",
        "icode": code,
        "origURL": "http://www.renren.com/home",
        "domain": "renren.com",
        "key_id": '1',
        "captcha_type": "web_login",
        # Password is pre-hashed client-side, as captured from the browser.
        "password": "5e088a2ee22d34dd081aac25578e67bd3a2d851cdfbcf1f0c9ab7056bd1bad62",
        "rkey": "3f4696f6fa1b89e9061868300bf11484",
        "f": "http%3A%2F%2Fwww.renren.com%2F969395731",
    }
    login_page = session.post(url=login_url, headers=headers, data=data)
    print(login_page.headers)
    detail_url = 'http://www.renren.com/969395731'
    # BUG FIX: original requested `url` (the public login page) here, so the
    # saved HTML was never the logged-in profile; it must fetch `detail_url`.
    detail_content = session.get(url=detail_url, headers=headers).text
    with open('./renren.html', 'w', encoding='utf-8') as f:
        f.write(detail_content)
        print('下载成功')

    三.并发爬取视频

    # Concurrently download videos from pearvideo.com: collect detail-page URLs,
    # extract the real mp4 URL from inline JS on each detail page, then fetch
    # and save the video bytes through a 5-worker thread pool.
    from multiprocessing.dummy import Pool  # thread-backed Pool (I/O-bound work)
    from lxml import etree
    import requests
    import re

    pool = Pool(5)
    url = 'https://www.pearvideo.com/category_3'
    headers = {
        # BUG FIX: original said 'ozilla/5.0' (missing 'M'), which defeats the
        # purpose of spoofing a real browser User-Agent.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    video_url_list = tree.xpath('//*[@id="listvideoListUl"]/li/div/a/@href')
    print(video_url_list)
    after_url_list = []
    film_name_list = []
    for video_url in video_url_list:
        detail_url = 'https://www.pearvideo.com/' + video_url
        video_detail_page = requests.get(url=detail_url, headers=headers).text
        # The playable mp4 URL lives in inline JavaScript (srcUrl="..."), not in
        # the DOM, so it has to be pulled out with a regex.
        after_url = re.findall('ldUrl="",srcUrl="(.*?)"', video_detail_page)[0]
        trees = etree.HTML(video_detail_page)
        # NOTE(review): film_name_list is collected but never used below; kept
        # in case a later version names files after the titles.
        film_name = trees.xpath('//div[@class="video-main"]/div/img[@class="img"]/@alt')[0]
        film_name_list.append(film_name)
        after_url_list.append(after_url)
    print(after_url_list)
    get_video_data = lambda after_url: requests.get(url=after_url, headers=headers).content
    # Fan the downloads out across the thread pool; order of results matches
    # after_url_list.
    video_data_list = pool.map(get_video_data, after_url_list)

    def get_video_name(indexed_data):
        # BUG FIX: the original derived the filename with
        # video_data_list.index(video_data), which returns the FIRST matching
        # index — two identical downloads would overwrite one file. Pairing each
        # payload with its position via enumerate() makes the name unambiguous.
        index, video_data = indexed_data
        name = str(index)
        with open(f'./{name}.mp4', 'wb') as f:
            f.write(video_data)
            print(name + '下载成功')

    pool.map(get_video_name, list(enumerate(video_data_list)))
  • 相关阅读:
    bzoj1593[Usaco2008 Feb]Hotel旅馆
    spoj1182 Sorted bit squence/[USACO2003 Dec]Cow Queueing
    [USACO2003 Dec]Cow Queueing数数的梦 (基础水数位DP带注释!)
    后缀数组模版+注释
    bzoj1690/poj3621[Usaco2007 Dec]奶牛的旅行
    bzoj1731/poj3169[Usaco2005 dec]Layout 排队布局
    bzoj2467[中山市选2010]生成树
    bzoj1594[Usaco2008 Jan]Haybale Guessing猜数游戏
    poj 1035 -- Spell checker
    poj 1611 -- The Suspects
  • 原文地址:https://www.cnblogs.com/hu13/p/9273192.html
Copyright © 2011-2022 走看看