  • Python Training, Day 4

    Today was the fourth day of the Python practical training; the material covered was mainly practical, hands-on skills.

    I. Python web scraping:

    1. Scraping every video on the Pearvideo homepage:

     
    '''
    Scrape Pearvideo:
    Request URL:
        https://www.pearvideo.com/

    Request method:
        GET

    Request headers:
        user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36
    '''
     
    # import requests
    # import re  # regex module
    #
    # # 1. Send a request to the Pearvideo homepage and get the response
    # response = requests.get(url='https://www.pearvideo.com/')
    # print(response.status_code)
    # print(response.text)
    #
    # # re.findall('pattern', 'text to parse', flags)
    # # re.S: lets . match newlines too, so the pattern is applied across the whole text
    # # . matches any single character
    # # *? repeats it any number of times, as few as possible (non-greedy)
    # '''
    # <a href="video_1543373"
    # <a href="video_(.*?)"  # extracts 1543373
    # '''
    #
    # # 2. Get the detail-page IDs from the homepage
    # res = re.findall('<a href="video_(.*?)"', response.text, re.S)
    # print(res)
    #
    #
    # for m_id in res:
    #     # Build the detail-page url
    #     detail_url = 'https://www.pearvideo.com/video_' + m_id
    #     print(detail_url)
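
    As a quick sanity check of the non-greedy capture and the re.S flag, here is a minimal, self-contained snippet (the two-line HTML string is made up for illustration):

    import re

    html = '<a href="video_111">one</a>\n<a href="video_222">two</a>'
    # (.*?) is non-greedy: it stops at the first closing quote,
    # and re.S lets the pattern scan across line breaks
    print(re.findall('<a href="video_(.*?)"', html, re.S))  # ['111', '222']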
     
     
     
    import requests
    import re  # regex module
    # uuid.uuid4() generates a random string that is unique for all practical purposes
    # (it is uuid1, not uuid4, that is derived from the timestamp)
    import uuid
     
    # The crawler's three-step routine
     
    # 1. Send the request
    def get_page(url):
        response = requests.get(url)
        return response
     
    # 2. Parse the data
    # Parse the homepage for the video detail-page IDs
    def parse_index(text):
        res = re.findall('<a href="video_(.*?)"', text, re.S)
        # print(res)
     
        detail_url_list = []
        for m_id in res:
            # Build the detail-page url
            detail_url = 'https://www.pearvideo.com/video_' + m_id
            # print(detail_url)
            detail_url_list.append(detail_url)
     
        # print(detail_url_list)
     
        return detail_url_list
     
    # Parse a detail page for the video url
    def parse_detail(text):
        '''
            (.*?): captures the content inside the parentheses
            .*?: matches without capturing
            <video webkit-playsinline="" playsinline="" x-webkit-airplay="" autoplay="autoplay" src="https://video.pearvideo.com/mp4/adshort/20190613/cont-1566073-14015522_adpkg-ad_hd.mp4" style="width: 100%; height: 100%;"></video>

        Regex: <video.*?src="(.*?)"

        # The above was just the analysis process; it does not go into the code

        Regex: srcUrl="(.*?)"
        '''
        movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
        return movie_url
     
     
    # 3. Save the data
    def save_movie(movie_url):
        response = requests.get(movie_url)
        # Write the video to a local file
        with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
            f.write(response.content)
            f.flush()
     
    if __name__ == '__main__':  # type 'main' + Enter (PyCharm live template)
     
        # 1. Send a request to the homepage
        index_res = get_page(url='https://www.pearvideo.com/')
     
        # 2. Parse the homepage and extract the detail-page ids
        detail_url_list = parse_index(index_res.text)
        # print(detail_url_list)
     
        # 3. Send a request to each detail-page url
        for detail_url in detail_url_list:
            detail_res = get_page(url=detail_url)
            print(detail_res.text)
     
            # 4. Parse the detail page for the video url
            movie_url = parse_detail(detail_res.text)
            print(movie_url)
     
            # 5. Save the video
            save_movie(movie_url)
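
    One caveat in save_movie above: response.content buffers the entire video in memory before writing. A hedged variant below uses the stream=True mode that requests documents for large downloads; the 1 MB chunk size and the uuid filename scheme are arbitrary choices, not anything Pearvideo requires:

    import uuid
    import requests

    def save_movie_streaming(movie_url):
        # stream=True defers downloading the body until we iterate over it
        with requests.get(movie_url, stream=True) as response:
            response.raise_for_status()
            with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
                # Write in 1 MB chunks so memory stays flat regardless of video size
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)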
    

      2. A higher-performance version of the same crawl:

    import requests
    import re  # regex module
    # uuid.uuid4() generates a random, effectively unique string
    import uuid
    # Import the thread-pool module
    from concurrent.futures import ThreadPoolExecutor
    # Cap the pool at 50 threads
    pool = ThreadPoolExecutor(50)
     
    # The crawler's three-step routine
     
    # 1. Send the request
    def get_page(url):
        print(f'Starting async task: {url}')
        response = requests.get(url)
        return response
     
     
    # 2. Parse the data
    # Parse the homepage for the video detail-page IDs
    def parse_index(res):
     
        response = res.result()
        # Extract all the IDs from the homepage
        id_list = re.findall('<a href="video_(.*?)"', response.text, re.S)
        # print(res)
     
        # Loop over the id list
        for m_id in id_list:
            # Build the detail-page url
            detail_url = 'https://www.pearvideo.com/video_' + m_id
            # print(detail_url)
            # Submit the detail-page url to get_page; parse_detail runs on completion
            pool.submit(get_page, detail_url).add_done_callback(parse_detail)
     
     
    # Parse a detail page for the video url
    def parse_detail(res):
        response = res.result()
        movie_url = re.findall('srcUrl="(.*?)"', response.text, re.S)[0]
        # Asynchronously submit the video url to get_page; its result is passed on to save_movie
        pool.submit(get_page, movie_url).add_done_callback(save_movie)
     
     
    # 3. Save the data
    def save_movie(res):
     
        movie_res = res.result()
     
        # Write the video to a local file
        with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
            f.write(movie_res.content)
            print(f'Video download finished: {movie_res.url}')
            f.flush()
     
     
    if __name__ == '__main__':  # type 'main' + Enter (PyCharm live template)
     
        # Step 1: send an async request via get_page and hand the result to parse_index
        url = 'https://www.pearvideo.com/'
        pool.submit(get_page, url).add_done_callback(parse_index)
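
    The add_done_callback chain is the heart of this version: each callback receives a concurrent.futures.Future, not the return value, and must call .result() to unwrap it (which also re-raises any exception from the task). A minimal standalone sketch of the pattern, with toy functions in place of the crawler:

    from concurrent.futures import ThreadPoolExecutor

    pool = ThreadPoolExecutor(4)

    def work(x):
        return x * 2

    def on_done(future):
        # The callback runs when the task finishes; .result() unwraps the Future
        print(f'result: {future.result()}')

    pool.submit(work, 21).add_done_callback(on_done)
    pool.shutdown(wait=True)  # block until every submitted task has completed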
    

      3. Using requests in detail:

    '''
    GET request walkthrough

    User-Agent
    # Visit Zhihu Explore
    Request URL:
        https://www.zhihu.com/explore

    Request method:
        GET

    Request headers:
        user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

        cookies
    '''
     
    # Visit Zhihu without any headers
    # import requests
    # response = requests.get(url='https://www.zhihu.com/explore')
    # print(response.status_code)  # 400
    # print(response.text)  # returns an error page
     
     
    # Visit Zhihu with request headers attached:
    import requests
     
    # Request-header dict
    # headers = {
    #     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
    # }
    # Add the user-agent inside the get request
    # response = requests.get(url='https://www.zhihu.com/explore', headers=headers)
    # print(response.status_code)  # 200
    # # print(response.text)
    # with open('zhihu.html', 'w', encoding='utf-8') as f:
    #     f.write(response.text)
     
     
    '''
    The params request argument
    Baidu search urls for the query 安徽工程大学, page by page:
    https://www.baidu.com/s?wd=安徽工程大学&pn=10
    https://www.baidu.com/s?wd=安徽工程大学&pn=20
    '''
    from urllib.parse import urlencode
    # url = 'https://www.baidu.com/s?wd=%E8%94%A1%E5%BE%90%E5%9D%A4'
    # url = 'https://www.baidu.com/s?' + urlencode({"wd": "蔡徐坤"})
    url = 'https://www.baidu.com/s?'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
    }
    # print(url)
    # Pass the params argument to the get method
    # response = requests.get(url, headers=headers, params={"wd": "安徽工程大学"})
    response = requests.get(url, headers=headers, params={"wd": "安徽工程大学", "pn": "20"})
    # print(response.text)
    with open('gongcheng2.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
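
    The urlencode import above hints at what params does under the hood: requests percent-encodes the dict into the query string for you. A small sketch using requests' public Request/PreparedRequest API to inspect the final url without sending anything over the network:

    import requests
    from urllib.parse import urlencode

    params = {"wd": "安徽工程大学", "pn": "20"}

    # Build the request without sending it, just to inspect the final url
    prepared = requests.Request('GET', 'https://www.baidu.com/s', params=params).prepare()
    print(prepared.url)
    # The same percent-encoded query string, built by hand:
    print('https://www.baidu.com/s?' + urlencode(params))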
     
     
    '''
    Carrying cookies
    Use a logged-in user's cookies to get past GitHub's login check

    Request URL:
        https://github.com/settings/emails

    Request method:
        GET

    Request headers:
        User-Agent
         
        Cookie: has_recent_activity=1; _ga=GA1.2.1416117396.1560496852; _gat=1; tz=Asia%2FShanghai; _octo=GH1.1.1728573677.1560496856; _device_id=1cb66c9a9599576a3b46df2455810999; user_session=1V8n9QfKpbgB-DhS4A7l3Tb3jryARZZ02NDdut3J2hy-8scm; __Host-user_session_same_site=1V8n9QfKpbgB-DhS4A7l3Tb3jryARZZ02NDdut3J2hy-8scm; logged_in=yes; dotcom_user=TankJam; _gh_sess=ZS83eUYyVkpCWUZab21lN29aRHJTUzgvWjRjc2NCL1ZaMHRsdGdJeVFQM20zRDdPblJ1cnZPRFJjclZKNkcrNXVKbTRmZ3pzZzRxRFExcUozQWV4ZG9kOUQzZzMwMzA2RGx5V2dSaTMwaEZ2ZDlHQ0NzTTBtdGtlT2tVajg0c0hYRk5IOU5FelYxanY4T1UvVS9uV0YzWmF0a083MVVYVGlOSy9Edkt0aXhQTmpYRnVqdFAwSFZHVHZQL0ZyQyt0ZjROajZBclY4WmlGQnNBNTJpeEttb3RjVG1mM0JESFhJRXF5M2IwSlpHb1Mzekc5M0d3OFVIdGpJaHg3azk2aStEcUhPaGpEd2RyMDN3K2pETmZQQ1FtNGNzYnVNckR4aWtibkxBRC8vaGM9LS1zTXlDSmFnQkFkWjFjanJxNlhCdnRRPT0%3D--04f6f3172b5d01244670fc8980c2591d83864f60
         
    '''
    import requests
     
    # Request url
    url = 'https://github.com/settings/emails'
     
    # Request headers
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
        # Cookies can also be spliced directly into the request headers
        # 'Cookie': 'has_recent_activity=1; _ga=GA1.2.1416117396.1560496852; _gat=1; tz=Asia%2FShanghai; _octo=GH1.1.1728573677.1560496856; _device_id=1cb66c9a9599576a3b46df2455810999; user_session=1V8n9QfKpbgB-DhS4A7l3Tb3jryARZZ02NDdut3J2hy-8scm; __Host-user_session_same_site=1V8n9QfKpbgB-DhS4A7l3Tb3jryARZZ02NDdut3J2hy-8scm; logged_in=yes; dotcom_user=TankJam; _gh_sess=ZS83eUYyVkpCWUZab21lN29aRHJTUzgvWjRjc2NCL1ZaMHRsdGdJeVFQM20zRDdPblJ1cnZPRFJjclZKNkcrNXVKbTRmZ3pzZzRxRFExcUozQWV4ZG9kOUQzZzMwMzA2RGx5V2dSaTMwaEZ2ZDlHQ0NzTTBtdGtlT2tVajg0c0hYRk5IOU5FelYxanY4T1UvVS9uV0YzWmF0a083MVVYVGlOSy9Edkt0aXhQTmpYRnVqdFAwSFZHVHZQL0ZyQyt0ZjROajZBclY4WmlGQnNBNTJpeEttb3RjVG1mM0JESFhJRXF5M2IwSlpHb1Mzekc5M0d3OFVIdGpJaHg3azk2aStEcUhPaGpEd2RyMDN3K2pETmZQQ1FtNGNzYnVNckR4aWtibkxBRC8vaGM9LS1zTXlDSmFnQkFkWjFjanJxNlhCdnRRPT0%3D--04f6f3172b5d01244670fc8980c2591d83864f60'
    }
    # github_res = requests.get(url, headers=headers)
     
    cookies = {
        'Cookie': 'has_recent_activity=1; _ga=GA1.2.1416117396.1560496852; _gat=1; tz=Asia%2FShanghai; _octo=GH1.1.1728573677.1560496856; _device_id=1cb66c9a9599576a3b46df2455810999; user_session=1V8n9QfKpbgB-DhS4A7l3Tb3jryARZZ02NDdut3J2hy-8scm; __Host-user_session_same_site=1V8n9QfKpbgB-DhS4A7l3Tb3jryARZZ02NDdut3J2hy-8scm; logged_in=yes; dotcom_user=TankJam; _gh_sess=ZS83eUYyVkpCWUZab21lN29aRHJTUzgvWjRjc2NCL1ZaMHRsdGdJeVFQM20zRDdPblJ1cnZPRFJjclZKNkcrNXVKbTRmZ3pzZzRxRFExcUozQWV4ZG9kOUQzZzMwMzA2RGx5V2dSaTMwaEZ2ZDlHQ0NzTTBtdGtlT2tVajg0c0hYRk5IOU5FelYxanY4T1UvVS9uV0YzWmF0a083MVVYVGlOSy9Edkt0aXhQTmpYRnVqdFAwSFZHVHZQL0ZyQyt0ZjROajZBclY4WmlGQnNBNTJpeEttb3RjVG1mM0JESFhJRXF5M2IwSlpHb1Mzekc5M0d3OFVIdGpJaHg3azk2aStEcUhPaGpEd2RyMDN3K2pETmZQQ1FtNGNzYnVNckR4aWtibkxBRC8vaGM9LS1zTXlDSmFnQkFkWjFjanJxNlhCdnRRPT0%3D--04f6f3172b5d01244670fc8980c2591d83864f60'
    }
     
    github_res = requests.get(url, headers=headers, cookies=cookies)
     
    # If the cookies were accepted, the logged-in page contains account details (here, the bound phone number)
    print('15622792660' in github_res.text)
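
    A tidier way to carry cookies across requests is requests.Session, which stores cookies from every response and sends them back on later calls. A minimal sketch of the mechanism; note that the hand-set cookie below is only illustrative and will not actually pass GitHub's authentication:

    import requests

    session = requests.Session()
    session.headers.update({
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
    })
    # Seed the session's cookie jar by hand (normally a login request would do this)
    session.cookies.set('logged_in', 'yes', domain='github.com')

    # Every request made through this session now carries the stored cookies
    response = session.get('https://github.com/settings/emails')
    print(response.status_code)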
    

      4. Example: scraping the Douban Top 250 movie list:

    '''
    Homepage:
        https://movie.douban.com/top250
        GET
        User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

    re regex:
        # detail-page url, poster link, title, rating, number of raters
        <div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价
    '''
    import requests
    import re
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
    }
    # 1. Send a request to the Douban TOP250 page and get the response
    response = requests.get(url, headers=headers)
     
    # print(response.text)
     
    # 2. Extract the data with a regex
    # detail-page url, poster link, title, rating, number of raters
    movie_content_list = re.findall(
        # regex pattern
        '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',

        # text to parse
        response.text,

        # match flags
        re.S)
     
    for movie_content in movie_content_list:
        # Unpack one movie's fields from the tuple
        detail_url, movie_jpg, name, point, num = movie_content
        data = f'Title: {name},   detail url: {detail_url}, poster url: {movie_jpg}, rating: {point}, raters: {num}\n'
        print(data)

        # 3. Save the data: append each movie's info to a file
        with open('douban.txt', 'a', encoding='utf-8') as f:
            f.write(data)
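
    The snippet above only covers the first page of 25 movies. The Top 250 list is paginated with a start query parameter (start=0, 25, ..., 225), so a hedged extension loops over the pages, reusing the url, headers, and regex defined above; the one-second pause is a politeness choice, not a site requirement:

    import time

    for start in range(0, 250, 25):
        page_res = requests.get(url, headers=headers, params={'start': start})
        for movie_content in re.findall(
                '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',
                page_res.text, re.S):
            detail_url, movie_jpg, name, point, num = movie_content
            print(name, point, num)
        time.sleep(1)  # brief delay between pages to avoid hammering the site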
    

      

     
     
     