zoukankan      html  css  js  c++  java
  • 爬虫小案例——爬取校花网

    爬取校花网图片

    # 页面路由规律
    # href = "http://www.xiaohuar.com/list-1-0.html" 第一页
    # href = "http://www.xiaohuar.com/list-1-1.html" 第二页
    # href = "http://www.xiaohuar.com/list-1-2.html" 第三页
    # href = "http://www.xiaohuar.com/list-1-3.html" 第四页
    
    
    # 生成所有的页码
    # Generate the index-page URLs for the image list.
    def get_page_url(page_count=2):
        """Yield list-page URLs.

        page_count: number of pages to generate; defaults to 2 to match the
        original behavior, but the route comment above shows at least 4 pages
        exist, so callers can pass a larger value.
        """
        for page in range(page_count):
            yield 'http://www.xiaohuar.com/list-1-{}.html'.format(page)
    # for url in get_page_url():
    #     print(url)
    
    
    from requests_html import HTMLSession
    import os
    session = HTMLSession()
    
    # 第一页解析测试
    # url = 'http://www.xiaohuar.com/list-1-0.html'
    # r = session.request(method='get', url=url, headers=headers)
    # # print(r.text)
    # img_element_list = r.html.find('[class="img"] img')
    # # print(img_element_list)
    # for img_element in img_element_list:
    #     print(img_element.attrs.get('alt'))
    #     print(r.html.base_url[:-1] + img_element.attrs.get('src'))
    
    
    # 解析页面,获取图片名和url
    # Parse one index page: extract each image's display name (from the alt
    # attribute) and its URL, then hand both to save_file for download.
    def parse_page(url):
        r = session.request(method='get', url=url)

        img_element_list = r.html.find('[class="img"] img')

        for img_element in img_element_list:
            # Strip path separators from the alt text so it is a safe file
            # name. BUG FIX: the original wrote replace('\', '') which is a
            # syntax error — a literal backslash must be escaped as '\\'.
            file_name = img_element.attrs.get('alt').replace('/', '').replace('\\', '') + '.png'
            print(file_name)
            file_url = img_element.attrs.get('src')
            # Handle both relative and absolute src values.
            file_url = r.html.base_url[:-1] + file_url if not file_url.startswith('http') else file_url
            save_file(file_name, file_url)
    
    
    # Download one image and write it under the 校花图片 directory.
    def save_file(name, url):
        base_path = '校花图片'
        # BUG FIX: the original never created the directory, so the first
        # open() raised FileNotFoundError; create it idempotently.
        os.makedirs(base_path, exist_ok=True)
        file_path = os.path.join(base_path, name)
        r = session.get(url=url)
        with open(file_path, 'wb') as f:
            f.write(r.content)
            print('%s下载成功' % name)
    
    
    if __name__ == '__main__':
        # Walk every index page and download the images it links to.
        for url in get_page_url():
            parse_page(url)

    爬取校花网视频

    # 页面路由规律
    # http://www.xiaohuar.com/list-3-0.html 第一页
    # http://www.xiaohuar.com/list-3-1.html 第二页
    # http://www.xiaohuar.com/list-3-2.html 第三页
    # http://www.xiaohuar.com/list-3-3.html 第四页
    # http://www.xiaohuar.com/list-3-4.html 第五页
    # http://www.xiaohuar.com/list-3-5.html 第六页
    
    
    from requests_html import HTMLSession
    import os
    session = HTMLSession()
    
    
    # 获取索引页url
    # Yield the six video index-page URLs in order.
    def get_index_page():
        for page_num in range(6):
            yield 'http://www.xiaohuar.com/list-3-{}.html'.format(page_num)
    
    
    # 解析索引页测试
    # url = 'http://www.xiaohuar.com/list-3-5.html'
    # r = session.get(url=url)
    # # print(r.html.find('#images a[class="imglink"]'))
    # for element in r.html.find('#images a[class="imglink"]'):
    #     print(element.attrs.get('href'))
    
    
    # 解析索引页获取详情页url
    # Parse an index page and yield every detail-page URL it links to.
    def get_detail_page(url):
        resp = session.get(url=url)

        for anchor in resp.html.find('#images a[class="imglink"]'):
            href = anchor.attrs.get('href')
            print(href)
            yield href
    
    
    # 测试解析详情页获取视频url,名字
    # url = 'http://www.xiaohuar.com/p-3-13.html'
    # # url = 'http://www.xiaohuar.com/p-3-5.html'
    # r = session.get(url=url)
    # r.html.encoding = 'gbk'
    # file_name = r.html .find('title', first=True).text.replace('\', '')
    #
    # print(file_name)
    #
    # element = r.html.find('#media source', first=True)
    # if element:
    #     video_url = element.attrs.get('src')
    #     print(video_url)
    # else:
    #     video_url = r.html.search('var vHLSurl    = "{}";')[0]
    #     print(video_url)
    
    
    # 解析详情页获取视频url,名字
    # Parse a detail page and return (file_name, video_url, video_type).
    def get_url_name(url):
        r = session.get(url=url)
        # Pages are GBK-encoded; set the encoding before reading text.
        r.html.encoding = 'gbk'
        # Use the page <title> as the file name, stripping backslashes so it
        # is path-safe. BUG FIX: the original wrote replace('\', '') which is
        # a syntax error — a literal backslash must be escaped as '\\'.
        file_name = r.html.find('title', first=True).text.replace('\\', '')
        print(file_name)

        element = r.html.find('#media source', first=True)
        if element:
            # Plain <source src=...> -> direct mp4 download.
            video_url = element.attrs.get('src')
            video_type = 'mp4'
        else:
            # Otherwise the HLS playlist URL is embedded in inline JS.
            video_url = r.html.search('var vHLSurl    = "{}";')[0]
            video_type = 'm3u8'
        return file_name, video_url, video_type
    
    
    # 保存文件
    # Dispatch on video type: delegate m3u8 playlists, write mp4 bytes directly.
    def save(file_name, video_url, video_type):
        if video_type == 'm3u8':
            save_m3u8(file_name, video_url)
        elif video_type == 'mp4':
            resp = session.get(url=video_url)
            with open(file_name + '.mp4', 'wb') as out:
                out.write(resp.content)
    
    
    # 处理m3u8
    # Download an m3u8 playlist plus every .ts segment it references into a
    # directory named after the video.
    def save_m3u8(file_name, video_url):
        if not os.path.exists(file_name):
            os.mkdir(file_name)
        r = session.get(url=video_url)
        m3u8_path = os.path.join(file_name, 'playlist.m3u8')
        with open(m3u8_path, 'wb') as f:
            f.write(r.content)
        # BUG FIX: the original iterated `for line in r.text`, which walks the
        # playlist character by character, so endswith('ts') never matched and
        # no segment was ever downloaded; iterate real lines instead.
        for line in r.text.splitlines():
            if line.endswith('ts'):
                ts_url = video_url.replace('playlist.m3u8', line)
                ts_path = os.path.join(file_name, line)
                r1 = session.get(url=ts_url)
                with open(ts_path, 'wb') as f:
                    f.write(r1.content)
    
    
    if __name__ == '__main__':
        # Crawl every index page, follow each detail link, and save the video.
        for index_url in get_index_page():
            for detail_url in get_detail_page(index_url):
                name, v_url, v_type = get_url_name(detail_url)
                save(name, v_url, v_type)
  • 相关阅读:
    数组中出现次数超过一半的数字
    字符串的排列(important)
    #pragma mark指令的作用
    类的声明和实现
    OC比C中,新增的数据类型
    NSLog (Log信息的输出)
    C语言中文件相关操作
    static 及 extern
    C语言预处理命令
    结构体
  • 原文地址:https://www.cnblogs.com/zhangguosheng1121/p/11323211.html
Copyright © 2011-2022 走看看