  • pressmuSpider

    #!/usr/bin/env python3
    # encoding: utf-8
    import requests
    from random import choice
    from lxml import html
    from urllib.parse import urljoin,quote
    import os
    NAMEURLDIC = {}     # tag name -> tag page URL
    NAMEURLDIC_L2 = {}  # video title -> video page URL
    ualist=["Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp/3.0; http://help.yahoo.com/help/us/ysearch/slurp)"]
    ua=choice(ualist)
    header = {"User-Agent": ua}
    mainurl = "https://press.mu"
    url = "https://press.mu/tag"
    search_url = "https://press.mu/search/{}?p={}"
    def getpage(url):
        # GET a page; return the Response, or None if the request fails.
        try:
            req = requests.get(url=url, headers=header, stream=True)
            req.encoding = req.apparent_encoding
            return req
        except requests.RequestException:
            return None
    def parse(url):
        # Fetch url and return the parsed lxml tree, or None on failure.
        req = getpage(url)
        if req is None or not req.text:
            return None
        return html.fromstring(req.text)
    def buff(url):
        # Thin wrapper kept for the download path in save_file().
        return getpage(url)
    def save_file(title, url, ext="m3u8"):
        # Download url into ./pressimg/<title>.<ext>.
        os.makedirs("pressimg", exist_ok=True)
        req = buff(url)
        if req is not None:
            with open(f'./pressimg/{title}.{ext}', "wb") as fs:
                fs.write(req.content)
    
    # Build the tag name -> URL map from the tag index page.
    root = parse(url)
    taglist = root.xpath("//section[@id='tag']/ul/li/a")
    for tag in taglist:
        title = tag.xpath("./text()")[0]
        href = urljoin(mainurl, tag.xpath("./@href")[0])
        NAMEURLDIC.setdefault(title, href)
    for k, v in NAMEURLDIC.items():
        # First page of this tag.
        root = parse(v)
        # Video count for the tag.
        v_count = root.xpath("//p[@id='hit']/strong/text()")[0]
        # The next-to-last pager entry holds the highest page number.
        pager = root.xpath("//nav[@id='pager']/ul/li[last()-1]/a/text()")
        v_max_page_num = int(pager[0]) if pager else 1
        print(f'Current tag: {k}, video count: {v_count}')
        for item in range(1, v_max_page_num + 1):
            print(f"Fetching page {item}")
            if item > 1:
                # Pages after the first go through the search endpoint.
                root = parse(search_url.format(quote(k.strip()), item))
            level2list = root.xpath("//section[@class='items']//h2/a")
            for level2 in level2list:
                title_level2 = level2.xpath("./text()")[0]
                href_level2 = urljoin(mainurl, level2.xpath("./@href")[0])
                NAMEURLDIC_L2.setdefault(title_level2, href_level2)
                print(title_level2, href_level2)
                root2 = parse(href_level2)
                videourl = root2.xpath("//div[@id='player']//video/source/@src")[0]
                imgurl = "https:" + root2.xpath("//div[@id='player']//video/@poster")[0]
                print("videourl", videourl)
                print("imgurl", imgurl)
                print("Downloading", f"{title_level2}.jpg")
                save_file(title_level2, videourl)        # playlist manifest
                save_file(title_level2, imgurl, "jpg")   # poster image
    

      

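The `try/except` in `getpage()` swallows every failure silently. If the scrape needs to survive flaky connections, one common alternative (a sketch, not from the original post; the parameter values are illustrative) is to let `requests` retry at the adapter level:

    # Sketch: a Session with automatic retries instead of a bare try/except.
    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    def make_session(ua):
        session = requests.Session()
        session.headers.update({"User-Agent": ua})
        retry = Retry(total=3, backoff_factor=1,
                      status_forcelist=[429, 500, 502, 503, 504])
        session.mount("http://", HTTPAdapter(max_retries=retry))
        session.mount("https://", HTTPAdapter(max_retries=retry))
        return session

    # Usage: session = make_session(ua); session.get(url) reuses one
    # connection pool and retries transient failures automatically.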
  • Original article: https://www.cnblogs.com/c-x-a/p/9055139.html