zoukankan      html  css  js  c++  java
  • Python小练习批量爬取下载歌曲

    import requests
    import os
    
    # Request headers sent with the kuwo.cn API calls below.
    # NOTE: the 'Cookie' value (which embeds kw_token) and the matching 'csrf'
    # value go stale after the page is refreshed and must then be re-captured
    # from the browser.
    headers = {
        'Cookie': '_ga=GA1.2.701818100.1612092981; _gid=GA1.2.748589379.1612092981; Hm_lvt_cdb524f42f0ce19b169a8071123a4797=1612092982; Hm_lpvt_cdb524f42f0ce19b169a8071123a4797=1612094717; kw_token=ZALW965FXG',
        'csrf': 'ZALW965FXG',
        'Host': 'www.kuwo.cn',
        'Referer': 'https://www.kuwo.cn/singer_detail/1600',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
    }

    # Output directory for the downloaded tracks. makedirs(exist_ok=True)
    # replaces the exists()-then-mkdir() pair, which could fail if the
    # directory appeared between the check and the create.
    os.makedirs('mics', exist_ok=True)
    def Index(page):
        """Download every song listed on one page of the artist's track list.

        Requests page ``page`` (``pn=page``, ``rn=30``) of the kuwo.cn
        ``artistMusic`` API for artist 1600, prints the returned list, and
        hands each entry's ``rid`` and ``name`` to ``musicSave``.
        """
        endpoint = (
            'https://www.kuwo.cn/api/www/artist/artistMusic?artistid=1600&pn={}'
            '&rn=30&httpsStatus=1&reqId=50b03180-63ca-11eb-b714-332080487537'
        ).format(page)

        payload = requests.get(url=endpoint, headers=headers).json()
        tracks = payload['data']['list']
        print(tracks)
        for track in tracks:
            track_id = track['rid']
            title = track['name']
            musicSave(track_id, title)
    
    
    def musicSave(rid,name):
        """Resolve the mp3 URL for one track and save it as ``mics/<name>.mp3``.

        Args:
            rid: Track id, inserted into the kuwo.cn ``/url`` API query.
            name: Track title, used as the output file name.

        The function first asks the ``/url`` API for the real mp3 address
        (the ``url`` field of the JSON reply), then downloads that address and
        writes the bytes to disk.
        """
        url = (
            'https://www.kuwo.cn/url?format=mp3&rid=' + str(rid) +
            '&response=url&type=convert_url3&br=128kmp3&from=web'
            '&t=1612100615341&httpsStatus=1'
            '&reqId=50b38ce1-63ca-11eb-b714-332080487537'
        )
        response = requests.get(url=url, headers=headers).json()
        mp3path = response['url']
        print(mp3path)

        # Deliberately sent WITHOUT the shared headers: with them the download
        # request fails (presumably because they pin 'Host' to www.kuwo.cn
        # while the mp3 is served from another host - TODO confirm).
        data = requests.get(url=mp3path).content

        # Make sure the target directory exists even if this function is
        # called on its own.
        os.makedirs('mics', exist_ok=True)
        # Path separators in a title would otherwise point outside 'mics'.
        safe_name = str(name).replace('/', '_').replace('\\', '_')
        target = os.path.join('mics', '{}.mp3'.format(safe_name))

        # 'wb' rather than 'ab': appending would duplicate the audio bytes in
        # an existing file every time the script is re-run.
        with open(target, 'wb') as f:
            f.write(data)
        print('{}.mp3已经下载完成'.format(name))
    
    
    
    # Crawl result pages 1 through 10 of the artist's track list.
    FIRST_PAGE, LAST_PAGE = 1, 10
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        Index(page)

    1. 地址:文件地址和播放地址都需要通过抓包获取。

    2. 'Cookie' 和 'csrf' 在网页刷新后会失效,需要重新抓取并更新;大量爬取时可以使用代理 IP 并伪造 User-Agent,或者对 js 做逆向(后续更新)。

    出现过的错误:musicSave 中下载音频的那一行(代码里曾用五颗星 ***** 标出)应写成 data = requests.get(url=mp3path).content,不能带 headers。原来我在那里加了 headers,结果请求失败;后面把 headers 去掉后就能用了。失败的写法如下:

      data = requests.get(mp3path,headers=headers).content

    演示:使用免费代理 IP 发送请求(逐个测试代理是否可用)

    import urllib.request
    
    def creat_proxy_handler(url="https://www.baidu.com", proxy_list=None, timeout=1):
        """Probe a list of proxies by fetching ``url`` through each one.

        Args:
            url: Address requested through every proxy.
            proxy_list: Iterable of ``{"scheme": "host:port"}`` dicts as accepted
                by ``urllib.request.ProxyHandler``. ``None`` selects the
                built-in list of free proxies.
            timeout: Per-request timeout in seconds.

        Returns:
            A list with the proxy dicts (as passed in) whose request succeeded.

        Each proxy is printed before it is tried; ``haha`` is printed on
        success and the exception is printed on failure.
        """
        if proxy_list is None:
            proxy_list = [
                {"http": "60.168.207.219:9999"},
                {"http": "58.23.67.208:9999"},
                {"http": "42.7.28.217:9999"},
                {"http": "61.145.49.177:9999"},
                {"http": "36.250.156.78:9999"},
                {"http": "36.248.133.145:9999"},
                {"http": "42.56.238.117:9999"},
                {"http": "36.249.119.34:9999"},
                {"http": "58.22.177.60:9999"},
            ]

        working = []
        for proxy in proxy_list:
            print(proxy)
            # ProxyHandler only proxies the schemes present in its mapping.
            # The entries above only name "http", so an https:// URL would be
            # fetched directly and every proxy would look alive. Reuse the
            # http address for https unless the caller supplied one.
            scheme_map = dict(proxy)
            if "http" in scheme_map:
                scheme_map.setdefault("https", scheme_map["http"])

            # Build an opener that sends requests through this proxy.
            proxy_handler = urllib.request.ProxyHandler(scheme_map)
            opener = urllib.request.build_opener(proxy_handler)
            try:
                # The context manager closes the connection once it is read.
                with opener.open(url, timeout=timeout) as response:
                    response.read()
            except Exception as e:
                # Best-effort probe: a dead or slow proxy is reported and
                # skipped so the remaining ones are still tested.
                print(e)
            else:
                print("haha")
                working.append(proxy)
        return working
    
    # Run the proxy probe with its default settings.
    creat_proxy_handler()

    带着cookie去自动登录

    import urllib.request
    from http import cookiejar
    from urllib import parse
    """
    直接获取 个人中心
    1代码登录
    2.自动带着cookies
    
    1.代码登录 
        1.1登录的网址
        login_url ='https://www.yaozh.com/login'
        1.2登录的参数
        1.3发送登录请求
    
    2.代码带着cookies 访问   
    """
    # Log in to yaozh.com with a cookie-aware opener, then fetch the member
    # page with the session cookies attached and save it to 02cook.html.
    login_url ='https://www.yaozh.com/login'
    # NOTE(review): account credentials are hard-coded in source; move them to
    # configuration or the environment before sharing this script.
    # NOTE(review): "formhash" looks like a per-session form token - confirm
    # whether it has to be fetched fresh from the login page.
    login_from_data={
        "username":"xiaomaoera12",
        "pwd":"lina081012",
        "formhash":"89B42EA5FF",
        # Plain URL: urlencode() below percent-encodes it. The previously
        # pre-encoded value ("https%3A%2F%2F...") was encoded a second time.
        "backurl":"https://job.yaozh.com/topicComp/14"
    }
    # 1.3 Send the login request (POST).
    cook_jar = cookiejar.CookieJar()
    # Handler that stores cookies from responses and replays them on requests.
    cook_hanlder = urllib.request.HTTPCookieProcessor(cook_jar)
    # Build the opener from that handler.
    opener = urllib.request.build_opener(cook_hanlder)
    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
    }
    login_str = parse.urlencode(login_from_data).encode("utf-8")
    login_request= urllib.request.Request(login_url,headers=headers,data=login_str)
    # Only the cookies set by the login response matter; close it right away.
    opener.open(login_request).close()

    center_url="https://www.yaozh.com/member/"
    center_request = urllib.request.Request(center_url,headers=headers)
    # Open the prepared Request (not the bare URL) so the User-Agent header is
    # actually sent along with the session cookies.
    with opener.open(center_request) as response:
        data=response.read()

    print(data)
    with open('02cook.html','wb') as f:
        f.write(data)
  • 相关阅读:
    Yield Usage Understanding
    Deadclock on calling async methond
    How to generate file name according to datetime in bat command
    Run Unit API Testing Which Was Distributed To Multiple Test Agents
    druid的关键参数+数据库连接池运行原理
    修改idea打开新窗口的默认配置
    spring boot -thymeleaf-url
    @pathvariable和@RequestParam的区别
    spring boot -thymeleaf-域对象操作
    spring boot -thymeleaf-遍历list和map
  • 原文地址:https://www.cnblogs.com/wulianwangaxing/p/14391140.html
Copyright © 2011-2022 走看看