zoukankan      html  css  js  c++  java
  • Python小练习批量爬取下载歌曲

    import requests
    import os
    
    # Request headers sent with every kuwo.cn API call.
    # NOTE: the 'Cookie' (its kw_token part) and 'csrf' values are captured
    # from a browser session and have to be refreshed after the page is
    # reloaded; stale values are the first thing to check when the API
    # starts rejecting requests.
    headers={
        'Cookie': '_ga=GA1.2.701818100.1612092981; _gid=GA1.2.748589379.1612092981; Hm_lvt_cdb524f42f0ce19b169a8071123a4797=1612092982; Hm_lpvt_cdb524f42f0ce19b169a8071123a4797=1612094717; kw_token=ZALW965FXG',
        'csrf': 'ZALW965FXG',
        'Host': 'www.kuwo.cn',
        'Referer': 'https://www.kuwo.cn/singer_detail/1600',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
        }
    # Create the download directory. exist_ok=True replaces the separate
    # exists()/mkdir() pair, which could fail if the directory appeared
    # between the check and the creation.
    os.makedirs('mics', exist_ok=True)
    def Index(page):
        """Fetch one page of the artist's song list and download each song.

        Calls the kuwo.cn ``artistMusic`` API for artist id 1600 and passes
        the ``rid`` and ``name`` of every returned entry to ``musicSave``.

        Args:
            page: Page number, sent as the ``pn`` query parameter. Each
                request asks for ``rn=30`` entries (presumably the page
                size - confirm against the API).

        Raises:
            requests.RequestException: On a timeout, connection failure or
                an HTTP error status.
            KeyError: If the JSON answer has no ``data.list`` entry.
        """
        url = 'https://www.kuwo.cn/api/www/artist/artistMusic'
        # Let requests build the query string instead of concatenating it.
        params = {
            'artistid': 1600,
            'pn': page,
            'rn': 30,
            'httpsStatus': 1,
            'reqId': '50b03180-63ca-11eb-b714-332080487537',
        }

        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url=url, params=params, headers=headers, timeout=10)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        musicList = response.json()['data']['list']
        print(musicList)
        for music in musicList:
            musicSave(music['rid'], music['name'])
    
    
    def musicSave(rid,name):
        """Download one song and store it as ``mics/<name>.mp3``.

        First asks kuwo.cn to resolve the song id to an mp3 URL, then
        downloads that URL and writes the bytes to disk.

        Args:
            rid: Song id, sent as the ``rid`` query parameter.
            name: Song title; used for the file name and the progress message.

        Raises:
            requests.RequestException: On a timeout, connection failure or
                an HTTP error status of either request.
            KeyError: If the resolver answer has no ``url`` entry.
        """
        url = 'https://www.kuwo.cn/url'
        params = {
            'format': 'mp3',
            'rid': rid,
            'response': 'url',
            'type': 'convert_url3',
            'br': '128kmp3',
            'from': 'web',
            't': '1612100615341',
            'httpsStatus': 1,
            'reqId': '50b38ce1-63ca-11eb-b714-332080487537',
        }
        response = requests.get(url=url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        mp3path = response.json()['url']
        print(mp3path)

        # Deliberately sent WITHOUT the site headers: the author found that
        # passing `headers` here makes the download fail (presumably because
        # the 'Host' header pins www.kuwo.cn while the file lives on another
        # host - TODO confirm).
        audio = requests.get(url=mp3path, timeout=30)
        # Do not save an HTTP error page under an .mp3 name.
        audio.raise_for_status()
        data = audio.content

        # Path separators inside a title would point outside the target folder.
        safe_name = name.replace('/', '_').replace('\\', '_')
        # Store inside the 'mics' directory; 'wb' so that a re-run overwrites
        # the file instead of appending a second copy of the audio data.
        target = os.path.join('mics', '{}.mp3'.format(safe_name))
        with open(target, 'wb') as f:
            f.write(data)
        print('{}.mp3已经下载完成'.format(name))
    
    
    
    # Crawl result pages FIRST_PAGE through LAST_PAGE (inclusive) of the
    # artist's song list, downloading every song found on each page.
    FIRST_PAGE, LAST_PAGE = 1, 10
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        Index(page)

    1. 地址:歌曲的文件地址和播放地址都需要抓取。

    2. 请求头中的 'Cookie' 和 'csrf' 在网页刷新后需要更新;大量爬取时可以使用代理 IP 和伪造 User-Agent,或者做 JS 逆向(后续更新)。

    出现过错误的地方是 data = requests.get(url=mp3path).content 这一行(原文用五颗红星标记):原来那里我添加了 headers 后 get 不了,导致失败,后面把 headers 去掉后就能用了。失败的写法是:

      data = requests.get(mp3path,headers=headers).content

    1.演示一下用免费代理ip爬虫

    import urllib.request
    
    def creat_proxy_handler():
        """Probe a list of free proxies by fetching a test page through each.

        For every proxy the mapping is printed, followed by "haha" when the
        page could be read or by the raised exception when it could not.

        Returns:
            list: The proxy mappings that answered within the timeout
            (the original returned nothing, so existing callers are
            unaffected).
        """
        url="https://www.baidu.com"
        # Candidate proxies
        proxy_list=[
            {"http":"60.168.207.219:9999"},
            {"http":"58.23.67.208:9999"},
            {"http":"42.7.28.217:9999"},
            {"http":"61.145.49.177:9999"},
            {"http":"36.250.156.78:9999"},
            {"http":"36.248.133.145:9999"},
            {"http":"42.56.238.117:9999"},
            {"http":"36.249.119.34:9999"},
            {"http":"58.22.177.60:9999"}
        ]
        working = []
        for proxy in proxy_list:
            print(proxy)
            # ProxyHandler selects a proxy by the scheme of the requested URL.
            # The list only has "http" entries while `url` is https, so the
            # same address is registered for https too - otherwise the
            # request would skip the proxy and go out directly.
            address = proxy["http"]
            proxy_handler=urllib.request.ProxyHandler(
                {"http": address, "https": address}
            )
            # Build an opener that routes through this proxy
            opener=urllib.request.build_opener(proxy_handler)
            try:
                # Send the request through the proxy; the context manager
                # closes the connection once the body has been read.
                with opener.open(url,timeout=1) as response:
                    response.read()
            except Exception as e:
                # Best-effort probe: report the failure and try the next proxy.
                print(e)
            else:
                print("haha")
                working.append(proxy)
        return working

    creat_proxy_handler()

    带着cookie去自动登录

    import urllib.request
    from http import cookiejar
    from urllib import parse
    # Goal: fetch the member-centre page directly.
    #   1. Log in from code
    #      1.1 the login URL: login_url = 'https://www.yaozh.com/login'
    #      1.2 the login form parameters
    #      1.3 send the login request
    #   2. Visit the page with the cookies from step 1 attached automatically
    login_url ='https://www.yaozh.com/login'
    # NOTE(review): account credentials are hard-coded in the source; move
    # them to environment variables or a config file kept out of version
    # control before reusing this script.
    login_from_data={
        "username":"xiaomaoera12",
        "pwd":"lina081012",
        "formhash":"89B42EA5FF",
        # Plain URL on purpose: urlencode() below does the percent-encoding.
        # The previously pre-encoded value was encoded a second time
        # ('%3A' became '%253A').
        "backurl":"https://job.yaozh.com/topicComp/14"
    }
    # 1.3 Send the login request (POST)
    cook_jar = cookiejar.CookieJar()
    # Handler that stores received cookies in cook_jar and re-sends them
    cook_hanlder = urllib.request.HTTPCookieProcessor(cook_jar)
    # Build an opener from that handler
    opener = urllib.request.build_opener(cook_hanlder)
    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
    }
    login_str = parse.urlencode(login_from_data).encode("utf-8")
    login_request= urllib.request.Request(login_url,headers=headers,data=login_str)
    # Only the cookies set by the login answer matter; they end up in
    # cook_jar, so the response is closed without reading its body.
    with opener.open(login_request):
        pass

    center_url="https://www.yaozh.com/member/"
    center_request = urllib.request.Request(center_url,headers=headers)
    # Open the Request object (not the bare URL) so the User-Agent header
    # is actually sent along with the session cookies.
    with opener.open(center_request) as response:
        data=response.read()
    print(data)
    with open('02cook.html','wb') as f:
        f.write(data)
  • 相关阅读:
    Debate
    图形算法
    OpenGL Notes
    How to Write an Ethics Paper
    Thesis
    addWindowListener -> WindowAdapter -> windowClosing
    Thesis
    Bootcamp: An error occurred while partitioning the disk
    What Is XML Schema
    What Is XML
  • 原文地址:https://www.cnblogs.com/wulianwangaxing/p/14391140.html
Copyright © 2011-2022 走看看