zoukankan html css js c++ java

Python小练习批量爬取下载歌曲

import requests
import os

# Request headers sent with every call to the www.kuwo.cn JSON endpoints.
# The 'csrf' header carries the same value as the kw_token entry inside the
# Cookie string, so the two must be updated together when the token changes.
headers={
    'Cookie': '_ga=GA1.2.701818100.1612092981; _gid=GA1.2.748589379.1612092981; Hm_lvt_cdb524f42f0ce19b169a8071123a4797=1612092982; Hm_lpvt_cdb524f42f0ce19b169a8071123a4797=1612094717; kw_token=ZALW965FXG',
    'csrf': 'ZALW965FXG',
    'Host': 'www.kuwo.cn',
    'Referer': 'https://www.kuwo.cn/singer_detail/1600',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    }
# Create the download directory.  exist_ok=True makes this safe to repeat and
# avoids the check-then-create race of testing os.path.exists() first; it also
# fails loudly if 'mics' exists but is not a directory.
os.makedirs('mics', exist_ok=True)
def Index(page):
    """Fetch one page of the artist's track list and download every track on it.

    Args:
        page: page number sent as the ``pn`` query parameter (30 tracks are
            requested per page via ``rn=30``).

    Raises:
        requests.HTTPError: if the listing request returns an error status.
    """
    # NOTE(review): reqId looks like a value captured from a browser session;
    # presumably it may need refreshing together with the Cookie/csrf headers.
    url='https://www.kuwo.cn/api/www/artist/artistMusic?artistid=1600&pn='+ str(page) + '&rn=30&httpsStatus=1&reqId=50b03180-63ca-11eb-b714-332080487537'

    # A timeout keeps a stalled connection from hanging the whole crawl.
    reply = requests.get(url=url, headers=headers, timeout=10)
    reply.raise_for_status()
    payload = reply.json()

    # An error payload has no data/list entry; report it instead of raising
    # KeyError.
    musicList = (payload.get('data') or {}).get('list')
    if not musicList:
        print('no track list in response for page {}: {}'.format(page, payload))
        return

    print(musicList)
    for music in musicList:
        musicSave(music['rid'], music['name'])


def musicSave(rid,name):
    """Resolve the MP3 URL for one track and save the audio into ``mics/``.

    Args:
        rid: track id, sent as the ``rid`` query parameter.
        name: track title; used (after replacing characters that are not
            valid in file names) as the base name of the saved ``.mp3`` file.
    """
    url='https://www.kuwo.cn/url?format=mp3&rid='+ str(rid) + '&response=url&type=convert_url3&br=128kmp3&from=web&t=1612100615341&httpsStatus=1&reqId=50b38ce1-63ca-11eb-b714-332080487537'
    response=requests.get(url=url,headers=headers,timeout=10).json()

    # The resolver answers with JSON whose 'url' field is the audio location;
    # skip the track instead of raising KeyError when it is absent.
    mp3path=response.get('url')
    if not mp3path:
        print('no download url for {}: {}'.format(name, response))
        return
    print(mp3path)

    # The shared headers are deliberately NOT sent here: the download request
    # fails when they are included (presumably because of the hard-coded
    # 'Host': 'www.kuwo.cn' entry -- unverified).
    audio = requests.get(url=mp3path, timeout=30)
    if not audio.ok:
        # Do not write an error page to disk as if it were audio.
        print('download failed for {}: HTTP {}'.format(name, audio.status_code))
        return

    # Replace characters that are illegal in file names (or would be treated
    # as path separators) so that every title maps to a file inside mics/.
    safe_name = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in str(name))
    os.makedirs('mics', exist_ok=True)
    target = os.path.join('mics', '{}.mp3'.format(safe_name))

    # 'wb' rather than 'ab': re-running the script must overwrite the file,
    # not append a second copy of the audio to it.
    with open(target,'wb') as f:
        f.write(audio.content)
    print('{}.mp3已经下载完成'.format(name))



# Script entry point: process listing pages 1 through 10 in order.
for page in range(1,11):
    Index(page)

1.地址，文件地址和播放地址需要抓取

2. 请求头中的 'Cookie' 和 'csrf' 在网页刷新后需要更新；大量爬取时可以使用代理 IP 并伪造 User-Agent，或者通过 JS 逆向来解决（后续更新）。

出现过一个错误：下载音频的那一行 data = requests.get(url=mp3path).content（原稿中用五颗星标出），我原来在那里加了 headers，结果请求不到数据导致失败；后来把 headers 去掉就能用了。原来出错的写法是：

  data = requests.get(mp3path,headers=headers).content

1.演示一下用免费代理ip爬虫

import urllib.request

def creat_proxy_handler():
    """Try to fetch a test page through each proxy in a fixed list.

    For every proxy entry this prints the entry, then either ``haha`` when the
    page could be read through it or the exception that was raised.
    """
    url="https://www.baidu.com"
    # Candidate proxies, each given as host:port.
    proxy_list=[
        {"http":"60.168.207.219:9999"},
        {"http":"58.23.67.208:9999"},
        {"http":"42.7.28.217:9999"},
        {"http":"61.145.49.177:9999"},
        {"http":"36.250.156.78:9999"},
        {"http":"36.248.133.145:9999"},
        {"http":"42.56.238.117:9999"},
        {"http":"36.249.119.34:9999"},
        {"http":"58.22.177.60:9999"}
    ]
    for proxy in proxy_list:
        print(proxy)
        address = proxy["http"]
        # ProxyHandler only proxies the URL schemes named in its mapping.  The
        # test URL is https, so an "http"-only mapping would bypass the proxy
        # entirely; register the address for both schemes.
        proxy_handler=urllib.request.ProxyHandler({"http": address, "https": address})
        # Build an opener that routes requests through this proxy.
        opener=urllib.request.build_opener(proxy_handler)
        try:
            # Send the request through the proxy; the body itself is not
            # needed, reading it only proves the proxy works.  The with-block
            # closes the connection afterwards.
            with opener.open(url,timeout=1) as reply:
                reply.read()
            print("haha")
        except Exception as e:
            # Deliberately broad: a dead or misbehaving proxy can fail in many
            # ways, and one bad entry must not stop the remaining checks.
            print(e)

# Run the proxy check when the script is executed.
creat_proxy_handler()

带着cookie去自动登录

import urllib.request
from http import cookiejar
from urllib import parse
"""
直接获取 个人中心
1代码登录
2.自动带着cookies

1.代码登录 
    1.1登录的网址
    login_url ='https://www.yaozh.com/login'
    1.2登录的参数
    1.3发送登录请求

2.代码带着cookes 访问   
"""
# Step 1: log in from code so that the session cookies are captured.
login_url ='https://www.yaozh.com/login'
# NOTE(review): the account name and password are hard-coded in source; move
# them to environment variables or a prompt before sharing or committing this.
# NOTE(review): formhash looks like a per-session form token -- confirm whether
# it has to be fetched fresh from the login page.
login_from_data={
    "username":"xiaomaoera12",
    "pwd":"lina081012",
    "formhash":"89B42EA5FF",
    # Plain URL: parse.urlencode() below performs the percent-encoding, so a
    # value that is already encoded would be encoded a second time.
    "backurl":"https://job.yaozh.com/topicComp/14"
}
# Cookie jar that collects the cookies set by the login response.
cook_jar = cookiejar.CookieJar()
# Handler that stores received cookies in the jar and sends them back.
cook_hanlder = urllib.request.HTTPCookieProcessor(cook_jar)
# Opener built from that handler; every request made with it shares the jar.
opener = urllib.request.build_opener(cook_hanlder)
headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
}
# Supplying a data payload makes urllib send the request as a POST.
login_str = parse.urlencode(login_from_data).encode("utf-8")
login_request= urllib.request.Request(login_url,headers=headers,data=login_str)
# Only the cookies matter here; close the response instead of leaking it.
opener.open(login_request).close()

# Step 2: visit the member page with the cookies obtained above.
center_url="https://www.yaozh.com/member/"
center_request = urllib.request.Request(center_url,headers=headers)
# Open the prepared Request (not the bare URL) so the User-Agent header is
# actually sent with this request as well.
with opener.open(center_request) as response:
    data=response.read()
print(data)
with open('02cook.html','wb') as f:
    f.write(data)

查看全文

相关阅读:
Yield Usage Understanding
Deadclock on calling async methond
How to generate file name according to datetime in bat command
Run Unit API Testing Which Was Distributed To Multiple Test Agents
druid的关键参数+数据库连接池运行原理
 修改idea打开新窗口的默认配置
 spring boot -thymeleaf-url
@pathvariable和@RequestParam的区别
 spring boot -thymeleaf-域对象操作
 spring boot -thymeleaf-遍历list和map

原文地址：https://www.cnblogs.com/wulianwangaxing/p/14391140.html