Crawler Learning (12): bs4 Practice Cases

    Practice project: scraping the novel 《三国演义》 (Romance of the Three Kingdoms) from 诗词名句网 (shicimingju.com)

    import os
    import time
    import urllib.request
    from bs4 import BeautifulSoup


    def header():
        # URL of the novel's table of contents
        article_url = "http://www.shicimingju.com/book/sanguoyanyi.html"
        # Request headers that imitate a real browser
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}
        # Create the request object
        request = urllib.request.Request(article_url, headers=headers)
        return request


    # Send the request
    def main(request):
        # Create a handler object
        handler = urllib.request.HTTPHandler()
        # Build an opener from the handler
        opener = urllib.request.build_opener(handler)
        # Use the opener to fetch the response
        response = opener.open(request)
        return response


    # Download the content
    def download():
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}
        request = header()
        response = main(request).read()
        # Parse the HTML with bs4
        article_main_html = BeautifulSoup(response, "lxml")
        if not os.path.exists("三国演义"):
            os.mkdir("三国演义")
        # Get the book title
        # article_name = article_main_html.select(".book-header h1")[0].text
        # Get the book synopsis
        # article_details = article_main_html.select(".book-summary p")[0].text
        # Get the chapter links
        article_section = article_main_html.select(".book-mulu ul li a")
        section_title_ls = []
        section_url_ls = []
        # Store the chapter titles and links in order
        for section in article_section:
            section_title_ls.append(section.text)
            section_url_ls.append(section["href"])

        # Crawl the chapters one by one (the novel has 120 chapters)
        for num in range(len(section_title_ls)):
            # Take the chapter title and its URL together
            section_title = section_title_ls[num]
            section_url = section_url_ls[num]
            # Join into a full chapter URL
            section_allurl = "http://www.shicimingju.com" + section_url
            section_request = urllib.request.Request(section_allurl, headers=headers)
            handler = urllib.request.HTTPHandler()
            opener = urllib.request.build_opener(handler)
            # Request the chapter data
            section_response = opener.open(section_request).read().decode("utf8")
            # Parse the chapter HTML with bs4
            article_soup = BeautifulSoup(section_response, "lxml")
            article_content = article_soup.select(".chapter_content")
            # Prepend the chapter title to the chapter text
            content = section_title + article_content[0].text
            # Name of the output file
            filename = "三国演义" + ".doc"
            print("Downloading chapter %d" % (num + 1))
            # Append the downloaded data to the file
            filename_path = os.path.join("三国演义", filename)
            with open(filename_path, "ab+") as tf:
                tf.write(content.encode("utf8"))
            # Throttle the requests to avoid hammering the server
            time.sleep(2)


    if __name__ == '__main__':
        download()
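
    The crawl hinges on two CSS selectors: ".book-mulu ul li a" pulls the chapter list off the table-of-contents page, and ".chapter_content" pulls the body text off each chapter page. Below is a minimal, self-contained sketch of how such select() calls behave, run against an invented HTML fragment (the fragment only imitates the site's structure; it is not the actual markup):

    from bs4 import BeautifulSoup

    # Toy fragment shaped like the table-of-contents markup the selector targets
    toc_html = """
    <div class="book-mulu">
      <ul>
        <li><a href="/book/sanguoyanyi/1.html">第一回</a></li>
        <li><a href="/book/sanguoyanyi/2.html">第二回</a></li>
      </ul>
    </div>
    """
    soup = BeautifulSoup(toc_html, "lxml")
    for a in soup.select(".book-mulu ul li a"):
        # .text is the chapter title, ["href"] the relative chapter link
        print(a.text, "http://www.shicimingju.com" + a["href"])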
     

    Baidu Music crawling example

    import os
    import re
    import time
    import urllib.request
    import urllib.parse
    from bs4 import BeautifulSoup
    import json

    # Object-oriented crawler
    class BaiDuMusic(object):

        # Take the search keyword and page number as parameters
        def __init__(self, singer, page):
            self.singer = singer
            self.page = page
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}

        # Build the request object
        def header(self):
            url = "http://music.taihe.com/search/song?"
            data = {
                "s": "1",
                "key": self.singer,
                "jump": "0",
                "start": (self.page - 1) * 20,
                "size": "20",
                "third_type": "0",
            }
            # Encode the query-string parameters
            data = urllib.parse.urlencode(data)
            singer_url = url + data
            # Create the request object with the browser headers
            request = urllib.request.Request(url=singer_url, headers=self.headers)
            return request

        # Build an opener and request the data
        def requset(self):
            request = self.header()
            handler = urllib.request.HTTPHandler()
            opener = urllib.request.build_opener(handler)
            response = opener.open(request)
            return response

        # Parse the data with bs4
        def paserSong(self):
            response = self.requset()
            singer_soup = BeautifulSoup(response, "lxml")
            pattern = re.compile(r'\d+')
            # Match the target <li> tags whose data-albumid attribute is numeric
            songs_info = singer_soup.find_all(name="li", attrs={"data-albumid": pattern})
            # Read each <li> tag's "data-songitem" attribute and parse it as JSON
            song_ls = [json.loads(li["data-songitem"]) for li in songs_info]
            song_info = [(song["songItem"]["sname"], song["songItem"]["sid"]) for song in song_ls]
            # print(song_info)
            # Sample output -- (song name, song id) pairs:
            # """[('只要平凡', 598740690), ('My Sunshine', 127018924), ('听', 123192697), ('微笑着胜利(庆祝建军91周年网宣主题曲)', 601622060), ('Lost In The Stars', 268791350), ('Everything Will Say Goodbye', 285312563), ('《星辰》——电视剧《择天记》片头曲', 609686640), ('听', 123206622), ('Give You My World', 277779153), ('微笑着胜利(庆祝建军91周年网宣主题曲)(伴奏)', 601622061), ('My Sunshine', 131096021), ('三生三世', 537883379), ('着魔', 53603708), ('三生三世', 537883380), ('Torches', 541943830), ('浩瀚', 124796979), ('逆战', 14944589), ('剑心', 121223583), ('天下', 1103789), ('燕归巢', 136982116)]"""
            return song_info

        def downloadSong(self):
            if not os.path.exists('music'):
                os.mkdir('music')
            song_info = self.paserSong()
            for song_detail in song_info:
                song_info_id = song_detail[1]
                song_info_name = song_detail[0]
                print("Downloading from page %s: %s - 《%s》" % (self.page, self.singer, song_info_name))
                # This API endpoint returns the song details as jsonp
                song_url = 'http://tingapi.ting.baidu.com/v1/restserver/ting?method=baidu.ting.song.play&format=jsonp&callback=jQuery17202741599001012014_1513517333931&songid=%s&_=1513517334915' % song_info_id
                # Send the request
                request_song_detail = urllib.request.urlopen(song_url)
                # Strip the jsonp callback wrapper around the JSON payload
                pattern_song = re.compile(r'\((.*)\)', re.S)
                json_song_info = pattern_song.findall(request_song_detail.read().decode("utf8"))
                # Parse the string as JSON to extract the download links
                lrclink = json.loads(json_song_info[0])["songinfo"]["lrclink"]
                file_link = json.loads(json_song_info[0])["bitrate"]["file_link"]
                # Build the output file names
                filename_music = song_info_name + "_%s.mp3" % self.singer
                filename_lrc = song_info_name + "_%s.lrc" % self.singer
                song_path = os.path.join("music", filename_music)
                lrc_path = os.path.join("music", filename_lrc)
                try:
                    # Download the song and the lyrics
                    urllib.request.urlretrieve(lrclink, lrc_path)
                    urllib.request.urlretrieve(file_link, song_path)
                    time.sleep(1)
                    print("《%s》 downloaded" % song_info_name)
                except Exception:
                    print("Download failed: the song is copyright-restricted")


    # Read the crawl parameters from the user
    def main():
        singer = input("Enter the singer or song name to crawl: ")
        start_page = int(input("Enter the start page: "))
        end_page = int(input("Enter the end page: "))
        for page in range(start_page, end_page + 1):
            baidumusic = BaiDuMusic(singer, page)
            baidumusic.downloadSong()
        print("All songs by %s have been downloaded" % singer)

    # Run
    if __name__ == '__main__':
        main()
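
    The trickiest step in downloadSong is the jsonp handling: the ting API wraps its JSON payload in a jQuery callback call, and the regex \((.*)\) captures everything between the outermost parentheses so json.loads can parse it. A minimal sketch of just that step, using a made-up payload (the wrapper shape matches the code above, but the field values are invented):

    import json
    import re

    # Hypothetical jsonp response, shaped like the API's wrapped payload
    jsonp = 'jQuery17202741599001012014_1513517333931({"songinfo": {"lrclink": "http://example.com/song.lrc"}})'
    # re.S lets .* span newlines; the greedy .* reaches the last closing parenthesis
    payload = re.compile(r'\((.*)\)', re.S).findall(jsonp)[0]
    print(json.loads(payload)["songinfo"]["lrclink"])  # -> http://example.com/song.lrc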
Original article: https://www.cnblogs.com/kuangkuangduangduang/p/10390941.html