zoukankan      html  css  js  c++  java
  • python随笔(一)

    用 Python 爬虫获取酷狗音乐、QQ音乐的最新歌曲名和豆瓣的最新电影名

    先上代码开源大家一起学习,代码如下:

    #!python2
    #coding:utf-8
    __author__ = 'OldHarry'
    
    import urllib2
    import os
    import re
    import json
    import xlsxwriter
    import sys
    # Python 2 only: switch the interpreter-wide default string encoding to UTF-8
    # when it is currently something else.
    defaultencoding = 'utf-8'
    if sys.getdefaultencoding() != defaultencoding:
        # reload(sys) is done first because sys.setdefaultencoding is normally
        # removed from the module at interpreter start-up; reloading restores it.
        reload(sys)
        sys.setdefaultencoding(defaultencoding)
    
    def getHtml(url):
        """Fetch ``url`` with browser-like headers and return the raw response body.

        A message with the status code is printed when the response code is not
        200; the body is returned either way.  The response object is always
        closed before returning so the underlying connection is not leaked.

        :param url: address to request.
        :return: the response body exactly as read from the connection.
        """
        send_headers = {
         'User-Agent':'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
         'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         'Connection':'keep-alive'
        }
        request = urllib2.Request(url, headers=send_headers)
        response = urllib2.urlopen(request)
        try:
            status = response.getcode()
            if status != 200:
                print ("访问出现错误...错误代码:"),status
            return response.read()
        finally:
            # Release the socket even if reading the body raises.
            response.close()
    
    
    def kugoumusic(url):
        """Extract the unique ``songName`` entries from the page at ``url``.

        Every ``<span class="songName">A - B</span>`` element is matched and the
        part after ``" - "`` is kept (presumably the song title -- confirm against
        the live page), decoded from UTF-8.  Each name appears once, in
        first-seen order.  The resulting list is printed as JSON and returned.

        :param url: page to download via ``getHtml``.
        :return: list of decoded names without duplicates.
        """
        page = getHtml(url)
        pattern = re.compile(r'<span class="songName">(.*?) - (.*?)</span>')
        names = []
        for _before, after in pattern.findall(page):
            name = after.decode('utf8')
            # Compare the decoded name itself; comparing the whole regex tuple
            # against a list of strings never matches, so nothing was deduplicated.
            if name not in names:
                names.append(name)
        print(json.dumps(names, encoding="UTF-8", ensure_ascii=False))
        return names
    
    
    def qqmusic(url):
        """Extract the unique album ``name`` values from the response at ``url``.

        The response text is scanned with a regular expression that matches the
        ``"action": {...}, "album": {...}`` fragment of each entry and captures
        the album ``name`` field.  Names are stripped of surrounding whitespace
        and deduplicated in first-seen order, then printed as JSON and returned.

        :param url: endpoint to download via ``getHtml``.
        :return: list of stripped, unique names.
        """
        payload = getHtml(url)
        pattern = re.compile(r'{"action":{"alert":[0-9]+,"icons":[0-9]+,"msgdown":[0-9]+,"msgfav":[0-9]+,"msgid":[0-9]+,"msgpay":[0-9]+,"msgshare":[0-9]+,"switch":[0-9]+},"album":{"id":[0-9]+,"mid":"[a-zA-Z0-9]+","name":"(.*?)"')
        names = []
        for raw_name in pattern.findall(payload):
            # str.strip() returns a new string; the result must be kept for the
            # stripping (and the dedup on the stripped form) to take effect.
            name = raw_name.strip()
            if name not in names:
                names.append(name)
        print(json.dumps(names, encoding="UTF-8", ensure_ascii=False))
        return names
    
    def dbmovie(url):
        """Collect the ``alt`` text of matching image tags on the page at ``url``.

        The page is downloaded with ``getHtml`` and every
        ``alt="..." rel="..." class="" />`` occurrence contributes its ``alt``
        value.  The list is printed as JSON and returned.

        :param url: page to download.
        :return: list of captured ``alt`` values, in page order.
        """
        page = getHtml(url)
        titles = re.findall(r'alt="(.*?)" rel="[a-z]+" class="" />', page)
        print(json.dumps(titles, encoding="UTF-8", ensure_ascii=False))
        return titles
    
    def rmmovie(url):
        """Collect every ``"title":"..."`` value from the response at ``url``.

        The response text is searched with a regular expression (it is not parsed
        as JSON), the captured values are printed as JSON and returned.

        :param url: endpoint to download via ``getHtml``.
        :return: list of captured title strings, in response order.
        """
        body = getHtml(url)
        title_pattern = r'"title":"(.*?)"'
        found = re.findall(title_pattern, body)
        print(json.dumps(found, encoding="UTF-8", ensure_ascii=False))
        return found
    def rmdsj():
        """Return the titles from three consecutive pages of the TV listing.

        Calls ``rmmovie`` once per page (``page_start`` 0, 20 and 40, in that
        order) and concatenates the three result lists.

        :return: combined list of titles from the three pages.
        """
        page_urls = (
            'https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0',
            'https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=20',
            'https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=40',
        )
        combined = []
        for page_url in page_urls:
            combined.extend(rmmovie(page_url))
        return combined
    
    def runtest():
        """Scrape every chart and write them side by side into one workbook.

        Each source is announced on stdout with its heading and then fetched, in
        the order listed below.  After all fetches complete, a workbook named
        ``TXT.xls`` is created in the current working directory with one column
        per source: the heading in row 1 (bold) and the scraped names from row 2
        downwards.  All columns share the same width.
        """
        # os.path.join supplies the path separator; plain string concatenation
        # produced "<cwd>TXT.xls", i.e. a mangled name in the parent directory.
        output_path = os.path.join(os.path.abspath('.'), 'TXT.xls')
        # NOTE(review): xlsxwriter writes the XLSX format, so Excel may warn about
        # the ".xls" extension; the file name is kept as-is -- confirm whether it
        # should become ".xlsx".

        # (heading, fetch function, positional arguments) for each output column.
        sources = [
            ("酷狗音乐--新歌榜", kugoumusic, ("http://www.kugou.com/",)),
            ("腾讯音乐--内地新歌榜", qqmusic, ("https://u.y.qq.com/cgi-bin/musicu.fcg?-=recom2388477980207393&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%7D%2C%22new_song%22%3A%7B%22module%22%3A%22QQMusic.MusichallServer%22%2C%22method%22%3A%22GetNewSong%22%2C%22param%22%3A%7B%22type%22%3A1%7D%7D%7D",)),
            ("腾讯音乐--港台新歌榜", qqmusic, ("https://u.y.qq.com/cgi-bin/musicu.fcg?-=recom6698628102261504&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%7D%2C%22new_song%22%3A%7B%22module%22%3A%22QQMusic.MusichallServer%22%2C%22method%22%3A%22GetNewSong%22%2C%22param%22%3A%7B%22type%22%3A2%7D%7D%7D",)),
            ("腾讯音乐--欧美新歌榜", qqmusic, ("https://u.y.qq.com/cgi-bin/musicu.fcg?-=recom08419989487702839&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%7D%2C%22new_song%22%3A%7B%22module%22%3A%22QQMusic.MusichallServer%22%2C%22method%22%3A%22GetNewSong%22%2C%22param%22%3A%7B%22type%22%3A3%7D%7D%7D",)),
            ("腾讯音乐--日本新歌榜", qqmusic, ("https://u.y.qq.com/cgi-bin/musicu.fcg?-=recom24411354608866187&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%7D%2C%22new_song%22%3A%7B%22module%22%3A%22QQMusic.MusichallServer%22%2C%22method%22%3A%22GetNewSong%22%2C%22param%22%3A%7B%22type%22%3A4%7D%7D%7D",)),
            ("腾讯音乐--韩国新歌榜", qqmusic, ("https://u.y.qq.com/cgi-bin/musicu.fcg?-=recom909302436024819&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%7D%2C%22new_song%22%3A%7B%22module%22%3A%22QQMusic.MusichallServer%22%2C%22method%22%3A%22GetNewSong%22%2C%22param%22%3A%7B%22type%22%3A5%7D%7D%7D",)),
            ("豆瓣电影--正在热映", dbmovie, ("https://movie.douban.com/",)),
            ("豆瓣电影--热门电影", rmmovie, ("https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0",)),
            ("豆瓣电影--最新电影", rmmovie, ("https://movie.douban.com/j/search_subjects?type=movie&tag=%E6%9C%80%E6%96%B0&page_limit=20&page_start=0",)),
            ("豆瓣电影--经典电影", rmmovie, ("https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%BB%8F%E5%85%B8&sort=time&page_limit=20&page_start=0",)),
            ("豆瓣电影--可播放电影", rmmovie, ("https://movie.douban.com/j/search_subjects?type=movie&tag=%E5%8F%AF%E6%92%AD%E6%94%BE&sort=time&page_limit=20&page_start=0",)),
            ("豆瓣电影--高分电影", rmmovie, ("https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=time&page_limit=20&page_start=0",)),
            ("豆瓣电影--热门电视剧", rmdsj, ()),
        ]

        # Fetch everything first; the workbook is only created once all the
        # downloads have succeeded.
        headings = []
        columns = []
        for heading, fetch, args in sources:
            print(heading)
            headings.append(heading)
            columns.append(fetch(*args))

        workbook = xlsxwriter.Workbook(output_path)
        worksheet = workbook.add_worksheet()
        header_format = workbook.add_format({'bold': 1, 'align': 'center', 'border': 1})
        cell_format = workbook.add_format({'align': 'center', 'border': 1})

        column_width = 30
        # Row/column indices are zero-based: row 0 holds the headings and the
        # data starts on row 1 (spreadsheet rows 1 and 2 respectively).
        worksheet.write_row(0, 0, headings, header_format)
        worksheet.set_column(0, len(sources) - 1, column_width)
        for column_index, values in enumerate(columns):
            worksheet.write_column(1, column_index, values, cell_format)
        workbook.close()
    # Run the scrape-and-export only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == '__main__':
        runtest()

    主要思路是:第一步抓取并解析网站,第二步用正则表达式筛选出自己想要的数据,第三步在当前文件夹生成一个 Excel 文件并把数据写入其中。

     第一次写博客,各路大神不喜勿喷,python萌新一枚。

    开发环境:Pycharm  python2.7

    2019-04-04 11:33:23

    Study hard and make progress every day!

    萌新签到
  • 相关阅读:
    七牛云上传图片
    找到当前字符串中最后一个/并获取之后的字符串
    jquery正则表达式验证:验证身份证号码
    apply()与call()的区别
    js 判断字符串是否包含某字符串,String对象中查找子字符,indexOf
    改变父元素的透明度,不影响子元素的透明度—css
    c实现生产者消费者问题。 windows下。
    python基础练习 dict切片
    html+css test1
    codewars[7]-python Friend or Foe?
  • 原文地址:https://www.cnblogs.com/Harrydz/p/10653926.html
Copyright © 2011-2022 走看看