zoukankan      html  css  js  c++  java
  • python爬虫小实例

    1、python爬取贴吧壁纸
    
    1.1、获取整个页面数据
    
    #coding=utf-8
    import urllib
    
    def getHtml(url):
        """Fetch *url* and return the response body.

        :param url: address of the page to download.
        :returns: whatever ``read()`` yields for the opened URL (the raw page
            source).

        The response object is closed explicitly, so the connection is
        released even when ``read()`` raises.
        """
        page = urllib.urlopen(url)
        try:
            return page.read()
        finally:
            page.close()
    
    # Fetch the thread page and dump its raw HTML to stdout.
    html = getHtml("http://tieba.baidu.com/p/2738151262")
    
    print html
    复制代码
    
    
    
    
    1.2、筛选页面中想要的数据
    
    import re
    import urllib
    
    def getHtml(url):
        """Fetch *url* and return the response body.

        :param url: address of the page to download.
        :returns: whatever ``read()`` yields for the opened URL (the raw page
            source).

        The response object is closed explicitly, so the connection is
        released even when ``read()`` raises.
        """
        page = urllib.urlopen(url)
        try:
            return page.read()
        finally:
            page.close()
    
    def getImg(html):
        """Return the ``.jpg`` image URLs found in *html*.

        A URL is captured when it appears as ``src="<url>.jpg" `` -- the
        attribute value must end in a literal ``.jpg`` and the closing quote
        must be followed by a space.

        :param html: page source to scan.
        :returns: list of captured URL strings in document order; empty when
            nothing matches.
        """
        # The dot before "jpg" is escaped so only a real ".jpg" suffix matches;
        # unescaped it accepts any character (e.g. "logojpg").
        imgre = re.compile(r'src="(.+?\.jpg)" ')
        return imgre.findall(html)
           
    # Fetch the thread page and print the list of matched image URLs.
    html = getHtml("http://tieba.baidu.com/p/2460150866")
    print getImg(html)
    
    
    
    1.3、将页面筛选的数据保存到本地
    
    
    #coding=utf-8
    import urllib
    import re
    
    def getHtml(url):
        """Fetch *url* and return the response body.

        :param url: address of the page to download.
        :returns: whatever ``read()`` yields for the opened URL (the raw page
            source).

        The response object is closed explicitly, so the connection is
        released even when ``read()`` raises.
        """
        page = urllib.urlopen(url)
        try:
            return page.read()
        finally:
            page.close()
    
    def getImg(html):
        """Download every ``.jpg`` image referenced in *html*.

        A URL is picked up when it appears as ``src="<url>.jpg" `` (literal
        ``.jpg`` suffix, closing quote followed by a space).  The images are
        saved as ``0.jpg``, ``1.jpg``, ... in match order; the file names are
        relative, so they land in the current working directory.

        :param html: page source to scan.
        :returns: None.
        """
        # The dot before "jpg" is escaped so only a real ".jpg" suffix matches;
        # unescaped it accepts any character (e.g. "logojpg").
        imgre = re.compile(r'src="(.+?\.jpg)" ')
        for x, imgurl in enumerate(imgre.findall(html)):
            urllib.urlretrieve(imgurl,'%s.jpg' % x)
    
    
    html = getHtml("http://tieba.baidu.com/p/2460150866")

    # getImg() saves the images as a side effect and returns nothing, so call
    # it directly; printing its result would only print "None".
    getImg(html)
    
    
    抓取昵图网图片 --修改版
    
    #coding=utf-8
    import urllib
    import re
    
    def getHtml(url):
        """Fetch *url* and return the response body.

        :param url: address of the page to download.
        :returns: whatever ``read()`` yields for the opened URL (the raw page
            source).

        The response object is closed explicitly, so the connection is
        released even when ``read()`` raises.
        """
        page = urllib.urlopen(url)
        try:
            return page.read()
        finally:
            page.close()
    
    def getImg(html):
        """Download every resource referenced by a ``src="..." `` attribute.

        Each captured URL (any attribute value, not only ``.jpg``) is saved as
        ``D:\\360\\<n>.jpg`` where ``<n>`` counts matches from 0.

        :param html: page source to scan.
        :returns: None.

        NOTE(review): the target directory ``D:\\360`` is presumably expected
        to exist already -- confirm before running.
        """
        imgre = re.compile(r'src="(.*?)" ')
        for x, imgurl in enumerate(imgre.findall(html)):
            # Backslashes are doubled so the path is the absolute D:\360\<n>.jpg
            # and does not depend on the unrecognised escape "\%".
            urllib.urlretrieve(imgurl,'D:\\360\\%s.jpg' % x)
    
    
    html = getHtml("http://www.nipic.com/show/17742538.html")

    # getImg() saves the images as a side effect and returns nothing, so call
    # it directly; printing its result would only print "None".
    getImg(html)
    
    
    
    解释:
    
    
    %s是字符串格式化占位符,表示把变量的值填入字符串中;字符串后面的'%'之后写的就是要填入的参数。
    在上面的例子中,就是用x的值替换%s。比如x=5时,'%s.jpg' % x 得到'5.jpg',即下载的图片在本地保存为'5.jpg'。
    
    
    保存的位置默认为当前工作目录(通常就是运行程序时所在的目录)
    
    
    如何保存到指定目录(目录需事先存在):urllib.urlretrieve(imgurl,'D:\\360\\%s.jpg' % x)
    
    
    https://image.baidu.com/search/detail?ct=503316480&z=0&ipn=false&word
    
    
    
    2、python抓取价格
    
    下面三种写法中,前两种(使用 urllib.request)不需要加 .text
    
    
    #-*- coding:utf8 -*-
    from lxml import etree
    
    import urllib
    import urllib.request
    #headers构造一个字典,里面保存了user-agent
    #headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }
    url="http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"
    # Read the body inside a with-block so the HTTP response is closed as soon
    # as the bytes have been fetched.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    # Decode explicitly before handing the text to lxml.
    data=html.decode('utf-8')
    selector = etree.HTML(data)
    # Text of every <i> nested as div/ul/li/div/div/strong/i -- presumably the
    # price shown for each search result (verify against the live page).
    qiubai_text = selector.xpath('//div/ul/li/div/div/strong/i/text()')
    for i in qiubai_text:
        print(i)
    
    
    或者
    
    
    #-*- coding:utf8 -*-
    from lxml import etree
    
    import urllib
    import urllib.request
    #headers构造一个字典,里面保存了user-agent
    #headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }
    url="http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"
    # Read the body inside a with-block so the HTTP response is closed as soon
    # as the bytes have been fetched.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    # The raw bytes are passed to lxml directly (no explicit decode here).
    selector = etree.HTML(html)
    # Text of every <i> nested as div/ul/li/div/div/strong/i -- presumably the
    # price shown for each search result (verify against the live page).
    qiubai_text = selector.xpath('//div/ul/li/div/div/strong/i/text()')
    for i in qiubai_text:
        print(i)
    
    
    
    或者(注意:使用 requests 时需要加 .text,即把 html.text 传给 etree.HTML):
    
    
    #-*- coding:utf8 -*-
    from lxml import etree
    import requests
    #headers构造一个字典,里面保存了user-agent
    #headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }
    url="http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    html = requests.get(url, timeout=10)
    # html is a Response object, so the decoded body is read from html.text.
    selector = etree.HTML(html.text)
    # Text of every <i> nested as div/ul/li/div/div/strong/i -- presumably the
    # price shown for each search result (verify against the live page).
    qiubai_text = selector.xpath('//div/ul/li/div/div/strong/i/text()')
    for i in qiubai_text:
        print(i)
    
    
    3、python爬取昵图网图片
    
    #coding=utf-8
    import urllib
    import re
    
    def getHtml(url):
        """Fetch *url* and return the response body.

        :param url: address of the page to download.
        :returns: whatever ``read()`` yields for the opened URL (the raw page
            source).

        The response object is closed explicitly, so the connection is
        released even when ``read()`` raises.
        """
        page = urllib.urlopen(url)
        try:
            return page.read()
        finally:
            page.close()
    
    def getImg(html):
        """Download every resource referenced by a ``src="..." `` attribute.

        Each captured URL (any attribute value, not only ``.jpg``) is saved as
        ``D:\\360\\<n>.jpg`` where ``<n>`` counts matches from 0.

        :param html: page source to scan.
        :returns: None.

        NOTE(review): the target directory ``D:\\360`` is presumably expected
        to exist already -- confirm before running.
        """
        imgre = re.compile(r'src="(.*?)" ')
        for x, imgurl in enumerate(imgre.findall(html)):
            # Backslashes are doubled so the path is the absolute D:\360\<n>.jpg
            # and does not depend on the unrecognised escape "\%".
            urllib.urlretrieve(imgurl,'D:\\360\\%s.jpg' % x)
    
    
    html = getHtml("http://www.nipic.com/show/17742538.html")

    # getImg() saves the images as a side effect and returns nothing, so call
    # it directly; printing its result would only print "None".
    getImg(html)
    
    
    4、爬音乐
    
    
    # coding:utf-8
    import urllib
    import urllib.request
    import re
    url="http://www.yy8844.cn/ting/ccceo/ceeivi.shtml"
    # Read the page inside a with-block so the HTTP response gets closed.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    # The page is decoded as GBK before the regexes are applied.
    data=html.decode('GBK')
    # "\d+" captures the numeric id; a bare "d+" would only match the letter
    # "d", leaving findall() empty and making [0] raise IndexError.
    music_id = int(re.findall(r'MusicId=(\d+)',data)[0])
    # Title text up to the first "/", stripped of surrounding whitespace.
    music_name = re.findall(r'<title>(.*?)</title>',data)[0].split('/')[0].strip()
    # re.S lets "." match newlines so the whole lyrics <div> is captured.
    music_word = re.findall(r'<div class="textgeci_show" id="showtext">(.*?)</div>',data,re.S)[0]
    article='word'
    with open("%s.txt" % article,'w') as f:
        f.write(music_word)
    # The mp3 location is derived from the id: <id//30000>/<id//2000>/<id>.mp3
    quanurl="http://96.ierge.cn/%d/%d/%s.mp3" % (music_id//30000,music_id//2000,music_id)
    with urllib.request.urlopen(quanurl) as response:
        bata=response.read()
    # Binary mode: the mp3 bytes are written unchanged.
    with open("%s.mp3" % music_name,'wb') as f:
        f.write(bata)
    
    
    注意问题:歌词通常跨多行,下面的正则必须加 re.S,使 . 也能匹配换行符:
    
    music_word = re.findall(r'<div class="textgeci_show" id="showtext">(.*?)</div>',data,re.S)[0]
    
    
    python中AttributeError解决
    
    【Python 脚本报错】AttributeError:'module' has no attribute 'xxx'的解决方法
    http://blog.csdn.net/cn_wk/article/details/50839159
    
    
    int库的.pyc文件
    
    
    python 去掉 .pyc
    http://blog.csdn.net/ubuntu64fan/article/details/48241985
    
    
    python操作对象属性
    http://www.jianshu.com/p/c38a81b8cb38
    
    
    Python学习日记4|python爬虫常见报错小结及解决方法
    
    http://www.jianshu.com/p/17c921639ad0
    
    
    
    
    
    #coding=utf-8
    from Tkinter import *
    import  tkMessageBox
    import urllib
    import json
    import mp3play
    import time
    import threading
    from pinyin import PinYin
    import os
    import stat
    # Shared PinYin converter; load_word() is called once here, at import time.
    test = PinYin()
    test.load_word()
    # NOTE(review): `stop` is not read anywhere in the visible code.
    stop=0
    def music():
        """Search for the text in the entry box and list the results.

        Shows an info dialog and returns when the entry box is empty.
        Otherwise the text is converted with ``test.hanzi2pinyin_split``, sent
        to the search endpoint, and each returned song is added to ``listbox``
        as ``name(artist)``.  The global list ``x`` is rebuilt with the
        matching ``audio`` values in the same order, so row *n* of the listbox
        corresponds to ``x[n]``.

        Reads the globals ``entry``, ``test`` and ``listbox``; rebinds ``x``.
        """
        global x
        if not entry.get():
            tkMessageBox.showinfo("温馨提示","搜索内容不能为空")
            return
        name = test.hanzi2pinyin_split(entry.get())
        response = urllib.urlopen("http://s.music.163.com/search/get/?type=1&s=%s&limit=9"%name)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        js=json.loads(html)
        x = []
        # Drop rows from any previous search; otherwise old rows stay below the
        # new ones while x only holds the new URLs, and selecting an old row
        # indexes past the end of x.
        listbox.delete(0, "end")
        for n, i in enumerate(js['result']['songs']):
            listbox.insert(n,'%s(%s)'%(i['name'],i['artists'][0]['name']))
            x.append(i['audio'])
    # Incremented by play() on every call and used in its temp file name
    # ("tmp<count>.mp3").
    count = 0
    #isplaying = None
    def play():
        """Download the song selected in the listbox and play it.

        Bumps the global ``count``, saves the selected entry's URL (taken from
        the global list ``x``) to ``tmp<count>.mp3``, updates the status text in
        ``var1`` before and after the download, then starts playback through
        ``mp3play`` and sleeps for the clip's reported length.

        Started as the thread target by ``excute``; takes no arguments and
        returns None.
        """
        global count
        count += 1
        # Tuple of selected row indices; index[0] is used to look up the URL.
        index=listbox.curselection()
        # Status label: "loading <row text>".
        var1.set(u"正在加载"+listbox.get(index,last=None))
        urllib.urlretrieve(x[index[0]],'tmp%s.mp3'%str(count))
        # Status label: "now playing <row text>".
        var1.set(u"正在播放"+listbox.get(index,last=None))
        mp3=mp3play.load("tmp%s.mp3"%str(count))
        mp3.play()
        # Block for mp3.seconds(); presumably this keeps the thread (and the
        # mp3 object) alive until the clip ends -- confirm against mp3play.
        time.sleep(mp3.seconds())
    
    import inspect
    import ctypes
    
    def _async_raise(tid, exctype):
        """raises the exception, performs cleanup if needed"""
        tid = ctypes.c_long(tid)
        if not inspect.isclass(exctype):
            exctype = type(exctype)
        res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
        if res == 0:
            raise ValueError("invalid thread id")
        elif res != 1:
            ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
            raise SystemError("PyThreadState_SetAsyncExc failed")
    
    def stop_thread(thread):
        """Request that *thread* exit by raising SystemExit in it via _async_raise.

        :param thread: object whose ``ident`` attribute identifies the target
            thread.
        """
        target_ident = thread.ident
        _async_raise(target_ident, SystemExit)
    # Playback threads tracked by excute() so that they can be stopped.
    threads=list()
    # Most recently started playback thread; None until excute() first runs.
    t=None
    def excute(event):
        """Listbox double-click handler: stop earlier playback, start a new one.

        :param event: the Tk event object passed by ``bind`` (unused).

        Threads in ``threads`` that are still alive are asked to stop, then a
        new daemon thread running ``play`` is started, stored in the global
        ``t`` and appended to ``threads``.
        """
        global  t
        # Forget threads that have already finished: stop_thread() raises
        # ValueError for them, which would abort this callback before the new
        # playback thread is started.
        alive = [worker for worker in threads if worker.is_alive()]
        threads[:] = alive
        for worker in alive:
            try:
                stop_thread(worker)
            except ValueError:
                # The thread ended between the is_alive() check and the stop
                # request; nothing is left to stop.
                pass
        t = threading.Thread(target=play)
        t.daemon = True
        t.start()
        threads.append(t)
    # Build the window: search box, search button, result list and a status
    # label, packed top to bottom in that order.
    root = Tk()# create the main window
    root.title("云音乐")
    root.geometry("500x300+500+200")
    entry=Entry(root)# single-line input box, parented to root
    entry.pack()
    btn=Button(root,text="搜 索",command=music)
    btn.pack()# every widget must use the same layout manager
    var=StringVar()
    listbox=Listbox(root,width=50,listvariable=var)
    # Double-clicking a row starts playback through excute().
    listbox.bind('<Double-Button-1>',excute)
    listbox.pack()
    # var1 backs the status label; play() updates it while loading/playing.
    var1=StringVar()
    label=Label(root,text="云音乐播放器",fg="purple",textvariable=var1)
    var1.set("云音乐播放器")
    label.pack()
    root.mainloop()# enter the Tk event loop (shows the window)
  • 相关阅读:
    Django之admin
    CSS弹性盒子
    SQL SERVER按多字段查找重复的数据并删除只保留一条
    计算机名称改名之后,tfs连接问题
    Docker镜像仓库Harbor部署
    搭建docker本地仓库
    部署docker swarm集群
    Dockerfile
    centos 7 安装docker 常用指令
    python软件安装-Windows
  • 原文地址:https://www.cnblogs.com/effortsing/p/10049469.html
Copyright © 2011-2022 走看看