zoukankan      html  css  js  c++  java
  • 运用python3中的urllib爬取贴吧的图片

      运用python3中的urllib爬取贴吧的图片:

    import os
    import re
    import urllib
    import urllib.request
    from urllib import parse

    import lxml
    import lxml.etree
    #抓取贴吧页面数量信息
    def gettiebalistnumbers(name):
        """Look up a tieba (forum) by name and return (post_count, follower_count).

        Fetches the forum's front page and scrapes the two counters shown in the
        header card with regexes.

        :param name: forum name (the ``kw`` query parameter)
        :return: tuple ``(post_count, follower_count)`` as ints
        """
        url = "https://tieba.baidu.com/f?"
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"}  # header dict
        word = {"kw": name}  # query parameter: the forum name
        word = parse.urlencode(word)  # percent-encode into a query string
        url = url + word  # assemble the final URL
        request = urllib.request.Request(url, headers=headers)
        # Headers can also be added/changed after construction via add_header():
        request.add_header("Connection", "keep-alive")
        response = urllib.request.urlopen(request)
        data = response.read().decode("utf-8")
        print(response.code)  # HTTP status code, for debugging

        # FIX: original pattern had unescaped inner double quotes (SyntaxError)
        # and [sS] where [\s\S] (match anything, incl. newlines) was intended.
        restr = r'<span class="card_infoNum">([\s\S]*?)</span>'  # total posts in this forum
        regex = re.compile(restr, re.IGNORECASE)
        mylist = regex.findall(data)
        # Strip thousands separators; int() replaces the original unsafe eval().
        tienumbers = int(mylist[0].replace(",", ""))

        restr = r'<span class="card_menNum">([\s\S]*?)</span>'  # follower count
        regex = re.compile(restr, re.IGNORECASE)
        mylist = regex.findall(data)
        Peoplenumbers = int(mylist[0].replace(",", ""))
        return tienumbers, Peoplenumbers
    
    def gettiebalist(name):
        """Return the list of listing-page URLs for the tieba named *name*.

        :param name: forum name to search
        :return: list of ``https://tieba.baidu.com/f?kw=...&pn=N`` URLs
        """
        numberstuple = gettiebalistnumbers(name)  # (post_count, follower_count)
        # FIX: the original indexed [1] (follower count) although its own
        # comment says the post count is wanted; that lives at index [0].
        tienumbers = numberstuple[0]
        word = parse.urlencode({"kw": name})  # encode the forum name once
        # Ceiling division replaces the original duplicated if/else branches.
        # NOTE(review): divisor 53 vs. the pn step of 50 looks inconsistent
        # (tieba lists 50 threads per page) -- kept as-is, TODO confirm.
        pages = (tienumbers + 52) // 53
        tiebalist = ["https://tieba.baidu.com/f?" + word + "&pn=" + str(i * 50)
                     for i in range(pages)]
        return tiebalist
    def geturllistformpage(url):
        """Scrape one forum listing page and return the URLs of its threads.

        :param url: one listing-page URL produced by ``gettiebalist``
        :return: list of absolute thread URLs
        """
        headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        data = response.read().decode("utf-8", "ignore")  # tolerate bad bytes

        # FIX: unescaped quotes and "d+" instead of "\d+" made the original
        # pattern invalid. The original thread-list-table regex was also dead
        # code (its result was never used; findall below already scans the
        # whole page), so it has been dropped.
        restr = r'href="/p/(\d+)"'  # numeric thread id in each link
        regex = re.compile(restr, re.IGNORECASE)
        urltitlelist = regex.findall(data)
        # Rebuild absolute thread URLs from the numeric ids.
        urllist = ["http://tieba.baidu.com/p/" + tid for tid in urltitlelist]
        return urllist
    
    def urllistfrompage(url):
        """Given one thread URL, return the URLs of every page of that thread.

        :param url: a thread URL (``http://tieba.baidu.com/p/<id>``)
        :return: list of ``<url>?pn=N`` URLs, N = 1 .. page count
        """
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"}  # header dict
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        data = response.read()  # lxml parses raw bytes directly, no decode needed
        mytree = lxml.etree.HTML(data)
        # FIX: the original XPath literal had unescaped inner quotes; int()
        # replaces the unsafe eval() on scraped text.
        # Last <span> under the l_reply_num element holds the page count.
        numbers = int(mytree.xpath('//*[@class="l_reply_num"]//span[last()]/text()')[0])
        urllist = [url + "?pn=" + str(i) for i in range(1, numbers + 1)]
        return urllist
    def getjpglistfrompage(url):
        """Return the image ``src`` URLs embedded in one thread page.

        :param url: one thread-page URL produced by ``urllistfrompage``
        :return: list of image URLs (BDE_Image elements)
        """
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"}  # header dict
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        data = response.read()
        mytree = lxml.etree.HTML(data)
        # FIX: the original XPath literal had unescaped inner quotes.
        jpgurllist = mytree.xpath('//*[@class="BDE_Image"]/@src')
        return jpgurllist
    
    name = "关晓彤"
    jpgnumbers = 0  # running counter, used to name the saved files 1.jpg, 2.jpg, ...
    # FIX: urlretrieve raises FileNotFoundError if the target directory is
    # missing -- create it up front (no-op when it already exists).
    os.makedirs("jpg", exist_ok=True)
    for souurl in gettiebalist(name):                # every listing page of the forum
        for fenurl in geturllistformpage(souurl):    # every thread on that page
            for rev in urllistfrompage(fenurl):      # every page of that thread
                for jpgurl in getjpglistfrompage(rev):   # every image on that page
                    jpgnumbers += 1
                    urllib.request.urlretrieve(jpgurl, "jpg/" + str(jpgnumbers) + ".jpg")
  • 相关阅读:
    软件技术发展的几个阶段
    MOOONscheduler核心设计图(初稿)
    Write Read Writeln Readln console
    Win32Check对Windows操作 注销 重新启动 关闭计算机_Win32Check
    WM_nclButtonDblClk响应标题栏事件_message
    使用 “+”号实现多个字符串的连接
    TRichEdit_控制TRichEdit组件滚动
    取得字符串中指定的字符str[]
    undo RichEdit1
    使Memo 原有的右键功能失效 _OnContextPopup
  • 原文地址:https://www.cnblogs.com/my-global/p/12460855.html
Copyright © 2011-2022 走看看