zoukankan      html  css  js  c++  java
  • 爬取百度贴吧楼主图片

    import urllib.request
    from bs4 import BeautifulSoup
    import re
    
    
    def validateTitle(title):
        """Return *title* made safe for use as part of a file name.

        Every character in the set ``\\ / : * ? " < > |`` is replaced with an
        underscore; all other characters are kept unchanged.

        Args:
            title: The text to sanitize (here, the page title).

        Returns:
            The sanitized string.
        """
        # Single quotes delimit the raw string so the double quote inside the
        # character class no longer terminates the literal.
        rstr = r'[\\/:*?"<>|]'  # \ / : * ? " < > |
        new_title = re.sub(rstr, "_", title)  # replace each match with "_"
        return new_title
    
    # Thread to scrape. NOTE(review): "see_lz=1" presumably limits the thread to
    # the original poster's posts -- confirm against Tieba's URL scheme.
    url = "https://tieba.baidu.com/p/5407739329?see_lz=1"

    # Fetch the first page once, only to learn how many pages the thread has.
    request = urllib.request.Request(url)
    # The response is closed as soon as it has been parsed, so the connection
    # is not leaked.
    with urllib.request.urlopen(request) as response:
        soup = BeautifulSoup(response, 'lxml')

    try:
        # Get the total page count: locate the text node containing "回复贴"
        # and read the number from the <span> that follows it.
        a = soup.find(text=re.compile("回复贴"))
        total_page = int(a.find_next_sibling('span').string)

        # range() is empty when total_page <= 0, so no explicit guard is needed.
        for j in range(1, total_page + 1):
            url = "https://tieba.baidu.com/p/5407739329?see_lz=1&pn=" + str(j)
            request1 = urllib.request.Request(url)
            with urllib.request.urlopen(request1) as response1:
                soup1 = BeautifulSoup(response1, 'lxml')
            title = soup1.title.string
            # Images in posts carry the CSS class "BDE_Image".
            link = soup1.find_all('img', class_="BDE_Image")
            # i numbers the images within page j, starting at 1.
            for i, li in enumerate(link, start=1):
                print(li.get('src'))
                # File name: <sanitized page title><page number>-<image number>.jpg
                file_name = "D:/www/spider/" + validateTitle(title) + str(j) + "-" + str(i) + ".jpg"
                print(file_name)
                urllib.request.urlretrieve(li.get('src'), file_name)
    except Exception as e:
        # Top-level boundary of the script: report the failure and stop.
        print(e)
    

      

  • 相关阅读:
    弱鸡儿长乐爆肝旅Day8
    弱鸡儿终于没爆零Day7
    弱鸡儿长乐爆零旅Day6
    弱鸡儿长乐爆零旅Day5
    弱鸡儿长乐爆零旅Day4
    D1字符串哈希
    Tarjan算法
    弱鸡儿长乐爆零旅Day3
    弱鸡儿长乐爆零旅Day2
    弱鸡儿长乐爆零旅Day1
  • 原文地址:https://www.cnblogs.com/brady-wang/p/8330155.html
Copyright © 2011-2022 走看看