zoukankan      html  css  js  c++  java
  • 爬虫--百度贴吧每一页中的图片

    import urllib.request
    import urllib.parse
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context
    from lxml import etree
    
    def loadPage(url):
        """Fetch one thread-list page and process every thread linked from it.

        Downloads ``url``, extracts the ``href`` of each thread-title anchor
        with XPath, prints the list of hrefs together with its length, and
        calls ``loadImage`` once per thread URL.

        Args:
            url: Full URL of the thread-list page to fetch.

        Raises:
            urllib.error.URLError: If the page cannot be retrieved.
        """
        headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1 Trident/5.0;"}
        request = urllib.request.Request(url, headers=headers)
        # Use the response as a context manager so the underlying connection
        # is closed as soon as the body has been read.
        with urllib.request.urlopen(request) as response:
            html = response.read()

        # Parse the raw HTML into an element tree that supports XPath queries.
        content = etree.HTML(html)
        # Collect the href of every thread-title anchor on the page.
        link_list = content.xpath('//li[@class=" j_thread_list clearfix"]//div[@class="threadlist_title pull_left j_th_tit "]/a/@href')
        print(link_list, len(link_list))

        for link in link_list:
            # Prefix the host to build the thread URL (assumes the hrefs are
            # site-relative paths -- TODO confirm).
            fulllink = "http://tieba.baidu.com" + link
            loadImage(fulllink)
    
    # Extract the link of every image in each thread and download it
    def loadImage(link):
        """Download every ``BDE_Image`` image found in one thread page.

        Fetches ``link``, collects the ``src`` of each ``<img class="BDE_Image">``
        element, and saves each image under ``./tieba/``. The file name is the
        last 15 characters of the image URL. A success message is printed for
        each saved file.

        Args:
            link: Full URL of the thread page to scan for images.

        Raises:
            urllib.error.URLError: If the page or an image cannot be retrieved.
        """
        import os

        headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1 Trident/5.0;"}
        request = urllib.request.Request(link, headers=headers)
        # Close the HTTP response deterministically once the body is read.
        with urllib.request.urlopen(request) as response:
            html = response.read()
        content = etree.HTML(html)

        # urlretrieve does not create missing directories, so make sure the
        # download folder exists before the first file is written.
        os.makedirs('./tieba', exist_ok=True)

        # src attributes of all images embedded in the thread.
        image_links = content.xpath('//img[@class="BDE_Image"]/@src')
        for image_link in image_links:
            # NOTE(review): the name is simply the URL's last 15 characters;
            # it is not checked for path separators or query characters.
            filename = image_link[-15:]
            urllib.request.urlretrieve(image_link, './tieba/' + filename)
            print("下载成功"+'----'+filename)
    
    
    def tiebaSpider(url, beginPage, endPage):
        """Crawl the thread-list pages ``beginPage`` through ``endPage`` inclusive.

        Page ``p`` is requested with the offset ``(p - 1) * 50`` appended to
        ``url`` as the ``pn`` query parameter, and each resulting URL is passed
        to ``loadPage``.

        Args:
            url: Base list URL that already carries the ``kw`` query parameter.
            beginPage: First page number to crawl.
            endPage: Last page number to crawl (inclusive).
        """
        # Walk the pn offsets directly: one step of 50 per page.
        first_offset = (beginPage - 1) * 50
        for offset in range(first_offset, endPage * 50, 50):
            loadPage(url + f"&pn={offset}")
    
    
    if __name__ == "__main__":
        # Ask for the forum name and the inclusive page range to crawl.
        kw = input("请输入要爬取的贴吧名:")
        startPage = int(input("请输入起始页:"))
        endPage = int(input("请输入结束页:"))

        # urlencode percent-encodes the forum name into a "kw=..." query
        # string, which is appended to the list-page endpoint.
        query = urllib.parse.urlencode({"kw": kw})
        tiebaSpider("https://tieba.baidu.com/f?" + query, startPage, endPage)
    
  • 相关阅读:
    348. Design Tic-Tac-Toe
    347. Top K Frequent Elements
    346. Moving Average from Data Stream
    345. Reverse Vowels of a String
    343. Integer Break
    342. Power of Four
    341. Flatten Nested List Iterator
    340. Longest Substring with At Most K Distinct Characters
    339. Nested List Weight Sum
    Python(九) Python的高级语法与用法
  • 原文地址:https://www.cnblogs.com/dongpei/p/9404640.html
Copyright © 2011-2022 走看看