  • Crawler: download the images on every page of a Baidu Tieba forum

    import os
    import urllib.request
    import urllib.parse
    import ssl
    # Skip HTTPS certificate verification so urlopen does not fail on certificate errors
    ssl._create_default_https_context = ssl._create_unverified_context
    from lxml import etree
    
    def loadPage(url):
        """Fetch one forum index page and crawl every thread listed on it."""
        headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
        request = urllib.request.Request(url, headers=headers)
        html = urllib.request.urlopen(request).read()
    
        # Parse the HTML document into an lxml element tree
        content = etree.HTML(html)
        # XPath returns a list of every matching thread link (href attributes)
        link_list = content.xpath('//li[@class=" j_thread_list clearfix"]//div[@class="threadlist_title pull_left j_th_tit "]/a/@href')
        print(link_list, len(link_list))
    
        for link in link_list:
            fulllink = "http://tieba.baidu.com" + link      # absolute URL of each thread
            loadImage(fulllink)
    
    # Extract the link of every image inside a single thread and download it
    def loadImage(link):
        headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
        request = urllib.request.Request(link, headers=headers)
        html = urllib.request.urlopen(request).read()
        content = etree.HTML(html)
    
        # XPath returns a list of every image URL in the thread body
        link_list = content.xpath('//img[@class="BDE_Image"]/@src')
        for link in link_list:
            filename = link[-15:]       # use the last 15 characters of the image URL as the file name
            urllib.request.urlretrieve(link, './tieba/' + filename)
            print("Downloaded" + '----' + filename)
    
    
    def tiebaSpider(url, beginPage, endPage):
        # Tieba paginates with the pn query parameter, which advances by 50 threads per page
        for page in range(beginPage, endPage + 1):
            pn = (page - 1) * 50
            fullurl = url + "&pn=" + str(pn)
            loadPage(fullurl)
    
    
    if __name__ == "__main__":
        kw = input("Enter the name of the Tieba forum to crawl: ")
        startPage = int(input("Enter the start page: "))
        endPage = int(input("Enter the end page: "))

        url = "https://tieba.baidu.com/f?"

        # urlencode({"kw": kw}) percent-encodes the keyword, e.g. https://tieba.baidu.com/f?kw=美女
        # (equivalent to url + 'kw=' + kw, but safe for non-ASCII keywords)
        key = urllib.parse.urlencode({"kw": kw})
        fullurl = url + key

        # Make sure the download directory exists before urlretrieve writes into it
        os.makedirs('./tieba', exist_ok=True)

        tiebaSpider(fullurl, startPage, endPage)
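
For reference, here is a minimal standalone sketch of the request URLs the spider ends up building, assuming the forum name "python" and pages 1 to 3 (example values, not from the original post); it only mirrors the urlencode and pn arithmetic above and performs no network requests:

    import urllib.parse

    kw = "python"                         # example forum name, chosen for illustration
    base = "https://tieba.baidu.com/f?" + urllib.parse.urlencode({"kw": kw})
    for page in range(1, 4):              # pages 1..3
        pn = (page - 1) * 50              # Tieba lists 50 threads per page
        print(base + "&pn=" + str(pn))

    # Output:
    # https://tieba.baidu.com/f?kw=python&pn=0
    # https://tieba.baidu.com/f?kw=python&pn=50
    # https://tieba.baidu.com/f?kw=python&pn=100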
    
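The two XPath expressions do all of the extraction work, so it can help to run them against a small hand-written HTML fragment (the class names are copied from the script; the surrounding markup is assumed for illustration and is not real Tieba HTML):

    from lxml import etree

    # Minimal HTML imitating the structure the two XPath expressions expect
    sample = '''
    <ul>
      <li class=" j_thread_list clearfix">
        <div class="threadlist_title pull_left j_th_tit ">
          <a href="/p/1234567890">example thread</a>
        </div>
      </li>
    </ul>
    <img class="BDE_Image" src="https://example.com/abcdefgh.jpg"/>
    '''

    doc = etree.HTML(sample)
    print(doc.xpath('//li[@class=" j_thread_list clearfix"]'
                    '//div[@class="threadlist_title pull_left j_th_tit "]/a/@href'))
    # ['/p/1234567890']
    print(doc.xpath('//img[@class="BDE_Image"]/@src'))
    # ['https://example.com/abcdefgh.jpg']
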
  • Original article: https://www.cnblogs.com/dongpei/p/9404640.html