zoukankan      html  css  js  c++  java
  • Python: 爬取百度贴吧图片

    练习之代码片段,以做备忘:

    # encoding=utf8
    
    from __future__ import unicode_literals
    import urllib, urllib2
    import re
    import os
    import threading
    
    
    def get_html(url):
        """Fetch ``url`` and return the raw response body.

        The URL is UTF-8 encoded before the request is issued. Any failure is
        reported on stdout and ``None`` is returned instead of raising, so
        callers must be prepared for a ``None`` result.
        """
        try:
            # Keep the caller's value in ``url`` for the error message below:
            # interpolating the encoded byte string into the message template
            # can itself fail when the URL holds non-ASCII characters.
            encoded_url = url.encode('utf-8')
            page = urllib2.urlopen(encoded_url)
            try:
                return page.read()
            finally:
                # Release the connection even when read() raises.
                page.close()
        except Exception as ex:
            print('get url_%s html error, ex=%s' % (url, ex))
            return None
    
    
    def get_images(url):
        """Return the ``.jpg`` URLs of the post images found on page ``url``.

        Only ``<img>`` tags carrying ``class="BDE_Image"`` are considered,
        whatever the order of their attributes. The result keeps document
        order with duplicates removed, and is empty when the page could not
        be fetched.
        """
        # get_html() does the UTF-8 encoding itself; encoding here as well
        # would encode twice.
        html = get_html(url)
        if not html:
            return []

        # Work one tag at a time: ``[^>]*`` cannot run past the closing '>',
        # whereas a greedy ``.+`` swallows every tag on the same line and
        # yields a single match for the whole line.
        tag_pattern = r'<img\b[^>]*\sclass="BDE_Image"[^>]*>'
        # The dot is escaped so that only a real ".jpg" suffix is accepted.
        src_pattern = r'\ssrc="([^"]+\.jpg)"'

        img_list = []
        seen = set()
        for tag in re.findall(tag_pattern, html):
            src_match = re.search(src_pattern, tag)
            if not src_match:
                continue
            img_url = src_match.group(1)
            # Order-preserving de-duplication.
            if img_url not in seen:
                seen.add(img_url)
                img_list.append(img_url)
        return img_list
    
    
    def get_page_count(url):
        """Return the total number of pages of the thread at ``url``.

        The value is read from the ``"total_page":<n>`` fragment in the page
        source. It is returned as the matched digit string, or as the integer
        ``0`` when the page could not be fetched or contains no such fragment,
        so callers should pass the result through ``int()``.
        """
        html = get_html(url)
        if not html:
            return 0
        # ``\d+`` (digits); a bare ``d+`` would only match literal "d" letters.
        m = re.search(r'"total_page":(\d+)', html)
        return m.group(1) if m else 0
    
    
    def get_page_urls(html):
        """Return the absolute thread URLs linked from a forum list page.

        ``html`` is the source of the list page. Every ``<a href="/p/<id>"``
        link with a numeric id is turned into
        ``https://tieba.baidu.com/p/<id>``, in document order. An empty or
        missing page yields an empty list.
        """
        if not html:
            return []
        # ``\d+`` (digits); a bare ``d+`` would only match literal "d" letters.
        thread_ids = re.findall(r'<a href="/p/(\d+)"', html)
        return ['https://tieba.baidu.com/p/%s' % thread_id
                for thread_id in thread_ids]
    
    
    def download_page_images(page_url):
        """Download every post image of the thread at ``page_url``.

        Images are written to ``images/<thread id>/<page>_<index>.jpg`` under
        the current working directory. Files that already exist are not
        downloaded again. The thread title and progress are printed to stdout.
        Nothing is done when the thread page cannot be fetched.
        """
        html = get_html(page_url)
        if not html:
            return

        title_match = re.search(r'(?<=<title>)(.*)(?=</title>)', html)
        if title_match:
            print(title_match.group(1))

        # The thread id is the first run of digits in the URL. ``\d+`` is
        # required here: a bare ``d+`` matches the "d" in "baidu".
        page_no = re.search(r'(\d+)', page_url).group(0)
        page_count = int(get_page_count(page_url))
        print('page: %s, page_count: %d' % (page_no, page_count))

        # os.path.join keeps the layout portable instead of hard-coding '\'.
        img_folder = os.path.join('images', page_no)

        for page_idx in range(1, page_count + 1):
            url = page_url + '?pn=%d' % page_idx
            img_list = get_images(url)
            if not img_list:
                continue

            print('page index: %d, image_count: %d' % (page_idx, len(img_list)))
            if not os.path.isdir(img_folder):
                # Creates the parent 'images' directory as well when missing.
                os.makedirs(img_folder)

            for idx, img_url in enumerate(img_list):
                img_filename = os.path.join(
                    img_folder, '%d_%d.jpg' % (page_idx, idx))
                if not os.path.exists(img_filename):
                    urllib.urlretrieve(img_url, img_filename)
    
    
    def main():
        """Crawl the forum's thread list pages and download each thread's images.

        Walks ``max_pagecount`` list pages (50 threads per page, selected via
        the ``pn`` offset), downloads the images of every thread found, and
        pauses between threads. A failure in one thread is reported and does
        not stop the crawl.
        """
        # Local import: the file's top-level imports do not include ``time``.
        import time

        # Maximum number of list pages to crawl.
        max_pagecount = 30
        # Query parameters are separated by '&'; with '?' before ``pn`` the
        # offset becomes part of the ``ie`` value and is never applied.
        base_url = r'https://tieba.baidu.com/f?kw=图片&ie=utf-8&pn=%s'

        # Crawl page by page; ``pn`` is the offset of the first thread.
        for idx in range(max_pagecount):
            url = base_url % (idx * 50)
            html = get_html(url)
            if not html:
                # get_html() has already reported the failure.
                continue
            for page_url in get_page_urls(html):
                try:
                    download_page_images(page_url)
                except Exception as ex:
                    # Best effort: report and move on to the next thread.
                    # ``Exception`` leaves Ctrl-C (KeyboardInterrupt) working.
                    print('download %s failed, ex=%r' % (page_url, ex))
                # Throttle requests, after failures too.
                time.sleep(2)
    
    
    # Start the crawl only when this file is executed as a script, not when
    # it is imported as a module.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    IntelliJ IDEA快捷键
    Find Minimum in Rotated Sorted Array
    爬取淘宝交易记录的爬虫
    MR并行算法编程过程中遇到问题的思考
    Map.Entry用法示例
    给定一组数和一个目标值,返回和为目标值的集合(集合中的元素可重复)
    位运算:获取集合的子集
    Linux每次开机都要source profile的解决办法
    mysql数据导入导出
    linux下nginx编译安装(抄别人的,方便查看)
  • 原文地址:https://www.cnblogs.com/crwy/p/7444009.html
Copyright © 2011-2022 走看看