zoukankan      html  css  js  c++  java
  • python 火车票爬取代码

    1、根据搜索词下载百度图片:

    # -*- coding: utf-8 -*-
    """根据搜索词下载百度图片"""
    import re
    import sys
    import urllib
    
    import requests
    
    
    def get_onepage_urls(onepageurl):
        """Fetch one Baidu Images result page and parse it.

        Args:
            onepageurl: URL of the current result page; a falsy value means
                there are no more pages to fetch.

        Returns:
            (pic_urls, fanye_url): the image URLs found on this page, and the
            absolute URL of the next page ('' when there is none or on error).
        """
        if not onepageurl:
            print('已到最后一页, 结束')
            return [], ''
        try:
            # A timeout keeps the crawl from hanging forever on a stalled
            # connection (the original call had none).
            resp = requests.get(onepageurl, timeout=15)
            resp.encoding = 'utf-8'
            html = resp.text
        except Exception as e:
            print(e)
            return [], ''
        pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
        # Non-greedy `(.*?)` so only the href of the "next page" anchor is
        # captured, not everything up to the last matching quote on the line.
        fanye_urls = re.findall(r'<a href="(.*?)" class="n">下一页</a>', html)
        fanye_url = 'http://image.baidu.com' + fanye_urls[0] if fanye_urls else ''
        return pic_urls, fanye_url
    
    
    def down_pic(pic_urls):
        """Download every image URL in *pic_urls*, saving them as 1.jpg, 2.jpg, ...

        Failures are reported and skipped so one bad link does not stop the run.
        """
        for index, url in enumerate(pic_urls, start=1):
            try:
                response = requests.get(url, timeout=15)
                filename = '%d.jpg' % index
                with open(filename, 'wb') as fp:
                    fp.write(response.content)
                    print('成功下载第%s张图片: %s' % (str(index), str(url)))
            except Exception as err:
                print('下载第%s张图片时失败: %s' % (str(index), str(url)))
                print(err)
                continue
    
    
    if __name__ == '__main__':
        # `import urllib` alone does NOT import the `urllib.parse` submodule;
        # the original only worked because `import requests` pulls urllib.parse
        # in as a side effect.  Import it explicitly so the script stands alone.
        import urllib.parse

        keyword = '火车票'  # Search term; change it to query Baidu Images for anything.
        url_init_first = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
        url_init = url_init_first + urllib.parse.quote(keyword, safe='/')

        # Collect image URLs page by page until there is no "next page" link.
        all_pic_urls = []
        onepage_urls, fanye_url = get_onepage_urls(url_init)
        all_pic_urls.extend(onepage_urls)

        fanye_count = 0  # pages followed so far
        while True:
            onepage_urls, fanye_url = get_onepage_urls(fanye_url)
            fanye_count += 1
            if not fanye_url and not onepage_urls:
                break
            all_pic_urls.extend(onepage_urls)

        # Deduplicate before downloading.
        down_pic(list(set(all_pic_urls)))
    

    链接:https://blog.csdn.net/xiligey1/article/details/73321152  

    2、根据搜索词下载谷歌、必应、百度图片

    # coding:utf-8
    # 基于icrawler第三方库同时爬取google,baidu,bing图片,并对名称进行重写,数据进行分类
    # 图片存放路径为:base_dir='F:/文档/text'
    
    import logging
    import sys
    import base64
    from datetime import date
    from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler
    from icrawler import ImageDownloader
    from icrawler.builtin import GoogleImageCrawler
    from six.moves.urllib.parse import urlparse
    
    class PrefixNameDownloader(ImageDownloader):
        """Downloader that keeps icrawler's default file name, tagged with 'prefix_'."""

        def get_filename(self, task, default_ext):
            # Delegate naming to the base class, then prepend the fixed tag.
            base_name = super(PrefixNameDownloader, self).get_filename(task, default_ext)
            return 'prefix_{}'.format(base_name)
    
    
    class Base64NameDownloader(ImageDownloader):
        """Downloader that derives each file name from the image URL.

        The URL path is base64-encoded so every distinct source URL maps to a
        unique, collision-free file name, and a recognizable image extension is
        kept (falling back to ``default_ext`` for unrecognized ones).
        """

        def get_filename(self, task, default_ext):
            """Return '<encoded url path>.<extension>' for *task*'s image."""
            url_path = urlparse(task['file_url'])[2]
            if '.' in url_path:
                extension = url_path.split('.')[-1]
                if extension.lower() not in [
                        'jpg', 'jpeg', 'png', 'bmp', 'tiff', 'gif', 'ppm', 'pgm'
                ]:
                    extension = default_ext
            else:
                extension = default_ext
            # Bug fix: the standard base64 alphabet contains '/' (and '+');
            # a '/' inside a file name is treated as a directory separator and
            # makes saving fail.  The URL-safe alphabet ('-'/'_') avoids that.
            filename = base64.urlsafe_b64encode(url_path.encode()).decode()
            return '{}.{}'.format(filename, extension)
    
    def test_google(dir, keyword):
        """Crawl Google Images for *keyword*, storing results under *dir*."""
        print('启用google爬虫')
        crawler = GoogleImageCrawler(
            parser_threads=20,
            downloader_threads=20,
            downloader_cls=Base64NameDownloader,
            storage={'root_dir': dir},
            log_level=logging.INFO,
        )
        # Only images at least 200x200 are kept for the Google backend.
        crawler.crawl(keyword=keyword, offset=0, max_num=1000,
                      min_size=(200, 200), max_size=None)
    
    
    def test_bing(dir, keyword):
        """Crawl Bing Images for *keyword*, storing results under *dir*."""
        # Strip the Google-style site filter, which Bing does not understand.
        keyword = keyword.replace(': flickr.com', '')
        print('启用bing爬虫', keyword)
        crawler = BingImageCrawler(
            downloader_cls=Base64NameDownloader,
            downloader_threads=16,
            storage={'root_dir': dir},
            log_level=logging.DEBUG,
        )
        crawler.crawl(keyword=keyword, offset=0, max_num=1000,
                      min_size=None, max_size=None)
    
    def test_baidu(dir, keyword):
        """Crawl Baidu Images for *keyword*, storing results under *dir*."""
        # Strip the Google-style site filter, which Baidu does not understand.
        keyword = keyword.replace(': flickr.com', '')
        print('启用百度爬虫', keyword)
        crawler = BaiduImageCrawler(
            downloader_cls=Base64NameDownloader,
            storage={'root_dir': dir},
            log_level=logging.DEBUG,
        )
        crawler.crawl(keyword=keyword, offset=0, max_num=1000,
                      min_size=None, max_size=None)
    
    
    def main():
        """Entry point: choose crawler backends from argv and run each one.

        With no CLI arguments every backend runs; otherwise each argument
        names a backend ('google', 'bing', 'baidu', or 'all').
        """
        keyword = '火车票'
        base_dir = 'F:/文档/text'
        dst = 'all' if len(sys.argv) == 1 else sys.argv[1:]
        if 'all' in dst:
            dst = ['google', 'bing', 'baidu']
        if 'google' in dst:
            test_google(base_dir, keyword)
        if 'bing' in dst:
            test_bing(base_dir, keyword)
        if 'baidu' in dst:
            test_baidu(base_dir, keyword)
    
    
    # Launch the crawlers only when executed as a script, not on import.
    if __name__ == '__main__':
        main()
    

      

    链接:https://github.com/Crawler-y/Image_crawl-

    3、github 搜索爬虫,有许多有趣的项目。

  • 相关阅读:
    Python 存储引擎 数据类型 主键
    Python 数据库
    Python 线程池进程池 异步回调 协程 IO模型
    Python GIL锁 死锁 递归锁 event事件 信号量
    Python 进程间通信 线程
    Python 计算机发展史 多道技术 进程 守护进程 孤儿和僵尸进程 互斥锁
    Python 异常及处理 文件上传事例 UDP socketserver模块
    Python socket 粘包问题 报头
    Django基础,Day7
    Django基础,Day6
  • 原文地址:https://www.cnblogs.com/Allen-rg/p/9756960.html
Copyright © 2011-2022 走看看