zoukankan      html  css  js  c++  java
  • python 火车票爬取代码

    1、根据搜索词下载百度图片:

    # -*- coding: utf-8 -*-
    """根据搜索词下载百度图片"""
    import re
    import sys
    import urllib
    
    import requests
    
    
    def get_onepage_urls(onepageurl):
        """Fetch one Baidu image-search result page and parse it.

        Args:
            onepageurl: URL of a Baidu image-search "flip" result page.
                A falsy value means there are no more pages.

        Returns:
            (pic_urls, fanye_url): the list of image URLs found on the page
            (from the "objURL" JSON fields) and the absolute URL of the next
            page, or '' when there is none / on any fetch error.
        """
        if not onepageurl:
            print('已到最后一页, 结束')
            return [], ''
        try:
            # Time out instead of hanging forever on a stalled connection.
            resp = requests.get(onepageurl, timeout=15)
            resp.encoding = 'utf-8'
            html = resp.text
        except Exception as e:
            # Best-effort crawler: report the failure and act as if the page
            # were empty so the caller's loop terminates cleanly.
            print(e)
            return [], ''
        pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
        # Non-greedy href match so it cannot swallow later attributes on the line.
        fanye_urls = re.findall(r'<a href="(.*?)" class="n">下一页</a>', html)
        fanye_url = 'http://image.baidu.com' + fanye_urls[0] if fanye_urls else ''
        return pic_urls, fanye_url
    
    
    def down_pic(pic_urls):
        """Download every image in pic_urls to the current directory.

        Files are saved as 1.jpg, 2.jpg, ... in list order. Each failure is
        reported and skipped so one bad URL does not abort the whole batch.

        Args:
            pic_urls: iterable of image URL strings.
        """
        for i, pic_url in enumerate(pic_urls):
            try:
                pic = requests.get(pic_url, timeout=15)
                # Fail on HTTP errors so a 404/500 HTML page is not saved as ".jpg".
                pic.raise_for_status()
                filename = '%d.jpg' % (i + 1)
                with open(filename, 'wb') as f:
                    f.write(pic.content)
                print('成功下载第%s张图片: %s' % (str(i + 1), str(pic_url)))
            except Exception as e:
                print('下载第%s张图片时失败: %s' % (str(i + 1), str(pic_url)))
                print(e)
                continue
    
    
    if __name__ == '__main__':
        # Explicit import: relying on `import urllib` alone only works here
        # because `requests` happens to import urllib.parse as a side effect.
        from urllib.parse import quote

        keyword = '火车票'  # 关键词, 改为你想输入的词即可, 相当于在百度图片里搜索一样
        url_init_first = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
        url_init = url_init_first + quote(keyword, safe='/')
        all_pic_urls = []
        onepage_urls, fanye_url = get_onepage_urls(url_init)
        all_pic_urls.extend(onepage_urls)

        fanye_count = 0  # 累计翻页数
        while 1:
            onepage_urls, fanye_url = get_onepage_urls(fanye_url)
            fanye_count += 1
            # Stop once a page yields neither images nor a next-page link.
            if fanye_url == '' and onepage_urls == []:
                break
            all_pic_urls.extend(onepage_urls)

        # Deduplicate before downloading: pages can repeat image URLs.
        down_pic(list(set(all_pic_urls)))
    

    链接:https://blog.csdn.net/xiligey1/article/details/73321152  

    2、根据搜索词下载谷歌、必应、百度图片

    # coding:utf-8
    # 基于icrawler第三方库同时爬取google,baidu,bing图片,并对名称进行重写,数据进行分类
    # 图片存放路径为:base_dir='F:/文档/text'
    
    import logging
    import sys
    import base64
    from datetime import date
    from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler
    from icrawler import ImageDownloader
    from icrawler.builtin import GoogleImageCrawler
    from six.moves.urllib.parse import urlparse
    
    class PrefixNameDownloader(ImageDownloader):
        """Downloader that keeps icrawler's default filename, adding a fixed prefix."""

        def get_filename(self, task, default_ext):
            # Delegate to the stock naming scheme, then tag the result.
            base = super(PrefixNameDownloader, self).get_filename(task, default_ext)
            return 'prefix_' + base
    
    
    class Base64NameDownloader(ImageDownloader):
        """Downloader that names each file by the base64 of its URL path.

        Encoding the URL path yields a stable, collision-resistant filename
        per image URL; the extension is taken from the URL when recognized,
        otherwise icrawler's default is used.
        """

        # Extensions accepted as-is; anything else falls back to default_ext.
        _KNOWN_EXTS = ('jpg', 'jpeg', 'png', 'bmp', 'tiff', 'gif', 'ppm', 'pgm')

        def get_filename(self, task, default_ext):
            url_path = urlparse(task['file_url'])[2]
            if '.' in url_path:
                extension = url_path.split('.')[-1]
                if extension.lower() not in self._KNOWN_EXTS:
                    extension = default_ext
            else:
                extension = default_ext
            # urlsafe variant: plain b64encode can emit '/' (illegal in file
            # names on every OS) and '+'; urlsafe maps them to '-' and '_'.
            filename = base64.urlsafe_b64encode(url_path.encode()).decode()
            return '{}.{}'.format(filename, extension)
    
    def test_google(dir, keyword):
        """Crawl Google Images for `keyword`, storing files under `dir`."""
        print('启用google爬虫')
        crawler = GoogleImageCrawler(
            parser_threads=20,
            downloader_threads=20,
            downloader_cls=Base64NameDownloader,
            storage={'root_dir': dir},
            log_level=logging.INFO,
        )
        # Only images at least 200x200 are kept; no upper size bound.
        crawler.crawl(keyword=keyword, offset=0, max_num=1000,
                      min_size=(200, 200), max_size=None)
    
    
    def test_bing(dir, keyword):
        """Crawl Bing Images for `keyword`, storing files under `dir`."""
        # Strip the site qualifier some callers append to the search term.
        keyword = keyword.replace(': flickr.com', '')
        print('启用bing爬虫', keyword)
        crawler = BingImageCrawler(
            downloader_cls=Base64NameDownloader,
            downloader_threads=16,
            storage={'root_dir': dir},
            log_level=logging.DEBUG,
        )
        crawler.crawl(keyword=keyword, offset=0, max_num=1000,
                      min_size=None, max_size=None)
    
    def test_baidu(dir, keyword):
        """Crawl Baidu Images for `keyword`, storing files under `dir`."""
        # Strip the site qualifier some callers append to the search term.
        keyword = keyword.replace(': flickr.com', '')
        print('启用百度爬虫', keyword)
        crawler = BaiduImageCrawler(
            downloader_cls=Base64NameDownloader,
            storage={'root_dir': dir},
            log_level=logging.DEBUG,
        )
        crawler.crawl(keyword=keyword, offset=0, max_num=1000,
                      min_size=None, max_size=None)
    
    
    def main():
        """Dispatch image crawls to the engines named on the command line.

        With no arguments every engine runs; otherwise each argument selects
        one of 'google', 'bing', 'baidu', and 'all' expands to all three.
        """
        keyword = '火车票'
        base_dir = 'F:/文档/text'
        # No args -> the sentinel string 'all' ('all' in 'all' is True below).
        dst = sys.argv[1:] if len(sys.argv) > 1 else 'all'
        if 'all' in dst:
            dst = ['google', 'bing', 'baidu']
        if 'google' in dst:
            test_google(base_dir, keyword)
        if 'bing' in dst:
            test_bing(base_dir, keyword)
        if 'baidu' in dst:
            test_baidu(base_dir, keyword)
    
    
    # Run the crawlers only when executed as a script, not on import.
    if __name__ == '__main__':
        main()
    

      

    链接:https://github.com/Crawler-y/Image_crawl-

    3、github 搜索爬虫,有许多有趣的项目。

  • 相关阅读:
    Educational Codeforces Round 86 (Rated for Div. 2) D. Multiple Testcases
    Educational Codeforces Round 86 (Rated for Div. 2) C. Yet Another Counting Problem
    HDU
    HDU
    HDU
    HDU
    Good Bye 2019 C. Make Good (异或的使用)
    Educational Codeforces Round 78 (Rated for Div. 2) C. Berry Jam
    codeforces 909C. Python Indentation
    codeforces1054 C. Candies Distribution
  • 原文地址:https://www.cnblogs.com/Allen-rg/p/9756960.html
Copyright © 2011-2022 走看看