zoukankan      html  css  js  c++  java
  • 爬取汽车之家

    依赖

    爬取汽车之家用到了Python的两个库:

    • requests:模拟浏览器发送请求
    • BeautifulSoup4:解析爬取的数据

    这两个库都需要我们手动下载:

    pip install requests
    pip install BeautifulSoup4

    简单爬取汽车之家新闻页首页

    import os
    import requests
    from bs4 import BeautifulSoup
    
    base_dir = os.path.dirname(__file__)
    
    
    def spider():
        """Scrape the first Autohome news page and download each article image.

        Fetches https://www.autohome.com.cn/news/, walks the ``<li>`` entries
        inside the ``auto-channel-lazyload-article`` container and writes every
        article image into the ``img`` directory under ``base_dir``.

        Returns:
            A list of dicts with the keys ``title``, ``introduction``, ``url``
            and ``img``, one per article whose image was saved. The list is
            empty when the article container is missing from the response.
        """
        # Without a timeout a stalled connection would block the script forever.
        response = requests.get(url='https://www.autohome.com.cn/news/', timeout=10)
        # Original author's note: the page declares charset=gb2312, so the text
        # is decoded as gbk to avoid garbled Chinese characters.
        response.encoding = 'gbk'
        soup = BeautifulSoup(response.text, 'html.parser')
        container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
        articles = []
        if container is None:
            # Error page or changed layout: there is nothing to scrape.
            return articles
        # open() does not create missing directories, so make sure the target exists.
        img_dir = os.path.join(base_dir, 'img')
        os.makedirs(img_dir, exist_ok=True)
        for item in container.find_all(name='li'):
            title_tag = item.find(name='h3')
            intro_tag = item.find(name='p')
            link_tag = item.find(name='a')
            img_tag = item.find(name='img')
            # Skip entries (e.g. placeholders) that lack any of the expected tags.
            if any(tag is None for tag in (title_tag, intro_tag, link_tag, img_tag)):
                continue
            href = link_tag.get('href')
            src = img_tag.get('src')
            if not href or not src:
                continue
            # The 'https:' prefix is prepended exactly as in the original code.
            url = 'https:' + href
            img = 'https:' + src
            # The page only carries the image link; a second request fetches the bytes.
            img_response = requests.get(url=img, timeout=10)
            img_name = img.rsplit('/', 1)[-1]
            file_path = os.path.join(img_dir, img_name)
            with open(file_path, 'wb') as f:
                f.write(img_response.content)
            articles.append({
                'title': title_tag.text,
                'introduction': intro_tag.text,
                'url': url,
                'img': img,
            })
        return articles
    
    
    # Run the scraper only when this file is executed directly as a script.
    if __name__ == '__main__':
        spider()
    

    爬取新闻页前一百页

    import os
    import time
    import requests
    from bs4 import BeautifulSoup
    
    base_dir = os.path.dirname(__file__)
    
    
    def spider(page):
        """Scrape one page of the Autohome news list.

        Args:
            page: Page number substituted into the news-list URL.

        Returns:
            A list of dicts with the keys ``title``, ``introduction``, ``url``
            and ``img``, one per complete article entry on the page. The list
            is empty when the article container is missing from the response.
        """
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url='https://www.autohome.com.cn/news/%s/#liststart' % page, timeout=10)
        # Original author's note: the page declares charset=gb2312, so the text
        # is decoded as gbk to avoid garbled Chinese characters.
        response.encoding = 'gbk'
        soup = BeautifulSoup(response.text, 'html.parser')
        container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
        articles = []
        if container is None:
            # Error page or changed layout: there is nothing to scrape.
            return articles
        for item in container.find_all(name='li'):
            title_tag = item.find(name='h3')
            if title_tag is None:
                continue
            # Prints the raw <h3> tag followed by its text content.
            print(title_tag, title_tag.text)
            intro_tag = item.find(name='p')
            link_tag = item.find(name='a')
            img_tag = item.find(name='img')
            # Skip entries that lack any of the remaining expected tags.
            if intro_tag is None or link_tag is None or img_tag is None:
                continue
            href = link_tag.get('href')
            src = img_tag.get('src')
            if not href or not src:
                continue
            # Only the image URL is collected; the image itself is not downloaded here.
            articles.append({
                'title': title_tag.text,
                'introduction': intro_tag.text,
                'url': 'https:' + href,
                'img': 'https:' + src,
            })
        return articles
    
    
    if __name__ == '__main__':
        # Crawl pages 1..100 one after another and report the wall-clock time.
        began = time.time()
        for page_number in range(1, 101):
            spider(page_number)
        elapsed = time.time() - began
        print('顺序爬取100页共耗时', elapsed)    # author's recorded run: 99.59376955032349
    

    多线程爬取汽车之家新闻页前100页

    import os
    import time
    import requests
    from threading import Thread
    from bs4 import BeautifulSoup
    
    base_dir = os.path.dirname(__file__)
    
    
    def spider(page):
        """Scrape one page of the Autohome news list (run as a thread target).

        Args:
            page: Page number substituted into the news-list URL.

        Returns:
            A list of dicts with the keys ``title``, ``introduction``, ``url``
            and ``img``, one per complete article entry on the page. The list
            is empty when the article container is missing from the response.
        """
        # Without a timeout one stalled connection would keep its thread alive forever.
        response = requests.get(url='https://www.autohome.com.cn/news/%s/#liststart' % page, timeout=10)
        # Original author's note: the page declares charset=gb2312, so the text
        # is decoded as gbk to avoid garbled Chinese characters.
        response.encoding = 'gbk'
        soup = BeautifulSoup(response.text, 'html.parser')
        container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
        articles = []
        if container is None:
            # Error page or changed layout: there is nothing to scrape.
            return articles
        for item in container.find_all(name='li'):
            title_tag = item.find(name='h3')
            if title_tag is None:
                continue
            # Prints the raw <h3> tag followed by its text content.
            print(title_tag, title_tag.text)
            intro_tag = item.find(name='p')
            link_tag = item.find(name='a')
            img_tag = item.find(name='img')
            # Skip entries that lack any of the remaining expected tags.
            if intro_tag is None or link_tag is None or img_tag is None:
                continue
            href = link_tag.get('href')
            src = img_tag.get('src')
            if not href or not src:
                continue
            # Only the image URL is collected; the image itself is not downloaded here.
            articles.append({
                'title': title_tag.text,
                'introduction': intro_tag.text,
                'url': 'https:' + href,
                'img': 'https:' + src,
            })
        return articles
    
    
    if __name__ == '__main__':
        # Crawl pages 1..100 with one thread per page and time the whole crawl.
        start_time = time.time()
        threads = [Thread(target=spider, args=(i, )) for i in range(1, 101)]
        for t in threads:
            t.start()
        # Wait for every worker before stopping the clock. Without join() the
        # timer only covers thread start-up (the 0.17 s originally recorded),
        # not the time spent actually crawling the pages.
        for t in threads:
            t.join()
        print('多线程爬取100页共耗时', time.time() - start_time)

    线程池爬取汽车之家新闻页前100页

    import os
    import time
    import requests
    from concurrent.futures import ThreadPoolExecutor
    from multiprocessing import cpu_count
    from bs4 import BeautifulSoup
    
    base_dir = os.path.dirname(__file__)
    
    
    def spider(page):
        """Scrape one page of the Autohome news list (run as a thread-pool task).

        Args:
            page: Page number substituted into the news-list URL.

        Returns:
            A list of dicts with the keys ``title``, ``introduction``, ``url``
            and ``img``, one per complete article entry on the page. The list
            is empty when the article container is missing from the response.
        """
        # Without a timeout one stalled connection would occupy a pool worker forever.
        response = requests.get(url='https://www.autohome.com.cn/news/%s/#liststart' % page, timeout=10)
        # Original author's note: the page declares charset=gb2312, so the text
        # is decoded as gbk to avoid garbled Chinese characters.
        response.encoding = 'gbk'
        soup = BeautifulSoup(response.text, 'html.parser')
        container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
        articles = []
        if container is None:
            # Error page or changed layout: there is nothing to scrape.
            return articles
        for item in container.find_all(name='li'):
            title_tag = item.find(name='h3')
            if title_tag is None:
                continue
            # Prints the raw <h3> tag followed by its text content.
            print(title_tag, title_tag.text)
            intro_tag = item.find(name='p')
            link_tag = item.find(name='a')
            img_tag = item.find(name='img')
            # Skip entries that lack any of the remaining expected tags.
            if intro_tag is None or link_tag is None or img_tag is None:
                continue
            href = link_tag.get('href')
            src = img_tag.get('src')
            if not href or not src:
                continue
            # Only the image URL is collected; the image itself is not downloaded here.
            articles.append({
                'title': title_tag.text,
                'introduction': intro_tag.text,
                'url': 'https:' + href,
                'img': 'https:' + src,
            })
        return articles
    
    
    if __name__ == '__main__':
        # Crawl pages 1..100 through a thread pool and time the whole crawl.
        start_time = time.time()
        # Leaving the with-block calls shutdown(wait=True), so every submitted
        # page has finished before the elapsed time is printed.
        with ThreadPoolExecutor(cpu_count() * 5) as pool:
            futures = {pool.submit(spider, i): i for i in range(1, 101)}
        print('线程池爬取100页共耗时', time.time() - start_time)  # author's recorded run: 36.4789092540741
        # An exception raised inside a task is stored on its future; report it
        # here, otherwise failed pages would go unnoticed.
        for future, page in futures.items():
            error = future.exception()
            if error is not None:
                print('page %s failed: %r' % (page, error))
    

    进程池爬取汽车之家新闻页前100页

    import os
    import time
    import requests
    from concurrent.futures import ProcessPoolExecutor
    from multiprocessing import cpu_count
    from bs4 import BeautifulSoup
    
    base_dir = os.path.dirname(__file__)
    
    
    def spider(page):
        """Scrape one page of the Autohome news list (run as a process-pool task).

        Args:
            page: Page number substituted into the news-list URL.

        Returns:
            A list of dicts with the keys ``title``, ``introduction``, ``url``
            and ``img``, one per complete article entry on the page. The list
            is empty when the article container is missing from the response.
        """
        # Without a timeout one stalled connection would occupy a pool worker forever.
        response = requests.get(url='https://www.autohome.com.cn/news/%s/#liststart' % page, timeout=10)
        # Original author's note: the page declares charset=gb2312, so the text
        # is decoded as gbk to avoid garbled Chinese characters.
        response.encoding = 'gbk'
        soup = BeautifulSoup(response.text, 'html.parser')
        container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
        articles = []
        if container is None:
            # Error page or changed layout: there is nothing to scrape.
            return articles
        for item in container.find_all(name='li'):
            title_tag = item.find(name='h3')
            if title_tag is None:
                continue
            # Prints the raw <h3> tag followed by its text content.
            print(title_tag, title_tag.text)
            intro_tag = item.find(name='p')
            link_tag = item.find(name='a')
            img_tag = item.find(name='img')
            # Skip entries that lack any of the remaining expected tags.
            if intro_tag is None or link_tag is None or img_tag is None:
                continue
            href = link_tag.get('href')
            src = img_tag.get('src')
            if not href or not src:
                continue
            # Only the image URL is collected; the image itself is not downloaded here.
            articles.append({
                'title': title_tag.text,
                'introduction': intro_tag.text,
                'url': 'https:' + href,
                'img': 'https:' + src,
            })
        return articles
    
    
    if __name__ == '__main__':
        # Crawl pages 1..100 through a process pool and time the whole crawl.
        start_time = time.time()
        # Leaving the with-block calls shutdown(wait=True), so every submitted
        # page has finished before the elapsed time is printed.
        with ProcessPoolExecutor(cpu_count() * 2) as pool:
            futures = {pool.submit(spider, i): i for i in range(1, 101)}
        print('进程池爬取100页共耗时', time.time() - start_time)  # author's recorded run: 32.66965293884277
        # An exception raised inside a task is stored on its future; report it
        # here, otherwise failed pages would go unnoticed.
        for future, page in futures.items():
            error = future.exception()
            if error is not None:
                print('page %s failed: %r' % (page, error))

    进程池和线程池在合理的设置范围内爬取速度差别不大:爬虫属于 I/O 密集型任务,耗时主要花在等待网络响应上,因此线程池有时甚至会更快一些。上例最后打印的耗时差距可以忽略不计,而且结果还会受网速影响。

    混合爬取汽车之家多个频道的前100页

    import os
    import time
    import requests
    from concurrent.futures import ProcessPoolExecutor
    from multiprocessing import cpu_count
    from bs4 import BeautifulSoup
    
    base_dir = os.path.dirname(__file__)
    
    
    def spider(page):
        """Scrape one list page of a given Autohome channel.

        Args:
            page: Indexable pair where ``page[0]`` is the page number and
                ``page[1]`` is the channel path segment placed in the URL
                (the main block passes values such as ``'news'`` or ``'tech'``).

        Returns:
            A list of dicts with the keys ``title``, ``introduction``, ``url``
            and ``img``, one per complete article entry on the page. The list
            is empty when the article container is missing from the response.
        """
        # Without a timeout one stalled connection would occupy a pool worker forever.
        response = requests.get(
            url='https://www.autohome.com.cn/%s/%s/#liststart' % (page[1], page[0]),
            timeout=10,
        )
        # Original author's note: the page declares charset=gb2312, so the text
        # is decoded as gbk to avoid garbled Chinese characters.
        response.encoding = 'gbk'
        soup = BeautifulSoup(response.text, 'html.parser')
        container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
        articles = []
        if container is None:
            # Error page, changed layout, or a channel without this container.
            return articles
        for item in container.find_all(name='li'):
            title_tag = item.find(name='h3')
            if title_tag is None:
                continue
            # Prints the raw <h3> tag followed by its text content.
            print(title_tag, title_tag.text)
            intro_tag = item.find(name='p')
            link_tag = item.find(name='a')
            img_tag = item.find(name='img')
            # Skip entries that lack any of the remaining expected tags.
            if intro_tag is None or link_tag is None or img_tag is None:
                continue
            href = link_tag.get('href')
            src = img_tag.get('src')
            if not href or not src:
                continue
            # Only the image URL is collected; the image itself is not downloaded here.
            articles.append({
                'title': title_tag.text,
                'introduction': intro_tag.text,
                'url': 'https:' + href,
                'img': 'https:' + src,
            })
        return articles
    
    
    if __name__ == '__main__':
        # Crawl pages 1..100 of several channels through a process pool and time it.
        start_time = time.time()
        channels = ['news', 'advice', 'drive', 'use', 'culture', 'travels', 'tech', 'tuning', 'ev']
        # Leaving the with-block calls shutdown(wait=True), so every submitted
        # page has finished before the elapsed time is printed.
        with ProcessPoolExecutor(cpu_count() * 2) as pool:
            futures = {}
            for item in channels:
                for i in range(1, 101):
                    # spider() expects a (page_number, channel) pair.
                    futures[pool.submit(spider, (i, item))] = (i, item)
        print('共耗时', time.time() - start_time)  # author's recorded run: 418.42672753334045
        # An exception raised inside a task is stored on its future; report it
        # here, otherwise failed pages would go unnoticed.
        for future, (page_number, channel) in futures.items():
            error = future.exception()
            if error is not None:
                print('%s page %s failed: %r' % (channel, page_number, error))

  • 相关阅读:
    awk 使用shell 变量
    设计模式之 外观(门面)模式 Facade
    设计模式之 抽象工厂模式
    python 第一课
    Visual Basic 图片连接地址添加
    smarty 不同模板 缓存时间
    PHP 传参过滤
    Nginx 0.7.x + PHP 5.2.10(FastCGI)搭建支持高并发量的Web服务器
    linux vi 编辑命令
    PHP 命令模式 执行文件 并传递参数
  • 原文地址:https://www.cnblogs.com/xiaomage666/p/11732610.html
Copyright © 2011-2022 走看看