zoukankan html css js c++ java

爬取汽车之家

依赖
简单爬取汽车之家新闻页首页
爬取新闻页前一百页
多线程爬取汽车之家新闻页前100页
线程池爬取汽车之家新闻页前100页
进程池爬取汽车之家新闻页前100页
混爬汽车之家好多页

依赖

爬取汽车之家用到了Python的两个库：

requests：模拟浏览器发送请求
BeautifulSoup4：解析爬取的数据

这两个库都不属于 Python 标准库，需要我们用 pip 手动安装：

pip install requests
pip install BeautifulSoup4

简单爬取汽车之家新闻页首页

import os
import requests
from bs4 import BeautifulSoup

base_dir = os.path.dirname(__file__)


def spider():
    '''Crawl the Autohome news front page and download every article's cover image.

    Fetches https://www.autohome.com.cn/news/, walks the ``<li>`` entries inside
    the ``auto-channel-lazyload-article`` container and, for each complete
    entry, saves the cover image into the ``img`` directory under ``base_dir``.

    Returns:
        list[dict]: One dict per downloaded article with the keys ``title``,
        ``introduction``, ``url``, ``img`` and ``file_path``. Empty when the
        article container is missing from the response.

    Raises:
        requests.RequestException: If a request fails or times out.
        OSError: If the image directory or an image file cannot be written.
    '''
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    response = requests.get(url='https://www.autohome.com.cn/news/', timeout=10)
    # The original author notes the page declares charset=gb2312; decoding as
    # GBK avoids garbled Chinese text.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    if container is None:
        # No article list in the response: nothing to do instead of crashing
        # with AttributeError on ``None.find_all``.
        return []

    # open(..., 'wb') does not create missing directories, so make sure the
    # target directory exists before the first image is written.
    img_dir = os.path.join(base_dir, 'img')
    os.makedirs(img_dir, exist_ok=True)

    articles = []
    for item in container.find_all(name='li'):
        title_tag = item.find(name='h3')
        if not title_tag:
            continue
        intro_tag = item.find(name='p')
        link_tag = item.find(name='a')
        img_tag = item.find(name='img')
        href = link_tag.get('href') if link_tag is not None else None
        src = img_tag.get('src') if img_tag is not None else None
        if intro_tag is None or not href or not src:
            # Skip incomplete entries the same way entries without <h3> are skipped.
            continue

        # The scheme is prepended to href/src, e.g. resulting in
        # https://www.autohome.com.cn/news/201902/930488.html#pvareaid=102624
        url = 'https:' + href
        img = 'https:' + src

        # The <img> tag only carries a link; a second request fetches the bytes.
        img_content = requests.get(url=img, timeout=10)
        img_name = img.rsplit('/', 1)[-1]
        file_path = os.path.join(img_dir, img_name)
        with open(file_path, 'wb') as f:
            f.write(img_content.content)

        articles.append({
            'title': title_tag.text,
            'introduction': intro_tag.text,
            'url': url,
            'img': img,
            'file_path': file_path,
        })
    return articles


if __name__ == '__main__':
    # Run the crawler only when this file is executed as a script, not when it is imported.
    spider()

爬取新闻页前一百页

import os
import time
import requests
from bs4 import BeautifulSoup

base_dir = os.path.dirname(__file__)


def spider(page):
    '''Crawl one page of the Autohome news list and return its articles.

    Args:
        page: Page number interpolated into the news list URL.

    Returns:
        list[dict]: One dict per complete article entry with the keys
        ``title``, ``introduction``, ``url`` and ``img``. Empty when the
        article container is missing from the response.

    Raises:
        requests.RequestException: If the request fails or times out.
    '''
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    response = requests.get(
        url='https://www.autohome.com.cn/news/%s/#liststart' % page,
        timeout=10,
    )
    # The original author notes the page declares charset=gb2312; decoding as
    # GBK avoids garbled Chinese text.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    if container is None:
        # No article list in the response: return nothing instead of crashing
        # with AttributeError on ``None.find_all``.
        return []

    articles = []
    for item in container.find_all(name='li'):
        title_tag = item.find(name='h3')
        if not title_tag:
            continue
        print(title_tag, title_tag.text)  # e.g. <h3>...</h3> followed by the plain title text

        intro_tag = item.find(name='p')
        link_tag = item.find(name='a')
        img_tag = item.find(name='img')
        href = link_tag.get('href') if link_tag is not None else None
        src = img_tag.get('src') if img_tag is not None else None
        if intro_tag is None or not href or not src:
            # Skip incomplete entries the same way entries without <h3> are skipped.
            continue

        # Image bytes are intentionally not downloaded here; only the link is kept.
        articles.append({
            'title': title_tag.text,
            'introduction': intro_tag.text,
            'url': 'https:' + href,
            'img': 'https:' + src,
        })
    return articles


if __name__ == '__main__':
    # Crawl pages 1..100 one after another and report the wall-clock duration.
    began_at = time.time()
    for offset in range(100):
        spider(offset + 1)
    elapsed = time.time() - began_at
    print('顺序爬取100页共耗时', elapsed)    # the author measured about 99.59 s

多线程爬取汽车之家新闻页前100页

import os
import time
import requests
from threading import Thread
from bs4 import BeautifulSoup

base_dir = os.path.dirname(__file__)


def spider(page):
    '''Crawl one page of the Autohome news list and return its articles.

    Args:
        page: Page number interpolated into the news list URL.

    Returns:
        list[dict]: One dict per complete article entry with the keys
        ``title``, ``introduction``, ``url`` and ``img``. Empty when the
        article container is missing from the response.

    Raises:
        requests.RequestException: If the request fails or times out.
    '''
    # A timeout keeps a worker thread from hanging forever on a stalled connection.
    response = requests.get(
        url='https://www.autohome.com.cn/news/%s/#liststart' % page,
        timeout=10,
    )
    # The original author notes the page declares charset=gb2312; decoding as
    # GBK avoids garbled Chinese text.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    if container is None:
        # No article list in the response: return nothing instead of crashing
        # with AttributeError on ``None.find_all``.
        return []

    articles = []
    for item in container.find_all(name='li'):
        title_tag = item.find(name='h3')
        if not title_tag:
            continue
        print(title_tag, title_tag.text)  # e.g. <h3>...</h3> followed by the plain title text

        intro_tag = item.find(name='p')
        link_tag = item.find(name='a')
        img_tag = item.find(name='img')
        href = link_tag.get('href') if link_tag is not None else None
        src = img_tag.get('src') if img_tag is not None else None
        if intro_tag is None or not href or not src:
            # Skip incomplete entries the same way entries without <h3> are skipped.
            continue

        # Image bytes are intentionally not downloaded here; only the link is kept.
        articles.append({
            'title': title_tag.text,
            'introduction': intro_tag.text,
            'url': 'https:' + href,
            'img': 'https:' + src,
        })
    return articles


if __name__ == '__main__':
    # Crawl pages 1..100 with one thread per page and report the wall-clock duration.
    start_time = time.time()
    threads = []
    for i in range(1, 101):
        t = Thread(target=spider, args=(i, ))
        t.start()
        threads.append(t)
    # Wait for every worker before stopping the clock. Without join() the timer
    # only measures how long it takes to *start* the threads (the ~0.17 s the
    # original reported), not how long the crawl actually takes.
    for t in threads:
        t.join()
    print('多线程爬取100页共耗时', time.time() - start_time)

线程池爬取汽车之家新闻页前100页

import os
import time
import requests
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count
from bs4 import BeautifulSoup

base_dir = os.path.dirname(__file__)


def spider(page):
    '''Crawl one page of the Autohome news list and return its articles.

    Args:
        page: Page number interpolated into the news list URL.

    Returns:
        list[dict]: One dict per complete article entry with the keys
        ``title``, ``introduction``, ``url`` and ``img``. Empty when the
        article container is missing from the response.

    Raises:
        requests.RequestException: If the request fails or times out.
    '''
    # A timeout keeps a pool worker from being blocked forever by a stalled connection.
    response = requests.get(
        url='https://www.autohome.com.cn/news/%s/#liststart' % page,
        timeout=10,
    )
    # The original author notes the page declares charset=gb2312; decoding as
    # GBK avoids garbled Chinese text.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    if container is None:
        # No article list in the response: return nothing instead of crashing
        # with AttributeError on ``None.find_all``.
        return []

    articles = []
    for item in container.find_all(name='li'):
        title_tag = item.find(name='h3')
        if not title_tag:
            continue
        print(title_tag, title_tag.text)  # e.g. <h3>...</h3> followed by the plain title text

        intro_tag = item.find(name='p')
        link_tag = item.find(name='a')
        img_tag = item.find(name='img')
        href = link_tag.get('href') if link_tag is not None else None
        src = img_tag.get('src') if img_tag is not None else None
        if intro_tag is None or not href or not src:
            # Skip incomplete entries the same way entries without <h3> are skipped.
            continue

        # Image bytes are intentionally not downloaded here; only the link is kept.
        articles.append({
            'title': title_tag.text,
            'introduction': intro_tag.text,
            'url': 'https:' + href,
            'img': 'https:' + src,
        })
    return articles


if __name__ == '__main__':
    # Crawl pages 1..100 on a thread pool and report the wall-clock duration.
    start_time = time.time()
    futures = {}
    # Leaving the ``with`` block waits for all submitted tasks, like
    # shutdown(wait=True), and also releases the pool if submitting raises.
    with ThreadPoolExecutor(cpu_count() * 5) as t:
        for i in range(1, 101):
            futures[t.submit(spider, i)] = i
    elapsed = time.time() - start_time
    # An exception raised inside a worker is stored on its Future; if nobody
    # looks at it the failure is lost, so report every failed page.
    for future, page in futures.items():
        error = future.exception()
        if error is not None:
            print('第%s页爬取失败: %r' % (page, error))
    print('线程池爬取100页共耗时', elapsed)  # the author measured about 36.48 s

进程池爬取汽车之家新闻页前100页

import os
import time
import requests
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import cpu_count
from bs4 import BeautifulSoup

base_dir = os.path.dirname(__file__)


def spider(page):
    '''Crawl one page of the Autohome news list and return its articles.

    Args:
        page: Page number interpolated into the news list URL.

    Returns:
        list[dict]: One dict per complete article entry with the keys
        ``title``, ``introduction``, ``url`` and ``img`` (plain strings).
        Empty when the article container is missing from the response.

    Raises:
        requests.RequestException: If the request fails or times out.
    '''
    # A timeout keeps a pool worker from being blocked forever by a stalled connection.
    response = requests.get(
        url='https://www.autohome.com.cn/news/%s/#liststart' % page,
        timeout=10,
    )
    # The original author notes the page declares charset=gb2312; decoding as
    # GBK avoids garbled Chinese text.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    if container is None:
        # No article list in the response: return nothing instead of crashing
        # with AttributeError on ``None.find_all``.
        return []

    articles = []
    for item in container.find_all(name='li'):
        title_tag = item.find(name='h3')
        if not title_tag:
            continue
        print(title_tag, title_tag.text)  # e.g. <h3>...</h3> followed by the plain title text

        intro_tag = item.find(name='p')
        link_tag = item.find(name='a')
        img_tag = item.find(name='img')
        href = link_tag.get('href') if link_tag is not None else None
        src = img_tag.get('src') if img_tag is not None else None
        if intro_tag is None or not href or not src:
            # Skip incomplete entries the same way entries without <h3> are skipped.
            continue

        # Image bytes are intentionally not downloaded here; only the link is kept.
        articles.append({
            'title': title_tag.text,
            'introduction': intro_tag.text,
            'url': 'https:' + href,
            'img': 'https:' + src,
        })
    return articles


if __name__ == '__main__':
    # Crawl pages 1..100 on a process pool and report the wall-clock duration.
    start_time = time.time()
    futures = {}
    # Leaving the ``with`` block waits for all submitted tasks, like
    # shutdown(wait=True), and also releases the pool if submitting raises.
    with ProcessPoolExecutor(cpu_count() * 2) as p:
        for i in range(1, 101):
            futures[p.submit(spider, i)] = i
    elapsed = time.time() - start_time
    # An exception raised inside a worker is stored on its Future; if nobody
    # looks at it the failure is lost, so report every failed page.
    for future, page in futures.items():
        error = future.exception()
        if error is not None:
            print('第%s页爬取失败: %r' % (page, error))
    print('进程池爬取100页共耗时', elapsed)  # the author measured about 32.67 s

爬虫属于 I/O 密集型任务，大部分时间都花在等待网络响应上，所以在合理的并发数设置下，进程池和线程池的爬取速度差别不大，有时线程池甚至更快一些。上例最后打印的耗时（线程池约 36.5 秒，进程池约 32.7 秒）差距可以忽略不计，并且会受网速影响。

混爬汽车之家好多页

import os
import time
import requests
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import cpu_count
from bs4 import BeautifulSoup

base_dir = os.path.dirname(__file__)


def spider(page):
    '''Crawl one list page of one Autohome channel and return its articles.

    Args:
        page: A ``(page_number, channel)`` pair; ``channel`` is the first URL
            path segment (for example ``'news'``) and ``page_number`` the second.

    Returns:
        list[dict]: One dict per complete article entry with the keys
        ``title``, ``introduction``, ``url`` and ``img`` (plain strings).
        Empty when the article container is missing from the response.

    Raises:
        requests.RequestException: If the request fails or times out.
    '''
    page_number, channel = page[0], page[1]
    # A timeout keeps a pool worker from being blocked forever by a stalled connection.
    response = requests.get(
        url='https://www.autohome.com.cn/%s/%s/#liststart' % (channel, page_number),
        timeout=10,
    )
    # The original author notes the page declares charset=gb2312; decoding as
    # GBK avoids garbled Chinese text.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    if container is None:
        # No article list in the response: return nothing instead of crashing
        # with AttributeError on ``None.find_all``.
        return []

    articles = []
    for item in container.find_all(name='li'):
        title_tag = item.find(name='h3')
        if not title_tag:
            continue
        print(title_tag, title_tag.text)  # e.g. <h3>...</h3> followed by the plain title text

        intro_tag = item.find(name='p')
        link_tag = item.find(name='a')
        img_tag = item.find(name='img')
        href = link_tag.get('href') if link_tag is not None else None
        src = img_tag.get('src') if img_tag is not None else None
        if intro_tag is None or not href or not src:
            # Skip incomplete entries the same way entries without <h3> are skipped.
            continue

        # Image bytes are intentionally not downloaded here; only the link is kept.
        articles.append({
            'title': title_tag.text,
            'introduction': intro_tag.text,
            'url': 'https:' + href,
            'img': 'https:' + src,
        })
    return articles


if __name__ == '__main__':
    # Crawl pages 1..100 of several channels on a process pool and report the
    # wall-clock duration.
    start_time = time.time()
    futures = {}
    # Leaving the ``with`` block waits for all submitted tasks, like
    # shutdown(wait=True), and also releases the pool if submitting raises.
    with ProcessPoolExecutor(cpu_count() * 2) as p:
        for item in ['news', 'advice', 'drive', 'use', 'culture', 'travels', 'tech', 'tuning', 'ev']:
            for i in range(1, 101):
                futures[p.submit(spider, (i, item))] = (item, i)
    elapsed = time.time() - start_time
    # An exception raised inside a worker is stored on its Future; if nobody
    # looks at it the failure is lost, so report every failed page.
    for future, (channel, page_number) in futures.items():
        error = future.exception()
        if error is not None:
            print('%s 第%s页爬取失败: %r' % (channel, page_number, error))
    print('共耗时', elapsed)  # the author measured about 418.43 s

查看全文

相关阅读:
/bin/bash^M: bad interpreter: No such file or dire
****LINUX命令（含GIT命令）个人总结
 创建和编辑 crontab 文件
 Linux下用于查看系统当前登录用户信息的4种方法
 linux下cat命令详解
 crontab 指定执行用户
 crontab定时运行git命令更新代码库
 ubuntu添加环境变量【原创】
ubuntu下设置环境变量的三种方法【转】
笔记三、apache搭建gitweb【转】

原文地址：https://www.cnblogs.com/xiaomage666/p/11732610.html