  • Analyzing Ajax requests to scrape Toutiao street-photography (街拍) image galleries (process pool, MongoDB, binary streams, regex, requests)

    The workflow is as follows:

    #1 Configure the MongoDB client library (pymongo)

    #2 Simulate the Ajax search request for the keyword (an example request URL is shown after this list)

    #3 Extract the article URLs from the JSON that the search request returns

    #4 Fetch each of those article URLs and scrape the image URLs on the page

    #5 Write the scraped data to MongoDB and download each image as a binary stream; identical files are detected via the MD5 hash of their content
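
    For example, with the configuration below, the first index request assembled in step #2 is:

    https://www.toutiao.com/search_content/?offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=3

    (keyword=%E8%A1%97%E6%8B%8D is the URL-encoded form of 街拍.)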

    1. The configuration file is as follows:

    config.py

    MONGO_URL = 'localhost:27017'   # MongoDB connection address
    MONGO_DB = 'toutiao'            # database name
    MONGO_TABLE = 'toutiao1'        # collection name

    GROUP_START = 0                 # first page group (offset = group * 20)
    GROUP_END = 19                  # last page group
    KEYWORD = '街拍'                # search keyword ("street photography")
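
    With these defaults the spider walks offsets 0, 20, 40, ..., 380, i.e. the first 20 result pages of 20 items each (see the group computation in the __main__ block below).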

    2. The spider code is as follows:

    import json
    import os
    import re

    from hashlib import md5             # MD5 is used to name (and deduplicate) image files
    from multiprocessing import Pool
    from urllib.parse import urlencode

    import pymongo
    import requests
    from requests.exceptions import RequestException
    from bs4 import BeautifulSoup as bs

    from config import *                 # MONGO_URL, MONGO_DB, MONGO_TABLE, GROUP_*, KEYWORD

    # MongoDB connection
    client = pymongo.MongoClient(MONGO_URL)
    db = client[MONGO_DB]

    # save one result document to MongoDB
    def save_to_mongo(result):
        if db[MONGO_TABLE].insert_one(result):   # insert() is deprecated in pymongo 3.x
            print('Saved to MongoDB:', result)
            return True
        return False

    # 01
    def get_page_index(offset, keyword):   # fetch the JSON index page of search results
        data = {
            'offset': offset,
            'format': 'json',
            'keyword': keyword,
            'autoload': 'true',
            'count': 20,
            'cur_tab': 3
        }
        url = 'https://www.toutiao.com/search_content/?' + urlencode(data)   # dict -> query string
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            print('Error requesting index page')
            return None

    # 02
    def parse_page_index(html):   # yield the article URLs found in the index-page JSON
        data = json.loads(html)   # JSON string -> Python dict
        if data and 'data' in data:
            for item in data.get('data'):
                if item.get('article_url'):   # skip entries without an article URL
                    yield item.get('article_url')

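    # For reference, the index response is assumed to look roughly like this
    # (abbreviated; the real payload carries many more fields per item):
    #   {"data": [{"article_url": "https://www.toutiao.com/a64..."}, ...], ...}
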
    # 03
    def get_page_detail(url):   # fetch an article (detail) page
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            print('Error requesting detail page', url)
            return None

    # 04
    def parse_page_detail(html, url):   # extract the title and image URLs from a detail page
        soup = bs(html, 'lxml')
        title = soup.title.string if soup.title else ''

        # raw string (r'') so the backslashes are not interpreted by Python;
        # the literal parentheses of JSON.parse(...) must be escaped in the pattern
        result = re.search(r'gallery:.*?parse\("(.*?)"\),', html, re.S)
        if not result:
            return None
        raw = re.sub(r'\\', '', result.group(1))   # strip the JavaScript string escaping
        try:
            data = json.loads(raw)
        except ValueError:   # some pages need their JSON repaired first
            result = re.search(r'("sub_images":\[.*?\]),"max', raw, re.S)
            if not result:
                return None
            data = json.loads('{' + result.group(1) + '}')

        if data and 'sub_images' in data:
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for img in images:
                download_img(img)
            return {
                'title': title,
                'url': url,
                'images': images
            }

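    # The regex above targets an inline script on the detail page that is
    # assumed to look like (abbreviated):
    #   gallery: JSON.parse("{\"count\":4,\"sub_images\":[{\"url\":\"...\"}],...}"),
    # Group 1 captures the escaped JSON string; stripping the backslashes turns
    # it into plain JSON for json.loads(). If that still fails, the except
    # branch rebuilds a minimal {"sub_images": [...]} object from the same string.
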
    # 05
    def download_img(url):   # download an image as a binary stream
        print('Downloading', url)
        try:
            response = requests.get(url)
            if response.status_code == 200:
                save_img(response.content)   # response.content holds the raw bytes
            return None
        except RequestException:
            print('Error requesting image', url)
            return None

    # 06
    def save_img(content):   # write the binary stream to disk
        os.makedirs('d:/123', exist_ok=True)   # make sure the target folder exists
        file_path = '{0}/{1}.{2}'.format('d:/123', md5(content).hexdigest(), 'jpg')
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(content)

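    # Naming each file after the MD5 digest of its bytes deduplicates for free:
    # identical image content always hashes to the same name, so the
    # os.path.exists() check skips images that were already downloaded.
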
    def main(offset):
        html = get_page_index(offset, KEYWORD)
        if not html:
            return
        for url in parse_page_index(html):
            detail = get_page_detail(url)
            if not detail:
                continue
            result = parse_page_detail(detail, url)
            if result:   # parse_page_detail returns None when no gallery is found
                save_to_mongo(result)

    if __name__ == '__main__':
        group = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
        pool = Pool()           # defaults to one worker process per CPU core
        pool.map(main, group)
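
    After a run, you can spot-check what was stored. A minimal sketch, assuming the config above and that at least one gallery was saved:

    import pymongo
    from config import MONGO_URL, MONGO_DB, MONGO_TABLE

    client = pymongo.MongoClient(MONGO_URL)
    doc = client[MONGO_DB][MONGO_TABLE].find_one()
    if doc:
        print(doc['title'], '->', len(doc['images']), 'images')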
  • Original article: https://www.cnblogs.com/ceshixuexi/p/7956799.html