zoukankan      html  css  js  c++  java
  • 分析Ajax抓取今日头条街拍美图

    spider.py

      1 # -*- coding:utf-8 -*-
      2 from urllib import urlencode
      3 import requests
      4 from requests.exceptions import RequestException
      5 import json
      6 import re
      7 import os
      8 from hashlib import md5
      9 from bs4 import BeautifulSoup
     10 import pymongo
     11 from multiprocessing import Pool
     12 from json.decoder import JSONDecoder
     13 from config import *
     14 
     # Module-level MongoDB handle shared by the functions below.
     # connect=False asks PyMongo to defer connecting until the first operation
     # (presumably so each Pool worker connects after it is forked -- confirm).
     client = pymongo.MongoClient(host=MONGO_URL, connect=False)
     db = client[MONGO_DB]
     17 
     def get_page_index(offset, keyword, timeout=10):
         """Fetch one page of search results as raw JSON text.

         Args:
             offset: Value sent as the ``offset`` query parameter.
             keyword: Search term sent as the ``keyword`` query parameter.
             timeout: Seconds to wait for the server. Optional; without a
                 timeout a stalled connection would block the worker forever.

         Returns:
             The response body when the status code is 200, otherwise ``None``
             (non-200 status, or any ``RequestException`` including timeouts).
         """
         data = {
             'offset': offset,
             'format': 'json',
             'keyword': keyword,
             'autoload': 'true',
             'count': '20',
             'cur_tab': 3,
         }
         url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
         try:
             response = requests.get(url, timeout=timeout)
         except RequestException:
             # Single-argument print(...) parses the same on Python 2 and 3.
             print(u'请求索引页失败 %s' % url)
             return None
         if response.status_code == 200:
             return response.text
         return None
     36 
     def parse_page_index(html):
         """Yield the article URLs listed in a search-index JSON response.

         Args:
             html: JSON text of the index response, or ``None`` / empty when
                 the request failed.

         Yields:
             Each non-empty ``article_url`` found in the items of the
             top-level ``data`` list.

         Nothing is yielded when the input is empty, is not valid JSON, is not
         a JSON object, or has a missing or null ``data`` entry.
         """
         if not html:
             return
         try:
             data = json.loads(html)
         except ValueError:
             # The body was not JSON (for example an HTML error page).
             print(u'解析索引页失败')
             return
         if not isinstance(data, dict):
             return
         # ``data`` may be present but null; treat that as "no items".
         for item in data.get('data') or []:
             if not isinstance(item, dict):
                 continue
             article_url = item.get('article_url')
             # Items without an article URL cannot be fetched, so skip them.
             if article_url:
                 yield article_url
     42 
     def get_page_detail(url, timeout=10):
         """Fetch an article detail page.

         Args:
             url: Address of the detail page.
             timeout: Seconds to wait for the server. Optional; without a
                 timeout a stalled connection would block the worker forever.

         Returns:
             The response body when the status code is 200, otherwise ``None``
             (non-200 status, or any ``RequestException`` including timeouts).
         """
         try:
             response = requests.get(url, timeout=timeout)
         except RequestException:
             # Single-argument print(...) parses the same on Python 2 and 3.
             print(u'请求详情页失败 %s' % url)
             return None
         if response.status_code == 200:
             return response.text
         return None
     52 
     def parse_page_detail(html, url):
         """Extract the title and gallery image URLs from a detail page.

         Every image URL found is also passed to ``download_image``.

         Args:
             html: Markup of the detail page.
             url: Address the page was fetched from; copied into the result.

         Returns:
             A dict with keys ``title``, ``url`` and ``images``, or ``None``
             when the page has no ``gallery: ...`` assignment, the captured
             text is not valid JSON, or it has no ``sub_images`` key.
         """
         soup = BeautifulSoup(html, 'lxml')
         title_tags = soup.select('title')
         # A page without <title> gets an empty title instead of an IndexError.
         title = title_tags[0].get_text() if title_tags else ''
         print(title)
         # Capture everything between "gallery: " and the first ",<newline>".
         # The "\n" escape must stay inside the literal; a raw line break
         # there is a syntax error.
         images_pattern = re.compile('gallery: (.*?),\n', re.S)
         result = re.search(images_pattern, html)
         if not result:
             return None
         try:
             data = json.loads(result.group(1))
         except ValueError:
             # The captured text was not a JSON document.
             return None
         if not isinstance(data, dict) or 'sub_images' not in data:
             return None
         sub_images = data.get('sub_images') or []
         images = [item.get('url') for item in sub_images]
         for image in images:
             download_image(image)
         return {
             'title': title,
             'url': url,
             'images': images,
         }
     70 
     def save_to_mongo(result):
         """Insert one parsed article into the ``MONGO_TABLE`` collection.

         Args:
             result: Document to store (a dict).

         Returns:
             ``True`` when the insert reported an id, ``False`` otherwise.
         """
         # insert_one replaces Collection.insert, which PyMongo deprecated in
         # 3.0 and removed in 4.0.
         if db[MONGO_TABLE].insert_one(result).inserted_id is not None:
             print(u'存储到MongoDB成功 %s' % (result,))
             return True
         return False
     76 
     def download_image(url, timeout=10):
         """Download one image and hand its bytes to ``save_image``.

         Args:
             url: Address of the image.
             timeout: Seconds to wait for the server. Optional; without a
                 timeout a stalled connection would block the worker forever.

         Returns:
             Always ``None``. The image is saved only when the status code
             is 200; request failures are reported and otherwise ignored.
         """
         # Single-argument print(...) parses the same on Python 2 and 3.
         print(u'正在下载 %s' % url)
         try:
             response = requests.get(url, timeout=timeout)
         except RequestException:
             print(u'请求图片失败 %s' % url)
             return None
         if response.status_code == 200:
             save_image(response.content)
         return None
     87 
     def save_image(content):
         """Write image bytes to ``<cwd>/<md5 of content>.jpg``.

         Naming the file after the MD5 digest of its bytes means identical
         images map to the same path; an existing file is left untouched.

         Args:
             content: Raw image bytes.
         """
         file_name = md5(content).hexdigest() + '.jpg'
         file_path = os.path.join(os.getcwd(), file_name)
         if not os.path.exists(file_path):
             # The with-block closes the file; no explicit close() is needed.
             with open(file_path, 'wb') as f:
                 f.write(content)
     94 
     def main(offset):
         """Crawl one page of search results and store every gallery found.

         Args:
             offset: Result offset passed to ``get_page_index``.
         """
         index_html = get_page_index(offset, KEYWORD)
         if not index_html:
             # The index request failed; there is nothing to parse.
             return
         for url in parse_page_index(index_html):
             if not url:
                 continue
             detail_html = get_page_detail(url)
             if not detail_html:
                 continue
             result = parse_page_detail(detail_html, url)
             if result:
                 save_to_mongo(result)
    102 
    if __name__ == '__main__':
        # One offset per result page; the step of 20 matches the 'count'
        # value sent by get_page_index.
        groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
        pool = Pool()
        try:
            pool.map(main, groups)
        finally:
            # Release the worker processes explicitly instead of relying on
            # interpreter shutdown.
            pool.close()
            pool.join()
    View Code

    config.py

    # -*- coding:utf-8 -*-
    # Settings read by spider.py through ``from config import *``.

    # MongoDB host, and the database / collection names (both 'toutiao').
    MONGO_URL = 'localhost'
    MONGO_DB = MONGO_TABLE = 'toutiao'

    # First and last page index (inclusive); spider.py multiplies each index
    # by 20 to get the request offset.
    GROUP_START, GROUP_END = 0, 20

    # Search keyword sent with every index request.
    KEYWORD = '街拍'
    View Code
  • 相关阅读:
    JavaScript原型链详解
    Js作用域与闭包
    tjs 在嵌套函数中this关键字引用head对象
    NodeJS stream 一:Buffer
    NodeJS Stream 二:什么是 Stream
    NodeJS Stream 三:readable
    NodeJS Stream 四:Writable
    VSS又一次出错了,神出鬼没的
    VS2005的关于母版页嵌套的一个小技巧
    【转】SQL Server数据库开发的二十一条军规
  • 原文地址:https://www.cnblogs.com/stonelovy/p/7644651.html
Copyright © 2011-2022 走看看