Scraping Toutiao keyword image galleries by analyzing Ajax requests

# Goal: scrape image galleries for a keyword on Toutiao (今日头条)
# Plan:
# 1. Analyze the target site
# 2. Construct the Ajax request, fetch the index page with requests, and parse the JSON response for the article URLs
# 3. Request each article URL, extract the image URLs and title with regex + BeautifulSoup, download the images, and save the record to a database (MongoDB this time)
# 4. Loop over the page offsets and crawl them with multiple processes

# Question 1: why do we need to construct the request?
# Take the first screen as an example. The URL actually requested behind it is:
# http://www.toutiao.com/search_content/?offset=20&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1
# The long tail of parameters is the request's 'settings': the keyword, how many results to load, and so on. They form a dict,
# and typing them into the URL by hand would be tedious, so we encode the dict into a query string and build the request URL from it.
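# As a minimal sketch, urlencode does exactly that encoding (values taken
# from the example URL above; key order follows the dict):
#   from urllib.parse import urlencode
#   urlencode({'offset': 20, 'format': 'json', 'keyword': '街拍'})
#   # -> 'offset=20&format=json&keyword=%E8%A1%97%E6%8B%8D'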
import os
from json import JSONDecodeError
from multiprocessing.pool import Pool

import requests
from urllib.parse import urlencode
import json
import pymongo

from bs4 import BeautifulSoup

from requests.exceptions import RequestException
import re
from config import *

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


def get_index_page(offset, keyword):
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 1
    }
    data = urlencode(data)
    url = 'http://www.toutiao.com/search_content/?' + data
    #print(url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except RequestException:
        print('Failed to request the index page!')
        return None


def parse_index_page(html):
    # json.dumps serializes a Python object to JSON text;
    # json.loads parses JSON text back into a Python object.
    # The index response is JSON text, so parse it into a dict first.
    if not html:
        return
    try:
        data = json.loads(html)
    except JSONDecodeError:
        return
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('article_url')
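
# A quick illustration of the two json calls mentioned above (toy values):
#   json.loads('{"data": [1, 2]}')  ->  {'data': [1, 2]}
#   json.dumps({'data': [1, 2]})    ->  '{"data": [1, 2]}'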

def get_detail_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except RequestException:
        return None


def save_to_mongo(result):
    # insert() is the legacy PyMongo call; insert_one() is its modern replacement.
    if db[MONGO_TABLE].insert(result):
        print('Saved to MongoDB:', result)
        return True
    else:
        return False


def parse_detail_page(html, url):
    soup = BeautifulSoup(html, 'lxml')
    title = soup.title.string
    # The gallery data sits in a JS assignment on the page: var gallery = {...};
    pattern = re.compile(r'var gallery = (.*?);', re.S)
    result = re.findall(pattern, html)
    if result:
        images = []
        for i in result:
            i = json.loads(i)
            j = i.get('sub_images')
            #print(j)
            for k in j:
                k = k.get('url')
                images.append(k)
            # A detail page carries a single gallery, so return after the first match.
            return {
                'title': title,
                'url': url,
                'images': images
            }


def download_image(result):
    image_list = result.get('images')
    image_title = result.get('title')
    print('Downloading: %s' % image_title)

    if image_title not in os.listdir(path='.'):
        os.mkdir(image_title)
        # Note: os.chdir is per process; an exception between the two chdir
        # calls would leave this worker stuck inside the gallery directory.
        os.chdir(image_title)
        for image in image_list:
            try:
                response = requests.get(image)
                if response.status_code == 200:
                    filename = image.split('/')[-1] + '.jpg'
                    with open(filename, 'wb') as f:
                        f.write(response.content)
                        print('Downloading: %s' % image)
                else:
                    return None
            except RequestException:
                return None
        os.chdir(os.pardir)  # go back up to the parent directory


def main(offset):
    html = get_index_page(offset, KEYWORDS)
    for url in parse_index_page(html):
        #print(url)
        html = get_detail_page(url)
        if html:
            result = parse_detail_page(html, url)
            if result:
                #print(result)
                save_to_mongo(result)
                download_image(result)


if __name__ == '__main__':
    groups = [i * 20 for i in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
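
Both scripts pull their settings in with "from config import *", but the config module itself never appears in the post. A minimal sketch of what it would have to define, inferred from the names the two scripts use (every value below is a placeholder):

    # config.py -- placeholder values, adjust as needed
    MONGO_URL = 'localhost'   # MongoDB host or connection string
    MONGO_DB = 'toutiao'      # database name
    MONGO_TABLE = 'toutiao'   # collection name
    KEYWORDS = '街拍'         # search keyword (the first script's name for it)
    KEYWORD = '街拍'          # search keyword (the second script's name for it)
    GROUP_START = 1           # first page group to crawl
    GROUP_END = 20            # last page group to crawl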
# For comparison, a veteran's version of the same crawler
import json
import os
from urllib.parse import urlencode
import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import re
from multiprocessing import Pool
from hashlib import md5
from json.decoder import JSONDecodeError
from config import *

client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


def get_page_index(offset, keyword):
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def download_image(url):
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except ConnectionError:
        return None


def save_image(content):
    # Name the file after the MD5 of its bytes: identical images collapse into
    # one path, so the exists() check below skips duplicates automatically.
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass


def get_page_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def parse_page_detail(html, url):
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    images_pattern = re.compile('var gallery = (.*?);', re.S)
    result = re.search(images_pattern, html)
    if result:
        data = json.loads(result.group(1))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images: download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }


def save_to_mongo(result):
    if db[MONGO_TABLE].insert(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    text = get_page_index(offset, KEYWORD)
    urls = parse_page_index(text)
    for url in urls:
        html = get_page_detail(url)
        result = parse_page_detail(html, url)
        if result: save_to_mongo(result)


if __name__ == '__main__':  # guard the pool so spawned workers don't re-run the module
    pool = Pool()
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    pool.map(main, groups)
    pool.close()
    pool.join()
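
One detail worth copying from the veteran's version: each image is saved under the MD5 hash of its own bytes rather than into a title-named folder, so the same picture fetched twice maps to the same filename and the os.path.exists check skips rewriting it. A minimal sketch of the idea (the byte string is a hypothetical stand-in for a downloaded image):

    from hashlib import md5

    content = b'example image bytes'          # hypothetical payload
    print(md5(content).hexdigest() + '.jpg')  # identical bytes always hash to the same name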
Original post: https://www.cnblogs.com/themost/p/6894411.html