  • Python: an AJAX-driven crawler for girl-picture galleries

    An AJAX crawler for http://www.doyo.cn/tu

    The listing page has no "next page" links; scrolling down triggers AJAX requests that fetch the next batch of albums as JSON, so the crawler queries that endpoint directly instead of scraping the listing HTML.
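
    You can watch the endpoint in the browser's network panel while scrolling: the page issues GET requests to http://www.doyo.cn/tu/getdata. The field layout below is an assumption inferred from the parsing code further down, not a documented API: each entry of the returned info array appears to carry the album title at index 0, the picture count at index 1, a cover URL at index 6, and the album number at index 7. A minimal probe under those assumptions:

    # Hypothetical probe of the AJAX endpoint; field meanings are inferred
    # from the parsing code below, not from any official documentation.
    import requests

    resp = requests.get(
        "http://www.doyo.cn/tu/getdata",
        params={"cate": "all", "tag": "all", "sort": "updatetime",
                "start": 0, "count": 25},  # start = offset, count = page size (assumed)
    )
    for entry in resp.json().get("info", [])[:3]:
        # entry[0] = title, entry[1] = picture count, entry[7] = album number (assumed)
        print(entry[0], entry[1], entry[7])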

    Code:

    # coding=utf-8
    import random
    import time
    from multiprocessing import Pool  # unused in this single-process version; see the parallel sketch below
    import requests
    from requests.exceptions import RequestException
    import json
    import os
    import re
    
    #    <img src="http://s2.doyo.cn/img/5a/0a/684c9e9e780a3000002e.jpg" id="big_picture">
    pattern = re.compile('<img src="(.*?)" id="big_picture" />', re.S)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/65.0.3325.146 Safari/537.36 "
    }
    data_url = 'http://www.doyo.cn/tu/getdata'
    root_url = 'http://www.doyo.cn/picture/{number}/{index}/view'
    # Create the download root. chdir must also run when 'mm' already exists,
    # so don't wrap both calls in a single bare try/except.
    os.makedirs('mm', exist_ok=True)
    os.chdir('mm')
    parent_path = os.getcwd()
    # Album metadata is written incrementally, one JSON object per line.
    j = open('picinfo.json', 'w', encoding='utf-8')
    
    
    def get_one_page(start):
        # The endpoint pages by offset: "start" is the offset into the result
        # list and "count" the page size. The original code passed the offset
        # as "count", which broke paging.
        params = {
            "cate": "all",
            "tag": "all",
            "sort": "updatetime",
            "start": start,
            "count": 25
        }
        try:
            response = requests.get(url=data_url, params=params)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            print("NetWork Error")
    
    
    def get_one_data(html):
        if not html:  # upstream network error: yield nothing
            return
        data = json.loads(html)
        if data and 'info' in data.keys():
            for info in data.get('info'):
                yield {
                    'title': info[0],
                    'count': info[1],
                    'pic': info[6],
                    'number': info[7]
                }
    
    
    def write_to_file(res):
        os.chdir(parent_path)
        j.write(json.dumps(res, ensure_ascii=False) + '\n')
    
    
    def get_img_urllist(res):
        # root_url = 'http://www.doyo.cn/picture/{number}/{index}/view'
        for index in range(1, int(res.get('count')) + 1):
            yield root_url.format(number=res.get('number'), index=index)
    
    
    def save_img(res):
        os.chdir(parent_path)
        # Make a per-album directory; strip characters that are invalid in
        # file names, and (as above) chdir even when the directory exists.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', res.get('title'))
        os.makedirs(safe_title, exist_ok=True)
        os.chdir(safe_title)
        url_list = get_img_urllist(res)
        for imag_url in url_list:
            url = get_img_url(imag_url)
            if not url:  # detail page failed or pattern not found
                continue
            print(url)
            try:
                filename = str(random.random())  # random name avoids collisions
                time.sleep(0.1)  # brief pause between downloads
                response = requests.get(url, headers=headers)
                with open(filename + '.jpg', 'wb') as f:
                    f.write(response.content)
            except RequestException:
                pass
    
    
    def get_html(url):
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.text
            else:
                return None
        except RequestException:
            print("connection error")
    
    
    def get_img_url(imag_url):
        html = get_html(imag_url)
        if not html:  # fetch failed
            return None
        res = re.search(pattern=pattern, string=html)
        if res:
            return res.group(1)
    
    
    def main(start):
        html = get_one_page(start)
        res = get_one_data(html)
        for each in res:
            write_to_file(each)
            save_img(each)
    
    
    if __name__ == '__main__':
        # Offsets 0, 25, 50, ...; range(0, 1) fetches only the first page of
        # 25 albums. Raise the upper bound to crawl more.
        start = [n * 25 for n in range(0, 1)]
        for i in start:
            main(i)
        j.close()
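
    The script imports multiprocessing.Pool but never uses it, so pages are fetched one after another. Below is a minimal sketch of a parallel variant, not the author's code: crawl_offset and the picinfo_<offset>.json shard files are hypothetical names, and for real use the module-level open() of picinfo.json would have to move under the __main__ guard, since a single shared file handle is not safe across processes.

    # Hypothetical parallel variant built on the otherwise unused Pool import.
    from multiprocessing import Pool

    def crawl_offset(offset):
        html = get_one_page(offset)
        # One shard per worker; a shared file handle would interleave writes.
        with open('picinfo_%d.json' % offset, 'w', encoding='utf-8') as out:
            for each in get_one_data(html):
                out.write(json.dumps(each, ensure_ascii=False) + '\n')
                save_img(each)

    if __name__ == '__main__':
        offsets = [n * 25 for n in range(0, 4)]  # first four pages, assumed page size 25
        with Pool(4) as pool:
            pool.map(crawl_offset, offsets)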

     

  • Original post: https://www.cnblogs.com/francischeng/p/9433505.html