  • Python: an AJAX-based scraper for girl-photo galleries

    AJAX scraper for http://www.doyo.cn/tu

    Scrolling down the page does not load a new URL; each scroll fires an AJAX (XHR) request for the next batch of galleries, and the scraper talks to that endpoint directly.
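
    You can confirm the endpoint by watching the browser's Network panel while scrolling. A minimal probe of it (a sketch; the parameters are the ones the full script below sends, and it assumes the site still serves this API):

    import requests

    # request the first 25 galleries, newest first
    params = {"cate": "all", "tag": "all", "sort": "updatetime", "start": 0, "count": 25}
    resp = requests.get("http://www.doyo.cn/tu/getdata", params=params)
    data = resp.json()
    # each item in data["info"] is a list: index 0 = title, 1 = picture count,
    # 6 = cover image URL, 7 = gallery number
    for info in data.get("info", [])[:3]:
        print(info[0], info[1])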

    The full script:

    # coding=utf-8
    import random
    import time
    import requests
    from requests.exceptions import RequestException
    import json
    import os
    import re
    
    #    <img src="http://s2.doyo.cn/img/5a/0a/684c9e9e780a3000002e.jpg" id="big_picture">
    # the tag above is not self-closed, so the pattern must not expect " />"
    pattern = re.compile('<img src="(.*?)" id="big_picture"', re.S)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/65.0.3325.146 Safari/537.36 "
    }
    data_url = 'http://www.doyo.cn/tu/getdata'
    root_url = 'http://www.doyo.cn/picture/{number}/{index}/view'
    # create the download directory if needed, then work inside it
    os.makedirs('mm', exist_ok=True)
    os.chdir('mm')
    parent_path = os.getcwd()
    # metadata log: one JSON object per line
    j = open('picinfo.json', 'w', encoding='utf-8')
    
    
    def get_one_page(start):
        # fetch one page of gallery metadata from the AJAX endpoint;
        # 'start' is the offset into the result list, 'count' the page size
        params = {
            "cate": "all",
            "tag": "all",
            "sort": "updatetime",
            "start": start,
            "count": 25
        }
        try:
            response = requests.get(url=data_url, params=params)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            print("network error")
            return None
    
    
    def get_one_data(html):
        # parse the JSON payload and yield one metadata dict per gallery
        if not html:
            return
        data = json.loads(html)
        if data and 'info' in data:
            for info in data.get('info'):
                yield {
                    'title': info[0],
                    'count': info[1],
                    'pic': info[6],
                    'number': info[7]
                }
    
    
    def write_to_file(res):
        # append one gallery's metadata as a JSON line
        os.chdir(parent_path)
        j.write(json.dumps(res, ensure_ascii=False) + '\n')
    
    
    def get_img_urllist(res):
        # yield the view-page URL of every picture in the gallery
        for index in range(1, int(res.get('count')) + 1):
            yield root_url.format(number=res.get('number'), index=index)
    
    
    def save_img(res):
        # download all pictures of one gallery into a folder named after its title
        os.chdir(parent_path)
        try:
            os.makedirs(res.get('title'), exist_ok=True)
            os.chdir(res.get('title'))
        except OSError:
            return
        for imag_url in get_img_urllist(res):
            url = get_img_url(imag_url)
            print(url)
            if not url:
                continue
            try:
                # the site exposes no file name, so a random one has to do
                filename = str(random.random())
                time.sleep(0.1)  # throttle a little between downloads
                response = requests.get(url, headers=headers)
                with open(filename + '.jpg', 'wb') as f:
                    f.write(response.content)
            except RequestException:
                pass
    
    
    def get_html(url):
        # fetch a page and return its HTML, or None on failure
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.text
            else:
                return None
        except RequestException:
            print("connection error")
    
    
    def get_img_url(imag_url):
        # extract the big-picture URL from one view page
        html = get_html(imag_url)
        if not html:
            return None
        res = re.search(pattern, html)
        if res:
            return res.group(1)
    
    
    def main(start):
        # fetch one page of metadata, record each gallery, then download its pictures
        html = get_one_page(start)
        res = get_one_data(html)
        for each in res:
            write_to_file(each)
            save_img(each)
    
    
    if __name__ == '__main__':
        # page offsets to fetch; widen the range to scrape more pages
        start = [n * 25 for n in range(0, 1)]
        for i in start:
            main(i)
        j.close()
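
    After a run, mm/picinfo.json holds one JSON object per line. A quick way to inspect it afterwards (a minimal sketch, run from the same directory as the scraper):

    import json

    # read back the JSON-lines metadata written by write_to_file()
    with open('mm/picinfo.json', encoding='utf-8') as f:
        for line in f:
            record = json.loads(line)
            print(record['number'], record['title'], record['count'])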

     

  • Original post: https://www.cnblogs.com/francischeng/p/9433505.html