zoukankan      html  css  js  c++  java
  • 【爬虫】花瓣采集下载器

    做UI的朋友说花瓣访问不了, 但是个人采集还是能获得的。
    赶紧下载下来备份吧。

    花瓣采集下载器for windows

    提取码: muy1
    另外,接口可能会变动;本文代码于2019年1月29日验证有效。

    python3+requests

    # -*- encoding:utf-8 -*-
    '''
       author:thewindkee
    '''
    import requests
    import urllib
    import json
    import re
    import time
    import random
    # import queue
    import os
    import sys
    
    # Site entry URL and the per-user pin listing URL (%s is the user's urlname).
    INDEX_URL='http://login.meiwu.co/xx'
    USER_PIN_URL_FORMAT='http://login.meiwu.co/%s/pins/'
    # An image file key is appended to this prefix to form its download URL.
    DOWNLOAD_URL='http://img.hb.aicdn.com/'

    # Default request headers. Field names carry no trailing colon: the colon
    # is the separator on the wire, not part of the header name.
    HEADERS={
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
        "Accept":"application/json"
    }
    # Every collected pin: pin_id (as str) -> image file key.
    PIN_MAP={}
    # URLs whose download failed, and the file they are written to.
    FAILED_TO_DOWN=[]
    FAILED_TXT="fail.txt"
    # Cookies captured after logging in.
    LOGIN_COOKIES={}
    # Alphabet used to build the random query token.
    seed = "1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # File names of pictures already present in the download directory.
    EXISTED_PIC=[]
    PIC_POSTFIX='.jpg'
    DOWANLOAD_DIR='huaban'
    
    
    def login(email,password):
        """Log in and return the pin-list URL of the authenticated user.

        The response cookies are copied into LOGIN_COOKIES. When the request
        does not return an OK status, or no urlname can be extracted from the
        response body, a failure message is printed and the process exits.
        """
        LOGIN_URL = "http://login.meiwu.co/auth/"
        form = {'email':email,'password':password,'_ref':'loginPage'}
        resp = requests.post(LOGIN_URL, headers=HEADERS, data=form, timeout=10)
        if resp.status_code == requests.codes.ok:
            # Keep the session cookies for the later pin requests.
            LOGIN_COOKIES.update(resp.cookies.items())
            name = get_urlname(resp.text)
            if name:
                return USER_PIN_URL_FORMAT % name
        print("登陆失败")
        sys.exit(0)
    
    
    def build_url_for_test():
        """Seed a placeholder 'sid' cookie and return the pin-list URL for a placeholder user."""
        LOGIN_COOKIES.update(sid='xx')
        placeholder_user = 'xx'
        return USER_PIN_URL_FORMAT % placeholder_user
    
    def get_pin_max(content):
        """Return the first pin_id found in *content*, or None if there is none.

        The text is searched for a fragment such as ``"pin_id": 3466``; the
        digits are returned as a string.
        """
        # Raw string so that \d reaches the regex engine as the digit class.
        match = re.search(r'"pin_id": *(\d+)', content)
        if match:
            return match.group(1)
        return None
    
    def get_urlname(content):
        """Return the value of the first ``"urlname": "..."`` pair in *content*, or None."""
        # Stop at the closing quote (and at whitespace) so that compact JSON
        # such as "urlname":"abc","x":"y" yields only 'abc'.
        match = re.search(r'"urlname": *"([^"\s]+)"', content)
        if match:
            return match.group(1)
        return None
    
    def randomStr(len):
        """Return `len` characters sampled from `seed` without repetition, joined into one string."""
        picked = random.sample(seed, len)
        return ''.join(picked)
    
    
    def build_headers_for_pin():
        """Add the XMLHttpRequest/JSON marker headers to the shared HEADERS dict and return it."""
        HEADERS.update({
            'X-Requested-With':'XMLHttpRequest',
            'X-Request':'JSON',
        })
        return HEADERS
    
    
    def get_page_pins(user_pin_url,max):
        """Fetch one page of pins and record each of them in PIN_MAP.

        `max` is sent as the `max` query parameter (the paging cursor) together
        with a random token and a limit of 100.

        Returns the pin_id of the last pin on the page, to be passed as the
        next `max`. Returns None when the page contains no pins, when the
        response status is not OK, or when any error occurs (the error is
        printed).
        """
        try:
            page_url='%s?%s&max=%s&limit=100&wfl=1'%(user_pin_url,randomStr(8),max)
            print('收集下载信息 url:%s'%page_url)
            r=requests.get(page_url, headers=HEADERS,cookies=LOGIN_COOKIES,timeout=30)
            r.encoding='UTF-8'
            if r.status_code != requests.codes.ok:
                return None
            # r.text is already a decoded str; json.loads() takes no `encoding`
            # argument on Python 3.9+ (passing one raises TypeError).
            d = json.loads(r.text)
            pins = d['user']['pins']
            if not pins:
                return None
            last_pin_id = max
            for pinItem in pins:
                last_pin_id=pinItem['pin_id']
                PIN_MAP[str(last_pin_id)]=pinItem['file']['key']
            return last_pin_id
        except Exception as e:
            print(str(e))
            return None
    
    def save_pin_map(data):
        """Write the string form of *data* to all.txt in the current directory, replacing its contents."""
        with open("all.txt",'w') as out:
            text = str(data)
            out.write(text)
    
    def download(url,name):
        """Download *url* and save the response body to the file *name*.

        Any failure (network error, timeout, HTTP error status, file error) is
        printed and the URL is appended to FAILED_TO_DOWN; nothing is raised.
        """
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            r=requests.get(url, headers=HEADERS, timeout=30)
            # Treat 4xx/5xx as failures instead of saving an error page as an image.
            r.raise_for_status()
            with open(name,'wb') as f:
                f.write(r.content)
        except Exception as e:
            print(str(e))
            FAILED_TO_DOWN.append(url)
    
    def download_all(total):
        """Download every pin in PIN_MAP that is not already on disk.

        *total* is only used for the progress message. After each actual
        download the function sleeps for half a second.
        """
        for index,(pin_id,key) in enumerate(PIN_MAP.items(), start=1):
            url=DOWNLOAD_URL+key
            print('%d/%d 下载 %s'%(index,total,url))
            if downloaded(pin_id):
                continue
            download(url,pin_id+PIC_POSTFIX)
            time.sleep(0.5)
    
    
    def downloaded(pin_id):
        """Return True (and print a notice) if the picture for *pin_id* is listed in EXISTED_PIC."""
        file_name=pin_id+PIC_POSTFIX
        already_present = file_name in EXISTED_PIC
        if already_present:
            print('\t%s已经存在'%file_name)
        return already_present
    
    def save_failed_to_down_url():
        """Write the failed download URLs, comma-separated, to the FAILED_TXT file."""
        with open(FAILED_TXT,'w') as out:
            joined = ",".join(FAILED_TO_DOWN)
            out.write(joined)
    
    
    def prepare_pic_dir(DOWANLOAD_PATH):
        """Create the download directory if it is missing, then make it the current directory."""
        missing = not os.path.exists(DOWANLOAD_PATH)
        if missing:
            print("创建下载目录:%s"%DOWANLOAD_PATH)
            os.makedirs(DOWANLOAD_PATH)
        # Later file writes use bare file names and therefore land in this directory.
        os.chdir(DOWANLOAD_PATH)
    
    def get_first_max(user_pin_url_index):
        """Return the initial paging cursor: the first pin_id on the user's pin page, plus one.

        Raises:
            ValueError: if no pin_id can be found in the response body.
        """
        r=requests.get(user_pin_url_index, headers=HEADERS,cookies=LOGIN_COOKIES,timeout=30)
        newest = get_pin_max(r.text)
        if newest is None:
            # Fail with a readable message instead of int(None)'s TypeError.
            raise ValueError('未能从页面中获取pin_id,请检查登录状态或接口是否变动')
        # One past the found id, so that the pin itself is included in the first page.
        return int(newest)+1
    
    
    def main():
        """Interactive entry point: log in, collect all pins, then download them.

        Steps: prompt for the account and password, log in, page through the
        user's pins until a page yields no cursor, switch into the download
        directory, write the pin map to disk, download the pictures that are
        not present yet, and finally write the list of failed URLs.
        Any exception is printed rather than propagated.
        """
        try:
            DOWANLOAD_PATH=(os.getcwd()+os.sep+DOWANLOAD_DIR).strip()
            print("下载花瓣采集到文件夹:%s "%DOWANLOAD_PATH)
            # The prompts end with an escaped newline so the answer is typed on its own line.
            EMAIL=input('请输入账号\n')
            PASSWORD=input('请输入密码\n')
            USER_PIN_URL=login(EMAIL,PASSWORD)
            MAX=get_first_max(USER_PIN_URL)
            print(MAX)
            build_headers_for_pin()
            # Page through the pins; get_page_pins returns None when there is nothing more.
            while(True):
                 MAX=get_page_pins(USER_PIN_URL,MAX)
                 time.sleep(0.5)
                 if not MAX:
                    break
            prepare_pic_dir(DOWANLOAD_PATH)
            build_existed_pic(DOWANLOAD_PATH)
            save_pin_map(PIN_MAP)
            total=len(PIN_MAP)
            print('总共:%d张,开始下载!'%total)
            download_all(total)
            if FAILED_TO_DOWN:
                print("%s张下载失败,查看%s文件"%(len(FAILED_TO_DOWN),FAILED_TXT))
            else:
                print("下载完成!")
            print('图片下载目录:%s'%DOWANLOAD_PATH)
            save_failed_to_down_url()
        except Exception as e:
            print(str(e))
    
    def build_existed_pic(DOWANLOAD_PATH):
        """Append to EXISTED_PIC the name of every picture file found in DOWANLOAD_PATH.

        Only regular files whose name ends with PIC_POSTFIX are recorded.
        """
        for file in os.listdir(DOWANLOAD_PATH):
            # listdir() yields bare names; join with the directory so the check
            # does not depend on the current working directory.
            full_path = os.path.join(DOWANLOAD_PATH, file)
            if os.path.isfile(full_path) and file.endswith(PIC_POSTFIX):
                EXISTED_PIC.append(file)
    
    
    # Script entry point: run main() only when executed directly.
    if __name__=='__main__':
        try:
            main()
        finally:
            # Always wait for input before exiting, even after an error or
            # sys.exit(); presumably this keeps the console window open.
            input("任意键退出")
    

    效果如图:
    在这里插入图片描述

  • 相关阅读:
    关于For循环的性能
    CLR读书笔记
    轻量级自动化测试框架介绍
    loadrunner中如何将MD5加密的值转换为大写
    LoadRunner 中实现MD5加密
    新安装的soapui启动时报错及解决方法
    单元测试之驱动模块和桩模块的作用和区别
    接口自动化(Python)-利用正则表达式从返回的HTML文本中截取自己想要的值
    LoadRunner性能测试-loadrunner事务
    LoadRunner性能测试-loadrunner工具破解
  • 原文地址:https://www.cnblogs.com/thewindkee/p/12873155.html
Copyright © 2011-2022 走看看