zoukankan      html  css  js  c++  java
  • 花瓣网爬虫

    notes:

    1、 参考https://www.cnblogs.com/nan86150/p/4272452.html 提供的思路

    2、 思路:

      使用requests库得到起始地址https://huaban.com/favorite/beauty/

      第一次匹配网页中的   "pin_id":(\d*?), "user_id":\d*? 得到当前界面的所有pin_id

      通过id构造地址,进入网页下载高清图片

      进入网页后解析网页对信息筛选

       第一次筛选:  app\["page"\](.*?), "text_meta" 内容 #因为网页中id有重复,所以第一次筛选缩小范围

      第二次:提取第一次的内容,得到id     

      第二次提取其中的key

      通过key构造图片地址进行下载

      翻页:

      为满足异步加载,构造地址

      https://huaban.com/favorite/beauty/?ivhdm0s5&max=914688397&limit=20&wfl=1
      max后的值是页面最后一个id,所以每次页面下载完后就返回最后一个id,重新构造地址继续下载
    3、创建文件函数是通过当前时间来创建文件夹
    4、代码格式较混乱,欢迎交流
    5、下载500张左右会封锁ip,需要更换ip,暂时未设置更换


    
    
    6、下载文件

    import requests

    # Minimal example of downloading one file: fetch the raw bytes of a
    # single image and write them to s.jpg in the current directory.
    imgurl = 'http://img.ivsky.com/img/tupian/t/201411/01/xiaohuangren_tupian-007.jpg'
    imgdata = requests.get(imgurl).content
    with open('s.jpg', 'wb') as f:
        f.write(imgdata)
    #!/usr/bin/env python
    # encoding: utf-8
    import requests
    import re
    import os
    import time
    import urllib.request
    
    def createFile():
        """Create a timestamped download folder and return its path.

        The folder is ``./flower_beauty/<YYYY-mm-dd-HH-MM-SS>`` using the
        current local time. The result is also stored in the module-level
        ``path`` variable.

        Returns:
            str: Relative path of the folder that now exists on disk.
        """
        global path
        filetime = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
        path = './flower_beauty/' + filetime
        # makedirs creates the parent folder too and tolerates an existing
        # directory, replacing the two check-then-mkdir steps.
        os.makedirs(path, exist_ok=True)
        return path
    
    def downloadimg(key,path):
        """Download the ``_fw658`` variant of an image into the folder *path*.

        Args:
            key: Image key, either a string or the list/tuple of keys returned
                by ``get_img_key``. For a list/tuple the first key is used.
            path: Existing folder that receives ``<key>.jpg``.

        Returns:
            None. Nothing is downloaded when *key* is an empty list/tuple.
        """
        if isinstance(key, (list, tuple)):
            if not key:
                # No key was extracted for this pin; skip instead of
                # requesting a URL with an empty file name.
                return
            key = key[0]
        name = str(key)
        imgurl = 'https://hbimg.b0.upaiyun.com/' + name + '_fw658'
        urllib.request.urlretrieve(imgurl, path + '/' + name + '.jpg')
    def validname(title):
        """Return *title* with characters that are unsafe in file names removed.

        The removed characters are the slash, backslash, colon, asterisk,
        question mark, double quote, angle brackets and the pipe.

        Args:
            title: Candidate file name.

        Returns:
            str: *title* without any of the characters listed above.
        """
        # Raw string with an escaped backslash so the class really contains
        # the backslash character itself.
        return re.sub(r'[/\\:*?"<>|]', '', title)
    def get_img_key(id):
        """Fetch the pin page for *id* and return the image keys found in it.

        Args:
            id: Pin id; it is converted with ``str`` to build the page URL.

        Returns:
            list[str]: Every ``"key"`` value found inside the
            ``app["page"] ... "text_meta"`` section(s) of the page; empty when
            nothing matches.
        """
        url = 'https://huaban.com/pins/' + str(id) + '/'
        data = requests.get(url).text
        # First pass narrows the page to the app["page"] section. The
        # brackets are escaped so they match literally instead of forming a
        # character class.
        section_re = re.compile(r'app\["page"\](.*?), "text_meta"')
        key_re = re.compile('"key":"(.*?)"')
        keys = []
        # Second pass: search each matched section directly rather than the
        # repr of the whole match list.
        for section in section_re.findall(data):
            keys.extend(key_re.findall(section))
        return keys
    
        #对key构造地址
    #749a992fb659e939ebc4f5690da60a81fed54c405068e-Illc20_fw658
        # with open('./tempdata.txt','w',encoding='utf-8') as tempw:
        #     tempw.write(data)
        #     tempw.close()
        # with open('./tempdata.txt','r+',encoding='utf-8') as tempr:
        #     data1=tempr.read()
        #     picurl = picRE.findall(data1,re.S)
        #     print(picurl)
        #     tempr.close()
    # def get_img_id():
        # web_data=requests.get("https://huaban.com/favorite/beauty/")
    
    
    
    def get_id(beauty_url):
        """Download *beauty_url* and return the pin ids that appear in it.

        Args:
            beauty_url: URL of a listing page.

        Returns:
            list[str]: The digits captured after each ``"pin_id":`` that is
            followed by a ``"user_id"`` field, in page order (duplicates
            included).
        """
        web_data = requests.get(beauty_url)
        # \d matches digits; a bare "d" would only match the letter d.
        pin_re = re.compile(r'"pin_id":(\d*?), "user_id":\d*?,')
        return pin_re.findall(web_data.text)
    
    def unique_id(id_list):
        """Return the distinct ids of *id_list*, keeping first-seen order.

        Args:
            id_list: Iterable of hashable ids, possibly with duplicates.

        Returns:
            list: Each id once, ordered by its first occurrence.
        """
        # dict keys are unique and keep insertion order, so the result is
        # deterministic (a set would give a hash-dependent order).
        return list(dict.fromkeys(id_list))
    
    
    def get_next(beauty_url='https://huaban.com/favorite/beauty/'):
        """Download every image listed on one page and return the last pin id.

        Downloads go to the folder named by the module-level ``path``.

        Args:
            beauty_url: URL of the listing page to process.

        Returns:
            The last id found on the page (before de-duplication).

        Raises:
            IndexError: If no pin id was found on the page.
        """
        id_list = get_id(beauty_url)  # ids present on this page
        if not id_list:
            raise IndexError('no pin ids found at {}'.format(beauty_url))
        # Download each distinct id once.
        for count, pin_id in enumerate(unique_id(id_list), start=1):
            key = get_img_key(pin_id)  # id -> image key
            downloadimg(key, path)  # key + folder -> file on disk
            print("download {}".format(count))
        return id_list[-1]
    '''
    ['198368485', '918789198', '198369677', '198363750', '350010205',
    '670115744', '172796713', '898076056', '917512201', '313298032',
    '918586402', '918920381', '525798891', '313304628', '918079727',
    '564055088', '898132640', '918784856', '917512519', '212946705',
    '918894995', '303222393', '904194691', '918944339', '918955317',
    '217881447', '350010146', '898133035', '198375016', '344794002',
    '898133172', '855305051', '918917448', '918978911', '918780514',
    '917512027', '918957316', '898132699', '918079809', '296216360',
    '394382196', '218761604', '670115745', '918991904', '670115742',
    '918944392', '917393726', '388327469', '346550851', '918973592',
    '918901743', '917511868', '313303957', '898133006', '313303569',
    '299703741', '204765314', '917395347', '918956902', '918788348',
    '206411589', '918081274', '802643792', '918896254', '670115743',
    '670115740', '918081360', '296218662', '918994440', '918585230',
    '913841090', '823423111', '918915500', '346550562', '383143365',
    '898133053', '918979535', '670115748', '802683220', '670115749',
    '898132715', '898133143', '346550402', '670115751', '802643684',
    '313295155', '297719115', '917511136']
    https://huaban.com/favorite/beauty/?ivhdm0s5&max=914688397&limit=20&wfl=1
    '''
    if __name__ == '__main__':
        # Create the output folder; get_next() reads this module-level path.
        path = createFile()
        pages = 100
        try:
            # The first page uses the default URL; each later page is
            # requested with max=<last id of the previous page>.
            last_id = get_next()
            for _ in range(1, pages):
                beauty_url = 'https://huaban.com/favorite/beauty/?ivhdm0s5&max={}&limit=20&wfl=1'.format(last_id)
                last_id = get_next(beauty_url)
        except IndexError:
            # get_next() raises IndexError when a page contains no ids;
            # stop instead of ending with a traceback.
            print("no more ids found, stopping")
    

      

  • 相关阅读:
    Linux 线程间通信方式+进程通信方式 总结
    使用opencv第三方库的makefile文件示例
    rplidar SDK 二次开发---之获取目标信息(0.1)
    #include "Target_orientation.h"
    opencv —— 调用摄像头采集图像 VideoCapture capture(0);
    cmake 支持-lpthread
    ROS下sensor_msgs::ImagePtr到sensor_msgs::Image之间的转换
    JAVA 校验身份证号码工具类(支持15位和18位)
    python面向对象游戏练习:好人坏人手枪手榴弹
    python 私有属性的作用
  • 原文地址:https://www.cnblogs.com/maoxianfei/p/6062089.html
Copyright © 2011-2022 走看看