  • Final meizitu scraper code, full 2015-2019 run (2019-1-14)

    The full run took about 4 hours 20 minutes and produced 4.36 GB of files. It also writes a text file recording the codes of images that do not exist; setting that bookkeeping aside, the actual code is quite short, because the approach is simple brute force.

    # -*- coding: utf-8 -*-
    """
    Created on Mon Jan 14 18:23:10 2019
    
    @author: Administrator
    """
    
    import requests
    from bs4 import BeautifulSoup
    import re
    import time
    #import xlwt
    
    
    
    
    
    '''
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
                    
    sheet = book.add_sheet('test', cell_overwrite_ok=True)
                    
    sheet.write(0,0,link[-7:-4])
                    
    book.save(r'e:\test1.xls')
    '''
    
    #url="https://www.mzitu.com/169451"
    
    my_referer = r'https://www.mzitu.com/169451'
    
    
    
    # For scraping the image link out of the page; no longer needed, since the links can be generated directly
    '''
    r=requests.get(url,headers={'referer':my_referer})
    r.encoding=r.apparent_encoding
    html=r.content
    soup=BeautifulSoup(html,"html.parser")
    s=soup.select("div p a")[0].img["src"]
    '''
    
    # Download the images from the generated links
    
    
    
    
    #s=soup.select(".article-content")
    
    #type(s[0])
    #Out[18]: bs4.element.Tag
    
    #t=s[0].get_text()
    
    
    #f=open("d:/测试解析文档学习.html","w",encoding="utf-8")
    #f.write(str(s))
    '''
    a='https://i.meizitu.net/2019/01/13d'
    
    b="https://i.meizitu.net/2018/12/29d"
    
    c="https://i.meizitu.net/2017/01/01b"
    d="https://i.meizitu.net/2017/01/02b"
    ls=[]
    ls=[a,b,c,d]
    '''
    
    
    
    
    
    """
    p1=["0"+str(i) for i in range(1,10)]    #快速列表生成器
    p1.append("10","11","12")  
    """
    
    
     
    site="https://i.meizitu.net/"     #2018
    
    years=[site+str(i)+"/" for i in range(2015,2020)]    #产生这几年
    
    
    # (the 01-31 day list is generated inside nyr() below)
        
        
    #p2=[chr(i) for i in range(97,123)]          # generate a-z (found via a Baidu search)
    
    #        for j in p2:
    
    # Analysis shows most image URLs look like https://i.meizitu.net/2018/12/28a01.jpg ; the trailing letter is mostly a, b, or c, so limiting the letters would save some time
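    # For example (a sketch of the scheme above): the album published on 2018-12-28
    # under letter "a" gives "https://i.meizitu.net/2018/12/28" + "a" + "01" + ".jpg",
    # i.e. https://i.meizitu.net/2018/12/28a01.jpg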
    
    
    
    
    def nyr(year):
        p0=["0"+str(i) for i in range(1,10)]+[str(i) for i in range(10,13)]   # months 01-12
        p1=["0"+str(i) for i in range(1,10)]+[str(i) for i in range(10,32)]   # days 01-31
        
        url_day=[]
        for k in p0:
            for i in p1:
                url_day.append(year+k+'/'+i)     # URL prefix for one particular day
    
        return url_day
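    # e.g. nyr("https://i.meizitu.net/2018/")[:2] ->
    #   ['https://i.meizitu.net/2018/01/01', 'https://i.meizitu.net/2018/01/02']
    # note: every month is given 31 "days", so impossible dates such as 02/30 are
    # generated too; they simply 404 later and get skipped by downloud() below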
    # Sweeping a whole year like this is clumsy; crawling one specific month is more practical:
    """
    p12="https://i.meizitu.net/2018/10/"
    url_Nov=[]
    for k in p1:    
        for i in p2: 
             url_Nov.append(p12+k+i)
    
    
    """
    
    header = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
        "Referer": r'https://www.mzitu.com/169451'
    }
    
    
    
    #headers={'referer':my_referer}
    
    
    def downloud(url):
        #p2015=["m","p","s","t","u","w","x","y"]
        p2015=[chr(i) for i in range(97,123)]          # letters a-z
        for j in p2015:
            for i in range(1,60):
                if i<10:
                    link=url+j+"0"+str(i)+".jpg"       # pad to two digits: 3 -> 03
                else:
                    link=url+j+str(i)+".jpg"
    
                try:
                    r1=requests.get(link,timeout=0.1,headers=header)
                    r1.raise_for_status()              # a 404 (missing image) raises here
                    with open("f:/爬虫生成文件/2015-/"+link[-17:-4].replace("/","")+".jpg","wb") as f:
                        f.write(r1.content)
                except Exception:                      # a miss (or timeout) ends this letter
                    with open("f:/爬虫生成文件/爬虫字母.txt","a",encoding="utf-8") as k:
                        k.write(link[-7:-4]+",")       # log the code of the missing image
                    print("missing:","{:^10}".format(link))
                    break
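    # The log file ends up as comma-separated letter+number codes, e.g. "a01,b01,o01,"
    # (the first miss for each letter) -- these are the "codes of non-existent images"
    # mentioned at the top of the post.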
                
    
    def main():
        start_time=time.time()
        
        for j in range(len(years)):
            n=nyr(years[j])
            for i in range(len(n)):
                downloud(n[i])
        '''
        n=nyr(years[0])
        for i in range(len(n)):
            downloud(n[i])
        '''
        end_time=time.time()
        print("{:10}".format(end_time-start_time))
          
        
    main()
    
    
    
    
    
    
    
    """
    
    
    
    def main():
        for i in url_Nov[:200]:
            downloud(i)    
            
    """
    
    
    """
    
    对空文件测试
    x
    Out[107]: 'https://i.meizitu.net/2018/12/12o1'
    
    r1=requests.get(x,headers={'referer':my_referer})
    
    r1
    Out[109]: <Response [404]>
    
    
    明白了这里必须要加上try except ,,raisefor status的原因了
    
    
    
    url[-11:].replace("/","")
    """         
           
    
            
    # -*- coding: utf-8 -*-
    """
    Created on Mon Jan 14 18:23:10 2019
    @author: Administrator
    """
    
    import requests
    import time
    
    header = {          # module level, so downloud() below can actually see it
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
        "Referer": r'https://www.mzitu.com/169451'
    }
    
    
    def nyr(year):         # given a year's base URL, produce a URL prefix for every day of that year
        p0=["0"+str(i) for i in range(1,10)]+[str(i) for i in range(10,13)]   # months 01-12
        p1=["0"+str(i) for i in range(1,10)]+[str(i) for i in range(10,32)]   # days 01-31
        
        url_day=[]
        for k in p0:
            for i in p1:
                url_day.append(year+k+'/'+i)
        return url_day
    
    
    
    def downloud(url):        # url names one day; probe a00-a79, b00-b79, ..., z00-z79 under it
        p2=[chr(i) for i in range(97,123)]     # letters a-z
        for j in p2:                           # iterate over all 26 letters
            for i in range(1,80):  # probe up to 80 images (an album rarely exceeds 50); the large
                                   # bound costs nothing, since a miss breaks out immediately
                if i<10:                       # pad to two digits: 3 -> 03
                    link=url+j+"0"+str(i)+".jpg"
                else:
                    link=url+j+str(i)+".jpg"
    
                try:                           # download the image
                    r=requests.get(link,timeout=0.1,headers=header)
                    r.raise_for_status()       # a 404 (missing image) raises here
                    # link[-17:-4] is e.g. "2018/12/28a01"; dropping the slashes
                    # gives a flat filename such as 20181228a01.jpg
                    with open("f:/爬虫生成文件/2015-/"+link[-17:-4].replace("/","")+".jpg","wb") as f:
                        f.write(r.content)
                except Exception:
                    print("missing:","{:^10}".format(link))
                    break
                
    
    def main():
        site="https://i.meizitu.net/"
        years=[site+str(i)+"/" for i in range(2015,2020)]     # base URLs for 2015-2019
        start_time=time.time()
        for j in range(len(years)):        # loop over each year
            n=nyr(years[j])
            for i in range(len(n)):        # loop over every day of that year, starting from day 0
                downloud(n[i])
        end_time=time.time()
        print("{:10}".format(end_time-start_time))    # total elapsed time
    
    main()
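    # For a sense of why the immediate break on a miss matters, here is a rough upper
    # bound on the candidate URL space if nothing ever broke early (my own arithmetic,
    # not a figure from the original run):
    years_n, months_n, days_n, letters_n, numbers_n = 5, 12, 31, 26, 79
    print(years_n * months_n * days_n * letters_n * numbers_n)   # 3820440 candidate URLs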
    
    
    
    
    
    
    
    
            

  • Original article: https://www.cnblogs.com/xinqidian/p/10276391.html