zoukankan      html  css  js  c++  java
  • Python图片爬虫

    1.今天给大家介绍自己写的一个图片爬虫,说白了就是从网页上自动下载需要的图片

    2.首先选取目标网站“涨姿势”(http://www.zhangzishi.cc/ ),如下图所示。我们的目标就是爬取该网站“福利社”栏目的所有美图

    3.福利社地址为 http://www.zhangzishi.cc/category/welfare 。获取图片,就是先获取网页中所有图片的url地址,再逐个下载,步骤如下:

    A.打开URL,获取html代码

    def url_open(url):
        """Fetch *url* and return the raw response body.

        A desktop-browser ``User-Agent`` header is attached to the request
        (presumably because the target site rejects urllib's default agent --
        not verifiable from this file).

        Args:
            url: Address to fetch; anything ``urllib.request.Request`` accepts.

        Returns:
            The undecoded response body as ``bytes``.

        Raises:
            urllib.error.URLError: Propagated from ``urlopen`` when the
                request fails.
        """
        req = urllib.request.Request(url)
        req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')
        # The context manager closes the response (and its connection) even
        # if read() raises; previously it stayed open until garbage-collected.
        with urllib.request.urlopen(req) as response:
            html = response.read()
        print('url_open')
        return html

    B.从html代码中摘取网页链接,返回的是一个列表

    def page_htmls(url,count):
        """Collect the ``.html`` links found on a listing page.

        The page is downloaded with ``url_open``, decoded as UTF-8 and scanned
        for ``a target="_blank" href=`` anchors.  For each anchor the text
        from just after the opening quote up to and including the first
        ``.html`` is taken as a link.

        Args:
            url: Listing page to scan.
            count: Maximum number of anchors to inspect.  Anchors that yield
                no link are counted too.  A value below 1 never matches the
                counter, so it means "no limit".

        Returns:
            List of link strings in page order.  Each one is also printed.
        """
        marker = 'a target="_blank" href='
        html = url_open(url).decode('utf-8')
        pages = []
        inspected = 0
        anchor = html.find(marker)
        while anchor != -1:
            inspected += 1
            # +1 skips the quote character that opens the href value.
            link_start = anchor + len(marker) + 1
            window_end = anchor + 200
            # Stop the '.html' search at the href's closing quote.  Without
            # this, an anchor whose own href has no '.html' would swallow
            # everything up to a '.html' belonging to a later tag.
            quote = html[link_start - 1:link_start]
            if quote in ('"', "'"):
                closing = html.find(quote, link_start, window_end)
                if closing != -1:
                    window_end = closing
            suffix = html.find('.html', link_start, window_end)
            if suffix != -1:
                pages.append(html[link_start:suffix + 5])
                resume = suffix
            else:
                resume = link_start
            anchor = html.find(marker, resume)
            if inspected == count:
                break
        for each in pages:
            print(each)
        return pages

    C.从每一个链接页中获取图片地址,我这里用了两种方法

    def find_imgs(url):
        """Collect ``.jpg`` image addresses from ``img src=`` tags on a page.

        The page is downloaded with ``url_open`` and decoded as UTF-8.  For
        each ``img src=`` occurrence, the text from just after the opening
        quote up to and including the first ``.jpg`` is taken as the address.
        Addresses that do not start with ``http://`` or ``https://`` get
        ``http:`` prepended (this suits protocol-relative ``//host/...``
        sources; presumably that is what the site serves -- TODO confirm).

        Args:
            url: Page to scan.

        Returns:
            List of image URL strings in page order.
        """
        marker = 'img src='
        html = url_open(url).decode('utf-8')
        imgs = []
        tag = html.find(marker)
        while tag != -1:
            # +1 skips the quote character that opens the src value.
            src_start = tag + len(marker) + 1
            window_end = tag + 100
            # Bound the '.jpg' search by the src's closing quote so a
            # non-jpg image cannot swallow markup up to a later '.jpg'.
            quote = html[src_start - 1:src_start]
            if quote in ('"', "'"):
                closing = html.find(quote, src_start, window_end)
                if closing != -1:
                    window_end = closing
            ext = html.find('.jpg', src_start, window_end)
            if ext != -1:
                src = html[src_start:ext + 4]
                # Prefix test rather than "contains 'http'": a path that
                # merely contains "http" somewhere still needs the scheme.
                if not src.startswith(('http://', 'https://')):
                    src = 'http:' + src
                imgs.append(src)
                resume = ext
            else:
                resume = src_start
            tag = html.find(marker, resume)
        return imgs
    
    
    
    
    def imgurl_get(url):
        """Collect ``.jpg`` addresses from images styled ``color: #555555;``.

        The page is downloaded with ``url_open`` and decoded as UTF-8.  For
        each ``color: #555555;" src=`` occurrence, the text from just after
        the opening quote up to and including the first ``.jpg`` is taken as
        the address.  Addresses that do not start with ``http://`` or
        ``https://`` get ``http:`` prepended (presumably the site uses
        protocol-relative ``//host/...`` sources -- TODO confirm).

        Args:
            url: Page to scan.

        Returns:
            List of image URL strings in page order.
        """
        marker = 'color: #555555;" src='
        html = url_open(url).decode('utf-8')
        imgurls = []
        hit = html.find(marker)
        while hit != -1:
            # +1 skips the quote character that opens the src value.
            src_start = hit + len(marker) + 1
            window_end = hit + 100
            # Bound the '.jpg' search by the src's closing quote so a
            # non-jpg image cannot swallow markup up to a later '.jpg'.
            quote = html[src_start - 1:src_start]
            if quote in ('"', "'"):
                closing = html.find(quote, src_start, window_end)
                if closing != -1:
                    window_end = closing
            ext = html.find('.jpg', src_start, window_end)
            if ext != -1:
                src = html[src_start:ext + 4]
                # Unconditionally prepending 'http:' would corrupt a source
                # that is already absolute ('http:http://...').
                if not src.startswith(('http://', 'https://')):
                    src = 'http:' + src
                imgurls.append(src)
                resume = ext
            else:
                resume = src_start
            hit = html.find(marker, resume)

        return imgurls

    D.根据图片url下载图片到文件

    def save_imgs(folder,imgs):
        """Download every URL in *imgs* into the current working directory.

        Each file is named after the last ``/``-separated segment of its URL
        and is overwritten if it already exists.

        Args:
            folder: Not used by this function; files go to the current
                working directory (the callers in this file ``os.chdir``
                into the target folder before calling).
            imgs: Iterable of image URL strings.
        """
        for img_url in imgs:
            filename = img_url.split('/')[-1]
            # Download before opening the file: if the request fails, no
            # empty (zero-byte) file is left behind.
            data = url_open(img_url)
            with open(filename,'wb') as f:
                f.write(data)
                
    def download_mm(folder=r'H:\xxoo2',page_count = 100,count = 100):
        """Crawl the paginated welfare listing and save every image found.

        Builds the listing URLs ``.../category/welfare/page/1`` through
        ``.../page/<count>``, collects article links from each with
        ``page_htmls``, extracts image URLs from each article with
        ``imgurl_get`` and stores them with ``save_imgs``.

        Side effect: the process working directory is changed to *folder*.

        Args:
            folder: Destination directory; created if missing.
            page_count: Passed to ``page_htmls`` as its anchor limit per
                listing page.
            count: Number of listing pages to visit.
        """
        main_url = 'http://www.zhangzishi.cc/category/welfare'
        main_urls = [main_url+'/page/'+str(i+1) for i in range(count)]
        # The default path is a raw string: in a normal literal '\x' starts a
        # hex escape, and '\xxo' is a SyntaxError in Python 3.
        # exist_ok lets the crawler be re-run into an existing folder.
        os.makedirs(folder, exist_ok=True)
        os.chdir(folder)
        for url in main_urls:
            htmls = page_htmls(url,page_count)
            for page in htmls:
                imgurls = imgurl_get(page)
                save_imgs(folder,imgurls)

    E.开始下载

    def download__img(folder=r'H:\xxoo',page_count=100):
        """Crawl the first welfare listing page and save every image found.

        Collects article links from the listing page with ``page_htmls``,
        extracts image URLs from each article with ``find_imgs`` and stores
        them with ``save_imgs``.

        Side effect: the process working directory is changed to *folder*.

        Args:
            folder: Destination directory; created if missing.
            page_count: Passed to ``page_htmls`` as its anchor limit.
        """
        main_url = 'http://www.zhangzishi.cc/category/welfare'
        # The default path is a raw string: in a normal literal '\x' starts a
        # hex escape, and '\xxo' is a SyntaxError in Python 3.
        # exist_ok lets the crawler be re-run into an existing folder.
        os.makedirs(folder, exist_ok=True)
        os.chdir(folder)
        htmls = page_htmls(main_url,page_count)
        for page in htmls:
            imgs_url = find_imgs(page)
            save_imgs(folder,imgs_url)
           
    if __name__ == '__main__':
        # Script entry point: run the paginated crawl.
        # download__img() is the single-listing-page alternative; it is not
        # invoked here.
        download_mm()

    F:下载结果

    顺便附上全部代码:

    import urllib.request
    import os
    
    def url_open(url):
        """Fetch *url* and return the raw response body.

        A desktop-browser ``User-Agent`` header is attached to the request
        (presumably because the target site rejects urllib's default agent --
        not verifiable from this file).

        Args:
            url: Address to fetch; anything ``urllib.request.Request`` accepts.

        Returns:
            The undecoded response body as ``bytes``.

        Raises:
            urllib.error.URLError: Propagated from ``urlopen`` when the
                request fails.
        """
        req = urllib.request.Request(url)
        req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')
        # The context manager closes the response (and its connection) even
        # if read() raises; previously it stayed open until garbage-collected.
        with urllib.request.urlopen(req) as response:
            html = response.read()
        print('url_open')
        return html
    
    def page_htmls(url,count):
        """Collect the ``.html`` links found on a listing page.

        The page is downloaded with ``url_open``, decoded as UTF-8 and scanned
        for ``a target="_blank" href=`` anchors.  For each anchor the text
        from just after the opening quote up to and including the first
        ``.html`` is taken as a link.

        Args:
            url: Listing page to scan.
            count: Maximum number of anchors to inspect.  Anchors that yield
                no link are counted too.  A value below 1 never matches the
                counter, so it means "no limit".

        Returns:
            List of link strings in page order.  Each one is also printed.
        """
        marker = 'a target="_blank" href='
        html = url_open(url).decode('utf-8')
        pages = []
        inspected = 0
        anchor = html.find(marker)
        while anchor != -1:
            inspected += 1
            # +1 skips the quote character that opens the href value.
            link_start = anchor + len(marker) + 1
            window_end = anchor + 200
            # Stop the '.html' search at the href's closing quote.  Without
            # this, an anchor whose own href has no '.html' would swallow
            # everything up to a '.html' belonging to a later tag.
            quote = html[link_start - 1:link_start]
            if quote in ('"', "'"):
                closing = html.find(quote, link_start, window_end)
                if closing != -1:
                    window_end = closing
            suffix = html.find('.html', link_start, window_end)
            if suffix != -1:
                pages.append(html[link_start:suffix + 5])
                resume = suffix
            else:
                resume = link_start
            anchor = html.find(marker, resume)
            if inspected == count:
                break
        for each in pages:
            print(each)
        return pages

    def find_imgs(url):
        """Collect ``.jpg`` image addresses from ``img src=`` tags on a page.

        The page is downloaded with ``url_open`` and decoded as UTF-8.  For
        each ``img src=`` occurrence, the text from just after the opening
        quote up to and including the first ``.jpg`` is taken as the address.
        Addresses that do not start with ``http://`` or ``https://`` get
        ``http:`` prepended (this suits protocol-relative ``//host/...``
        sources; presumably that is what the site serves -- TODO confirm).

        Args:
            url: Page to scan.

        Returns:
            List of image URL strings in page order.
        """
        marker = 'img src='
        html = url_open(url).decode('utf-8')
        imgs = []
        tag = html.find(marker)
        while tag != -1:
            # +1 skips the quote character that opens the src value.
            src_start = tag + len(marker) + 1
            window_end = tag + 100
            # Bound the '.jpg' search by the src's closing quote so a
            # non-jpg image cannot swallow markup up to a later '.jpg'.
            quote = html[src_start - 1:src_start]
            if quote in ('"', "'"):
                closing = html.find(quote, src_start, window_end)
                if closing != -1:
                    window_end = closing
            ext = html.find('.jpg', src_start, window_end)
            if ext != -1:
                src = html[src_start:ext + 4]
                # Prefix test rather than "contains 'http'": a path that
                # merely contains "http" somewhere still needs the scheme.
                if not src.startswith(('http://', 'https://')):
                    src = 'http:' + src
                imgs.append(src)
                resume = ext
            else:
                resume = src_start
            tag = html.find(marker, resume)
        return imgs
    
    
    
    
    def imgurl_get(url):
        """Collect ``.jpg`` addresses from images styled ``color: #555555;``.

        The page is downloaded with ``url_open`` and decoded as UTF-8.  For
        each ``color: #555555;" src=`` occurrence, the text from just after
        the opening quote up to and including the first ``.jpg`` is taken as
        the address.  Addresses that do not start with ``http://`` or
        ``https://`` get ``http:`` prepended (presumably the site uses
        protocol-relative ``//host/...`` sources -- TODO confirm).

        Args:
            url: Page to scan.

        Returns:
            List of image URL strings in page order.
        """
        marker = 'color: #555555;" src='
        html = url_open(url).decode('utf-8')
        imgurls = []
        hit = html.find(marker)
        while hit != -1:
            # +1 skips the quote character that opens the src value.
            src_start = hit + len(marker) + 1
            window_end = hit + 100
            # Bound the '.jpg' search by the src's closing quote so a
            # non-jpg image cannot swallow markup up to a later '.jpg'.
            quote = html[src_start - 1:src_start]
            if quote in ('"', "'"):
                closing = html.find(quote, src_start, window_end)
                if closing != -1:
                    window_end = closing
            ext = html.find('.jpg', src_start, window_end)
            if ext != -1:
                src = html[src_start:ext + 4]
                # Unconditionally prepending 'http:' would corrupt a source
                # that is already absolute ('http:http://...').
                if not src.startswith(('http://', 'https://')):
                    src = 'http:' + src
                imgurls.append(src)
                resume = ext
            else:
                resume = src_start
            hit = html.find(marker, resume)

        return imgurls
    
    def save_imgs(folder,imgs):
        """Download every URL in *imgs* into the current working directory.

        Each file is named after the last ``/``-separated segment of its URL
        and is overwritten if it already exists.

        Args:
            folder: Not used by this function; files go to the current
                working directory (the callers in this file ``os.chdir``
                into the target folder before calling).
            imgs: Iterable of image URL strings.
        """
        for img_url in imgs:
            filename = img_url.split('/')[-1]
            # Download before opening the file: if the request fails, no
            # empty (zero-byte) file is left behind.
            data = url_open(img_url)
            with open(filename,'wb') as f:
                f.write(data)
                
    def download_mm(folder=r'H:\xxoo2',page_count = 100,count = 100):
        """Crawl the paginated welfare listing and save every image found.

        Builds the listing URLs ``.../category/welfare/page/1`` through
        ``.../page/<count>``, collects article links from each with
        ``page_htmls``, extracts image URLs from each article with
        ``imgurl_get`` and stores them with ``save_imgs``.

        Side effect: the process working directory is changed to *folder*.

        Args:
            folder: Destination directory; created if missing.
            page_count: Passed to ``page_htmls`` as its anchor limit per
                listing page.
            count: Number of listing pages to visit.
        """
        main_url = 'http://www.zhangzishi.cc/category/welfare'
        main_urls = [main_url+'/page/'+str(i+1) for i in range(count)]
        # The default path is a raw string: in a normal literal '\x' starts a
        # hex escape, and '\xxo' is a SyntaxError in Python 3.
        # exist_ok lets the crawler be re-run into an existing folder.
        os.makedirs(folder, exist_ok=True)
        os.chdir(folder)
        for url in main_urls:
            htmls = page_htmls(url,page_count)
            for page in htmls:
                imgurls = imgurl_get(page)
                save_imgs(folder,imgurls)
    
    
            
    def download__img(folder=r'H:\xxoo',page_count=100):
        """Crawl the first welfare listing page and save every image found.

        Collects article links from the listing page with ``page_htmls``,
        extracts image URLs from each article with ``find_imgs`` and stores
        them with ``save_imgs``.

        Side effect: the process working directory is changed to *folder*.

        Args:
            folder: Destination directory; created if missing.
            page_count: Passed to ``page_htmls`` as its anchor limit.
        """
        main_url = 'http://www.zhangzishi.cc/category/welfare'
        # The default path is a raw string: in a normal literal '\x' starts a
        # hex escape, and '\xxo' is a SyntaxError in Python 3.
        # exist_ok lets the crawler be re-run into an existing folder.
        os.makedirs(folder, exist_ok=True)
        os.chdir(folder)
        htmls = page_htmls(main_url,page_count)
        for page in htmls:
            imgs_url = find_imgs(page)
            save_imgs(folder,imgs_url)
           
    if __name__ == '__main__':
        # Script entry point: run the paginated crawl.
        # download__img() is the single-listing-page alternative; it is not
        # invoked here.
        download_mm()
  • 相关阅读:
    玲珑学院-ACM比赛1014
    扩展欧几里得算法
    中国剩余定理(孙子定理)及实现----原理详解
    搞懂树状数组
    HDU3792---Twin Prime Conjecture(树状数组)
    树状数组 模板
    HDU1541--Stars(树状数组)
    HDU4046--Panda(树状数组)
    CCF-201604-1-折点计数
    CCF-201604-2-俄罗斯方块
  • 原文地址:https://www.cnblogs.com/huipengbo/p/8186223.html
Copyright © 2011-2022 走看看