zoukankan      html  css  js  c++  java
  • python学习笔记(12)--爬虫下载煎蛋网图片

    说明:

    1. 这个其实是在下载漫画之前写的,比那个稍微简单点,之前忘放到博客备份了。

    2. 不想说啥了,总结放到漫画那个里面吧!

     1 import urllib.request
     2 import re
     3 import os
     4 
     5 # http://jandan.net/ooxx/page-2381#comments
     6 # <span class="current-comment-page">[2381]</span>
     7 # <img src="//wx4.sinaimg.cn/orj360/797ccd21gy1fdcjecuo1jj20qo0usacj.jpg" style="max-width: 480px; max-height: 750px; background-color: rgb(246, 161, 181);">
     8 # <a href="//ww1.sinaimg.cn/large/6715afcfgw1ef4zrjdaswj20js0qotag.jpg" target="_blank" class="view_img_link">[查看原图]</a>
     9 url = "http://jandan.net/ooxx/page-2381#comments"
    10 headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0"}
    11 
    12 # 打开主网站url,获取整个html字符串
    13 req = urllib.request.Request(url=url,headers=headers)
    14 response = urllib.request.urlopen(req)
    15 html = response.read().decode("utf-8")
    16 # print(html[0:1000])
    17 # 分析html字符串,找出页数和图片地址
    18 page = html.find("current-comment-page")
    19 page = html[page+23:page+27]
    20 # print(page)
    21 htmlPages = ""
    22 for i in range(int(page)-10,int(page)):
    23     urlPage = "http://jandan.net/ooxx/page-"+str(i)+"#comments"
    24     reqPage = urllib.request.Request(url=urlPage,headers=headers)
    25     responsePage = urllib.request.urlopen(reqPage)
    26     htmlPages += responsePage.read().decode("utf-8")
    27 regImg = r"//[0-9a-z]+.sinaimg.cn/large/[0-9a-z]+.jpg"
    28 imgUrl = re.findall(regImg,htmlPages)
    29 # print(imgUrl)
    30 imgNum = len(imgUrl)
    31 # print(imgNum)
    32 # 创建文件夹
    33 os.mkdir("test")
    34 # 切换到这个文件夹
    35 os.chdir("test")
    36 
    37 
    38 # 打开每个图片地址,保存图片到本地
    39 for i in range(imgNum):
    40     req = urllib.request.Request(url="http:"+imgUrl[i],headers=headers)
    41     responseImg = urllib.request.urlopen(req)
    42     img = open(str(i)+".jpg","wb")
    43     img.write(responseImg.read())
    44     img.close

     小甲鱼源码(论坛里复制来的,其实是可以运行的,每个图片地址加上http:就可以了):

     1 import urllib.request
     2 import os
     3 import random
     4 # 煎蛋网已经禁用爬虫了,所以此程序无法运行
     5 def url_open(url):
     6     req = urllib.request.Request(url)
     7     req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36')
     8     # iplist = ['111.197.141.57:9797','116.228.236.219:8080','120.26.51.101:8118','113.222.80.216:3128','117.90.1.88:9000']
     9     # proxy_support = urllib.request.ProxyHandler({'http':random.choice(iplist)})
    10     # opener = urllib.request.build_opener(proxy_support)
    11     # urllib.request.install_opener(opener)
    12     response = urllib.request.urlopen(url)
    13     html = response.read()
    14     return html
    15 
    16 def get_page(url):
    17     html = url_open(url).decode('utf-8')
    18     a = html.find('current-comment-page') + 23
    19     b = html.find(']',a)
    20     return html[a:b]
    21 
    22 def find_imgs(url):
    23     html = url_open(url).decode('utf-8')
    24     img_addrs = []
    25     a = html.find('img src=')
    26     while a != -1:
    27         b = html.find('.jpg', a, a + 100)
    28 
    29         if b != -1:
    30             img_addrs.append(html[a+9:b+4])
    31             print('图片地址:'+html[a+9:b+4])
    32         else:
    33             b = a + 9
    34         a = html.find('img src=', b)
    35     return img_addrs
    36 def save_imgs(folder, img_addrs):
    37     for each in img_addrs:
    38         filename = each.split('/')[-1]
    39         with open(filename, 'wb') as f:
    40             img = url_open("http:"+each)
    41             f.write(img)
    42 
    43 def download_mm(folder = 'Xman', pages = 1):
    44     os.mkdir(folder)
    45     os.chdir(folder)
    46     url = "http://jandan.net/ooxx/"
    47     page_num = int(get_page(url))
    48     for i in range(pages):
    49         page_num -= i
    50         page_url = url + 'page-' + str(page_num) + '#comments'
    51         img_addrs = find_imgs(page_url)
    52         save_imgs(folder, img_addrs)
    53 if __name__ == '__main__':
    54     download_mm()
  • 相关阅读:
    Yahoo团队经验:网站性能优化的34条黄金法则
    SaltStack中状态间关系unless、onlyif、require、require_in、watch、watch_in
    kubectl 常用命令一
    Linux系统的限制
    DNS服务器搭建与配置
    Python对文件的操作
    SaltStack schedule功能
    saltstack的salt-api介绍
    SaltStack事件驱动 – event reactor
    SaltSack 中Job管理
  • 原文地址:https://www.cnblogs.com/Jacklovely/p/6513353.html
Copyright © 2011-2022 走看看