zoukankan      html  css  js  c++  java
  • python 爬虫煎蛋网

    import urllib.request
    import os
    from urllib import error
    import re
    import base64
    
    def url_open(url):
        """Download ``url`` and return the raw response body.

        The request is sent with a ``Mozilla/5.0`` ``User-Agent`` header.

        Args:
            url: Absolute URL to fetch.

        Returns:
            The response body as ``bytes``. When the request fails, the
            offending URL is printed and ``b""`` is returned, so callers can
            still ``.decode()`` the result or write it to a binary file.
        """
        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0')
        try:
            # The context manager closes the connection once the body is read.
            with urllib.request.urlopen(req) as response:
                return response.read()
        except error.URLError:
            # URLError is the base class of HTTPError, so this also covers
            # transport-level failures (DNS, refused connection, ...).
            print("有异常的url为:" + url)
            return b""
    
    def get_page(url):
        """Return the current comment-page number found at ``url``.

        The page is downloaded with ``url_open`` and the text between the
        ``current-comment-page`` marker and the next ``]`` is extracted.

        Args:
            url: URL of the listing page; an empty value is allowed.

        Returns:
            The page number as a string, or ``""`` when ``url`` is empty, the
            download fails, or the marker cannot be located.
        """
        if not url:
            return ""
        raw = url_open(url)
        if not raw:
            # Download failed; url_open has already reported the URL.
            return ""
        html = raw.decode('utf-8')
        marker_pos = html.find('current-comment-page')
        if marker_pos == -1:
            return ""
        # Skip the 20-character marker plus the 3 characters that separate it
        # from the number (presumably `">[` -- verify against the live markup).
        start = marker_pos + 23
        end = html.find(']', start)
        if end == -1:
            return ""
        return html[start:end]
    
    
    def find_image(url):
        """Collect the image URLs embedded in the page at ``url``.

        Each image address is stored base64-encoded inside a
        ``class="img-hash"`` span; the decoded value is prefixed with
        ``http:`` to form the returned URL.

        Args:
            url: URL of the listing page to scan.

        Returns:
            A list of image URLs (strings); empty when the download fails or
            the page contains no hashes.
        """
        raw = url_open(url)
        if not raw:
            return []
        html = raw.decode('utf-8')
        # Non-greedy capture: stop at the first closing </span> so that a line
        # holding several spans does not get swallowed into a single match.
        hash_pattern = re.compile(r'class="img-hash">(.+?)</span>')
        return [
            "http:" + base64.b64decode(encoded).decode('utf-8')
            for encoded in hash_pattern.findall(html)
        ]
    
    def save_image(image_addrs):
        """Download every image in ``image_addrs`` into the ``picture/`` folder.

        The file name is the last path segment of the image URL. Images whose
        download fails are skipped, so no empty files are left behind.

        Args:
            image_addrs: Iterable of image URLs.
        """
        # Make sure the target directory exists before writing into it.
        os.makedirs("picture", exist_ok=True)
        for each in image_addrs:
            # Fetch first so a failed download never creates a file.
            img = url_open(each)
            if not img:
                continue
            filename = each.split('/')[-1]
            with open("picture/" + filename, 'wb') as f:
                f.write(img)
    
    def download_girls(pages = 20):
        """Download the images from up to ``pages`` listing pages.

        The current page number is read from the landing page; the crawl then
        walks backwards starting from the page just before it, and stops early
        once page 1 has been processed.

        Args:
            pages: Maximum number of listing pages to process.

        Raises:
            ValueError: If the current page number cannot be parsed as an int.
        """
        url = 'http://jandan.net/ooxx/'
        page_num = int(get_page(url))
        for _ in range(pages):
            page_num -= 1
            if page_num < 1:
                # No earlier pages exist; avoid requesting page-0, page--1, ...
                break
            page_url = url + 'page-' + str(page_num) + '#comments'
            image_addrs = find_image(page_url)
            save_image(image_addrs)
    
    # Start the crawl only when this file is executed as a script.
    if __name__ == '__main__':
        download_girls()
    
    # Module-level statement outside the guard: it runs on every execution of
    # this module, including when the module is imported.
    print("执行结束")
  • 相关阅读:
    [转]zookeeper-端口说明
    ACM-ICPC(9/26)
    ACM-ICPC(9/25)
    Linux的文件权限与目录配置
    Uva 11468 AC自动机或运算
    Uva 11922 Splay
    HDU 6214 最小割边
    Uva 10559 消除方块
    HDU 6194 后缀数组
    Uva 11491 暴力贪心
  • 原文地址:https://www.cnblogs.com/dengnapianhuahai/p/10056938.html
Copyright © 2011-2022 走看看