  • Using a little crawler to be a qualified otaku: sure enough, even dying under the peony blossoms is a romantic way to go

    As soon as the new semester starts, the campus reeks of young romance again. Rather than weeping alone in the dorm, why not go online and look at girls who are even prettier than the ones outside!

    With that thought, I couldn't help opening http://jandan.net/ooxx. Gripping the Poké Ball in my hand: alright, today's girl catalogue starts from jandan.net!!! Charge!

    Open the page, and lock the target onto the last page.

    First take a look with F12.

    Found it. Go, little crawler, I choose you! Use Fetch Page Source!

    A direct hit: jandan.net handed over its source code.

    Good, keep up the momentum. Little crawler, parse it!

    A sneaky site: what we actually get back is the string [40], so we have to add a slice at the end, taking from the second character to the second-to-last, to pull out the page number itself.
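    For illustration, a minimal sketch of that slice, assuming the scraped text is exactly the string "[40]":

    # minimal sketch: the scraped text looks like "[40]", so slice the brackets off
    raw = "[40]"                # e.g. the text of the page-number element
    last_page = int(raw[1:-1])  # take from the 2nd character to the 2nd-to-last
    print(last_page)            # 40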

    Now seal off every escape route: pack all the page urls into one list, taking the starting address as base_url.
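    And a minimal sketch of packing the urls into a list from base_url; the page range here is only illustrative (the actual run further down only crawls pages 38 and 39):

    # minimal sketch: generate one url per page from base_url
    base_url = "http://jandan.net/ooxx/page-{}#comments"
    last_page = 40  # the value recovered by the slice above; illustrative
    urls = [base_url.format(i) for i in range(1, last_page + 1)]
    print(urls[0])   # http://jandan.net/ooxx/page-1#comments
    print(urls[-1])  # http://jandan.net/ooxx/page-40#comments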

    Hmph, let's see where you run now. Time to find out where all the girls are hiding, heh heh heh.

    Let's try the newly learned CSS selectors: right-click the element in devtools and copy its selector.


    That only got the image info for the first girl. Push a little harder and loosen the selector a bit.
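    A minimal sketch of that loosening; the markup and the copied selector here are made up for illustration, while a.view_img_link is the class the real code below relies on:

    from bs4 import BeautifulSoup

    # hypothetical markup, only to show why the copied selector is too narrow
    html = '''
    <ol class="commentlist">
      <li id="comment-1"><a class="view_img_link" href="//example/1.jpg">[view]</a></li>
      <li id="comment-2"><a class="view_img_link" href="//example/2.jpg">[view]</a></li>
    </ol>
    '''
    soup = BeautifulSoup(html, "html.parser")

    # a right-click "Copy selector" pins one exact node, so only the first link matches
    print(len(soup.select('#comment-1 > a.view_img_link')))  # 1

    # dropping the positional part and keeping only the class matches every link
    print(len(soup.select('a.view_img_link')))               # 2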

    Now they all came out, but why do they all look the same? Turns out the real addresses are wrapped up by js.

    The site has an anti-crawler mechanism, so we can't scrape the content directly; we can use selenium to fetch the data instead.

    from selenium import webdriver
    import requests
    from bs4 import BeautifulSoup

    path = "C:/Users/12958/Desktop/douban/"  # directory to save the images in
    urls = ["http://jandan.net/ooxx/page-{}#comments".format(i) for i in range(38, 40)]  # urls to crawl; here pages 38 and 39
    driver = webdriver.Chrome()
    img_url = []

    First, the usual basic setup.

    for url in urls:
        driver.get(url)
        # page source after the browser has rendered it
        data = driver.page_source
        # parse the page
        soup = BeautifulSoup(data, "lxml")
        # locate the image links
        images = soup.select("a.view_img_link")
        # print(images)
        for image in images:
            dynamic = image.get('href')
            if 'gif' in dynamic:  # skip gifs
                pass
            else:
                http_url = "http:" + dynamic
                img_url.append(http_url)
                # print("http:%s" % dynamic)

    # download outside the page loop so every image is fetched only once
    for j in img_url:
        r = requests.get(j)
        print('Downloading %s' % j)
        with open(path + j[-15:], 'wb') as jpg:
            jpg.write(r.content)

    Gif images are filtered out here.

    Let's run it and see.

    So many pretty girls! If you want to see them, type the code out yourself, hehe.

    OK, the complete code is below.

    from selenium import webdriver
    import requests
    from bs4 import BeautifulSoup

    path = "C:/Users/12958/Desktop/douban/"  # directory to save the images in
    urls = ["http://jandan.net/ooxx/page-{}#comments".format(i) for i in range(38, 40)]  # urls to crawl; here pages 38 and 39
    driver = webdriver.Chrome()
    img_url = []

    for url in urls:
        driver.get(url)
        # page source after the browser has rendered it
        data = driver.page_source
        # parse the page
        soup = BeautifulSoup(data, "lxml")
        # locate the image links
        images = soup.select("a.view_img_link")
        # print(images)
        for image in images:
            dynamic = image.get('href')
            if 'gif' in dynamic:  # skip gifs
                pass
            else:
                http_url = "http:" + dynamic
                img_url.append(http_url)
                # print("http:%s" % dynamic)

    driver.quit()  # close the browser once the pages have been collected

    # download outside the page loop so every image is fetched only once
    for j in img_url:
        r = requests.get(j)
        print('Downloading %s' % j)
        with open(path + j[-15:], 'wb') as jpg:
            jpg.write(r.content)

    Alright, you worked hard today too, little crawler. Come back and rest.

    I found something similar but more advanced online; that author just cracks the obfuscation directly. Impressive!

    import hashlib
    import base64
    from bs4 import BeautifulSoup
    import requests
    import re
    import random
    import shutil
    import os
    import time
    import queue
    import threading
    import math
    
    '''
    Decode an img-hash into an image url
    '''
    
    
    def parse(imgHash, constant):
    
        '''
        The block below is the original decoding routine; as of 2018-05-25 the site changed the scheme and it no longer works
        q = 4
        hashlib.md5()
        constant = md5(constant)
        o = md5(constant[0:16])
        n = md5(constant[16:32])
        l = imgHash[0:q]
        c = o + md5(o + l)
        imgHash = imgHash[q:]
        k = decode_base64(imgHash)
        h =list(range(256))
        b = list(range(256))
        for g in range(0,256):
            b[g] = ord(c[g % len(c)])
        f=0
        for g in range(0,256):
            f = (f+h[g]+b[g]) % 256
            tmp = h[g]
            h[g] = h[f]
            h[f] = tmp
    
        result = ""
        p=0
        f=0
        for g in range(0,len(k)):
            p = (p + 1) % 256;
            f = (f + h[p]) % 256
            tmp = h[p]
            h[p] = h[f]
            h[f] = tmp
            result += chr(k[g] ^ (h[(h[p] + h[f]) % 256]))
    
        result = result[26:]
        return result
        '''
        return decode_base64(imgHash).decode('utf8')  # current scheme: the img-hash is plain base64
    
    
    
    
    def md5(src):
        m = hashlib.md5()
        m.update(src.encode("utf8"))
        return m.hexdigest()
    
    
    def decode_base64(data):
        # pad to a multiple of 4 so b64decode does not reject a short string
        missing_padding = len(data) % 4
        if missing_padding:
            data += '=' * (4 - missing_padding)
        return base64.b64decode(data)
    
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }
    
    '''
    Page-scraping worker thread
    '''
    
    
    class Spider(threading.Thread):
    
        def __init__(self, pages, proxies, url_manager):
            threading.Thread.__init__(self)
            self.pages = pages
            self.proxies = proxies
            self.url_manager = url_manager
    
        def get_Page(self, page, proxies, url_manager):
            bs_page = BeautifulSoup(page, "lxml")
    
            '''
            Find the address of the js file so the constant can be extracted from it
            '''
            try:
                model = re.findall(r'.*<script\ssrc="//(cdn.jandan.net/static/min.*?)"></script>.*', page)
                jsfile_url = "http://" + model[len(model) - 1]  # the page may contain two such addresses; take the last match
            except Exception as e:
                print(e)
                return  # without the js file the constant cannot be recovered
            jsfile = requests.get(jsfile_url, headers=headers, proxies=proxies, timeout=3).text

            constant = re.search(r'.*remove\(\);var\sc=\w+\(e,"(\w+)".*', jsfile).group(1)
            '''
            Pass the constant and each img-hash to parse() to get the real image address
            '''
            for item in bs_page.select('.img-hash'):
                img_url = 'http:' + parse(item.text, constant)
                url_manager.addNewUrl(img_url)
    
        def run(self):
            for page in self.pages:
                self.get_Page(page, self.proxies, self.url_manager)
    
    
    '''
    Program entry point
    '''
    
    
    def main(amount):
        url_manager = UrlManager()
        proxies = {'http': ''}  # ip proxy support not added yet; the program already runs fine without it
    
        current_url = 'http://jandan.net/ooxx'  # url of the current page
        '''
        Fetch the pages, then scrape them with multiple threads
        '''
        pages = []  # source of every page waiting to be scraped
        try:
            for i in range(amount):
                current_page = requests.get(current_url, headers=headers).text  # source of the current page
                pages.append(current_page)
                current_url = 'http:' + re.search(r'.*Older\sComments"\shref="(.*?)"\sclass.*', current_page).group(
                    1)  # extract the url of the next (older) page
        except Exception as e:
            pass
    
        page_threads = []
        t_amount = 10 if len(pages) > 10 else len(pages)  # number of page-scraping threads
        for i in range(t_amount):
            t = Spider(pages[math.ceil(int((len(pages)) / t_amount) * i):math.ceil(int((len(pages)) / t_amount) * (i + 1))],
                       proxies, url_manager)
            page_threads.append(t)
        for t in page_threads:
            t.start()
        for t in page_threads:
            t.join()
    
        img_threads = []
        for i in range(10):  # a fixed pool of 10 threads for downloading images
            t = Download(url_manager)
            img_threads.append(t)
        for t in img_threads:
            t.start()
        for t in img_threads:
            t.join()
    
    
    L = threading.Lock()
    
    '''
    Image download worker thread
    '''
    
    
    class Download(threading.Thread):
        def __init__(self, url_manager):
            threading.Thread.__init__(self)
            self.url_manager = url_manager
            self.pic_headers = dict(headers)  # copy so the shared headers dict is not modified
            self.pic_headers['Host'] = 'wx3.sinaimg.cn'
    
        def download_Img(self, url):
            isGif = re.match(r'(.*\.sinaimg\.cn/)(\w+)(/.+\.gif)', url)
            if isGif:
                url = isGif.group(1) + 'large' + isGif.group(3)  # swap the size directory for 'large'

            extensionName = re.match(r'.*(\.\w+)', url).group(1)  # file extension of the image
    
            L.acquire()
            if not os.path.exists('img'):
                os.mkdir('img')
            # name the file by the current count of files in img/
            with open('img/' + str(len(os.listdir('./img'))) + extensionName, 'wb') as f:
                # headers['Host']='wx3.sinaimg.cn'
                f.write(requests.get(url, headers=self.pic_headers).content)
            L.release()
    
        def run(self):
            while not self.url_manager.isEmpty():
                imgUrl = self.url_manager.getNewUrl()
                self.download_Img(imgUrl)
                self.url_manager.addOldUrl(imgUrl)
    
    
    '''
    Url store: hands out new urls and records the ones already downloaded
    '''
    
    
    class UrlManager:
        def __init__(self):
            self.url_used = []
            self.url_target = queue.Queue()
            if os.path.exists('url.txt'):
                with open('url.txt', 'r') as f:
                    for eachline in f.readlines():
                        self.url_used.append(eachline.strip())
            else:
                open("url.txt", 'w')
    
        def getNewUrl(self):
            return self.url_target.get()
    
        def isEmpty(self):
            return self.url_target.empty()
    
        def addNewUrl(self, newUrl):
            if newUrl in self.url_used:
                pass
            else:
                self.url_target.put(newUrl)
    
        def addOldUrl(self, oldUrl):
            self.url_used.append(oldUrl)
            with open('url.txt', 'a') as f:
                f.write(oldUrl + '\n')
    
    
    if __name__ == '__main__':
        amount = input('Enter the number of pages to crawl (less than 100, counted from the front page), then press Enter: ')
        main(int(amount))  # crawl the images of the first `amount` pages, starting from the front page

    Tested and working: just enter the number of pages. Though it seems the newest pages can't be crawled; that still needs looking into.

    All of them pixelated, not a single one for you big pig trotters to see!!

  • Original post: https://www.cnblogs.com/xingnie/p/9585328.html