zoukankan      html  css  js  c++  java
  • 一个爬虫的练习(妹子图)

    一个爬虫项目(抓妹子的图)

    图片的 url 经过了 base64 编码,这里用 base64 解码;页面引用的 js 文件里自带一个 token(常量),用来解开图片的 url 地址。

    话不多说 看源码:

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import hashlib
    import base64
    from bs4 import BeautifulSoup
    import requests
    import re
    import os
    import queue
    import threading
    import math
    from multiprocessing import Pool
    import sys
    # Replace the process-wide stderr stream with None so that output normally
    # written there (e.g. tracebacks) is not shown while the crawler runs.
    sys.stderr = None
    '''
    url解码
    '''
    
    
    def parse(imgHash, constant):
        """Turn a base64 image hash into the text it encodes.

        ``constant`` is accepted for the caller's benefit but is not used by
        this implementation.
        """
        raw_bytes = decode_base64(imgHash)
        return raw_bytes.decode('utf8')
    
    def md5(src):
        """Return the hexadecimal MD5 digest of the UTF-8 encoding of ``src``."""
        return hashlib.md5(src.encode("utf8")).hexdigest()
    
    
    def decode_base64(data):
        """Decode base64 text whose trailing ``=`` padding may have been stripped.

        Args:
            data: base64 payload as ``str`` or ``bytes``, with or without padding.

        Returns:
            The decoded ``bytes``.

        Raises:
            binascii.Error: if the payload is not valid base64 even after padding.
        """
        # ``-len % 4`` is 0 for input that is already aligned; the previous
        # ``4 - len % 4`` was never 0 and appended four spurious pad characters.
        missing_padding = -len(data) % 4
        if missing_padding:
            pad_char = b'=' if isinstance(data, (bytes, bytearray)) else '='
            data = data + pad_char * missing_padding
        return base64.b64decode(data)
    
    
    # Default HTTP headers sent with requests; presents a desktop Chrome User-Agent.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/62.0.3202.94 Safari/537.36'
        ),
    }
    
    '''
    页面抓取类
    '''
    
    
    class Spider(threading.Thread):
        """Worker thread that extracts image URLs from already-downloaded pages.

        For every page (HTML text) the thread locates the site's minified JS
        file, reads a constant out of it, and passes each ``.img-hash`` element
        through ``parse`` to obtain an image URL, which is queued on the URL
        manager.
        """

        # Regex escapes (\s, \w, \(, \.) restored; without them the constant
        # pattern has an unbalanced "(" and re.search raises re.error.
        _JS_URL_RE = re.compile(
            r'.*<script\ssrc="//(cdn\.jandan\.net/static/min.*?)"></script>.*')
        _CONSTANT_RE = re.compile(r'.*remove\(\);var\sc=\w+\(e,"(\w+)".*')

        def __init__(self, pages, proxies, url_manager):
            """Store the pages to process, the proxy mapping and the URL manager."""
            super().__init__()
            self.pages = pages
            self.proxies = proxies
            self.url_manager = url_manager

        def get_Page(self, page, proxies, url_manager):
            """Queue every image URL found in one page of HTML.

            Args:
                page: HTML source of a list page.
                proxies: proxy mapping forwarded to ``requests.get``.
                url_manager: object whose ``addNewUrl`` receives each image URL.

            The page is skipped (with a message on stdout) when the JS file or
            the constant inside it cannot be obtained.
            """
            # Locate the JS file that carries the constant.
            script_paths = self._JS_URL_RE.findall(page)
            if not script_paths:
                print('Spider: no script URL found in page, skipping it')
                return
            # The page may reference two such files; use the last match.
            jsfile_url = "http://" + script_paths[-1]

            try:
                jsfile = requests.get(jsfile_url, headers=headers, proxies=proxies, timeout=3).text
            except requests.RequestException as e:
                print(e)
                return

            constant_match = self._CONSTANT_RE.search(jsfile)
            if constant_match is None:
                print('Spider: constant not found in %s, skipping page' % jsfile_url)
                return
            constant = constant_match.group(1)

            # Feed the constant and each img-hash to parse() to get the image URL.
            bs_page = BeautifulSoup(page, "lxml")
            for item in bs_page.select('.img-hash'):
                try:
                    img_url = 'http:' + parse(item.text, constant)
                except ValueError as e:
                    # binascii.Error and UnicodeDecodeError are both ValueErrors;
                    # one undecodable hash should not abort the whole page.
                    print(e)
                    continue
                url_manager.addNewUrl(img_url)

        def run(self):
            """Process every page assigned to this thread, in order."""
            for page in self.pages:
                self.get_Page(page, self.proxies, self.url_manager)
    
    
    '''
    程序入口
    '''
    
    
    def main(amount):
        """Crawl up to ``amount`` list pages and download the images they reference.

        Pages are fetched one after another by following the "Older Comments"
        link, then split across at most 10 ``Spider`` threads that collect image
        URLs, and finally 10 ``Download`` threads fetch the images.

        Args:
            amount: maximum number of list pages to fetch.
        """
        url_manager = UrlManager()
        proxies = {'http': ''}  # proxy support has not been added yet

        current_url = 'http://jandan.net/ooxx'  # URL of the page being fetched
        # Regex escapes (\s) restored; the stripped pattern could never match.
        older_link_re = re.compile(r'.*Older\sComments"\shref="(.*?)"\sclass.*')

        pages = []  # HTML source of every page to be scanned
        for _ in range(amount):
            try:
                current_page = requests.get(current_url, headers=headers).text
            except requests.RequestException as e:
                # Keep whatever was fetched so far instead of hiding the failure.
                print(e)
                break
            pages.append(current_page)
            older_link = older_link_re.search(current_page)
            if older_link is None:
                break  # no older page: end of pagination
            current_url = 'http:' + older_link.group(1)

        # Split the pages across the scanning threads. Integer partitioning
        # covers every page; the previous int()-then-ceil arithmetic dropped the
        # trailing pages whenever len(pages) was not a multiple of the thread count.
        t_amount = min(10, len(pages))
        page_threads = []
        for i in range(t_amount):
            start = len(pages) * i // t_amount
            stop = len(pages) * (i + 1) // t_amount
            page_threads.append(Spider(pages[start:stop], proxies, url_manager))
        for t in page_threads:
            t.start()
        for t in page_threads:
            t.join()

        # A fixed pool of 10 threads downloads the images.
        img_threads = [Download(url_manager) for _ in range(10)]
        for t in img_threads:
            t.start()
        for t in img_threads:
            t.join()
    
    
    # Module-level lock used by Download.download_Img around creating the img
    # directory, choosing the next file name and writing the file.
    L = threading.Lock()
    
    '''
    图片下载类
    '''
    
    
    class Download(threading.Thread):
        """Worker thread that drains the URL manager and saves each image to ``img/``."""

        # Regex escapes (\., \w) restored; the stripped patterns matched on the
        # literal letter "w" and produced wrong or missing extensions.
        _GIF_RE = re.compile(r'(.*\.sinaimg\.cn/)(\w+)(/.+\.gif)')
        _EXT_RE = re.compile(r'.*(\.\w+)')
        # Makes "is the queue empty? then take one" atomic across Download
        # threads, so no thread blocks forever on a queue another one emptied.
        _queue_lock = threading.Lock()

        def __init__(self, url_manager):
            """Remember the URL manager and build this thread's request headers."""
            super().__init__()
            self.url_manager = url_manager
            # Copy the shared headers: adding Host to the module-level dict would
            # leak the image host into every later page request.
            self.pic_headers = dict(headers)
            self.pic_headers['Host'] = 'wx3.sinaimg.cn'

        def download_Img(self, url):
            """Download one image and store it as ``img/<count><extension>``.

            Args:
                url: image URL; for sinaimg GIFs the size segment is replaced by
                    ``large`` before downloading.

            Raises:
                requests.RequestException: if the download fails.
                OSError: if the file cannot be written.
            """
            gif_parts = self._GIF_RE.match(url)
            if gif_parts:
                url = gif_parts.group(1) + 'large' + gif_parts.group(3)

            ext_match = self._EXT_RE.match(url)
            extensionName = ext_match.group(1) if ext_match else ''  # file extension

            # Download outside the lock so the threads actually run in parallel.
            content = requests.get(url, headers=self.pic_headers).content

            # The file name is derived from the directory's current entry count,
            # so counting and writing must happen under the lock. ``with``
            # releases it even if writing fails.
            with L:
                os.makedirs('img', exist_ok=True)
                target = 'img/' + str(len(os.listdir('./img'))) + extensionName
                with open(target, 'wb') as f:
                    f.write(content)

        def run(self):
            """Download queued URLs until the queue is empty."""
            while True:
                with self._queue_lock:
                    if self.url_manager.isEmpty():
                        return
                    imgUrl = self.url_manager.getNewUrl()
                try:
                    self.download_Img(imgUrl)
                except (requests.RequestException, OSError) as e:
                    # One failed image must not stop this worker; the URL is not
                    # recorded as done, so a later run can retry it.
                    print(e)
                    continue
                self.url_manager.addOldUrl(imgUrl)
    
    
    '''
    url仓库,提供url更新以及记录功能
    '''
    
    
    class UrlManager:
        """URL store: a queue of pending URLs plus a persistent record of used ones.

        Used URLs are kept in ``url_used`` and mirrored, one per line, in
        ``url.txt`` in the current working directory.
        """

        def __init__(self):
            """Load previously used URLs from ``url.txt``, creating the file if absent."""
            self.url_used = []
            self.url_target = queue.Queue()
            if os.path.exists('url.txt'):
                with open('url.txt', 'r') as f:
                    self.url_used.extend(line.strip() for line in f)
            else:
                # Create the empty file and close the handle right away.
                with open('url.txt', 'w'):
                    pass

        def getNewUrl(self):
            """Remove and return the next pending URL (blocks while the queue is empty)."""
            return self.url_target.get()

        def isEmpty(self):
            """Return True when no URL is pending."""
            return self.url_target.empty()

        def addNewUrl(self, newUrl):
            """Queue ``newUrl`` unless it is already recorded as used."""
            if newUrl not in self.url_used:
                self.url_target.put(newUrl)

        def addOldUrl(self, oldUrl):
            """Record ``oldUrl`` as used, in memory and as a new line in ``url.txt``."""
            self.url_used.append(oldUrl)
            with open('url.txt', 'a') as f:
                f.write(oldUrl + '\n')
    
    
    if __name__ == '__main__':
        # Submit one main(amount) task per amount in 0..47 to a process pool and
        # wait for each in submission order. The ``with`` block shuts the pool
        # down once every result has been collected (or if one of them raises).
        with Pool() as pool:
            pending = [(amount, pool.apply_async(main, args=(amount,)))
                       for amount in range(48)]
            for amount, result in pending:
                result.get()
                # Report the task argument; formatting the AsyncResult object
                # itself only printed its repr.
                print('下载妹子(%s)' % amount)
    

      

    基于多线程,多进程(并且屏蔽了所有的错误,可以在上面扩展),谢谢!

  • 相关阅读:
    tcpdump 命令的常用选项:一
    Centos系统中 Systemd 的Unit文件配置说明
    如何使用PowerShell获取物理磁盘的信息
    tcpdump 命令的常用选项:二
    Google报告:大量被入侵的 Google Cloud 实例被用来挖掘加密货币
    Ubuntu中使用pdftk合并、分离PDF文档等操作
    tcpdump 命令的常用选项:三
    优麒麟Ubuntu Kylin 20.04 Pro SP1 上线
    为SSH登录设置电子邮件提醒
    图片上传并显示(兼容ie),图片大小判断
  • 原文地址:https://www.cnblogs.com/rianley/p/9254595.html
Copyright © 2011-2022 走看看