  • [Python3] [Scraper] Bilibili Photography Section

    0x00 Preparation

    1. The young ladies on Bilibili are super pretty (just kidding, this really is a technical post).
    2. First time scraping a dynamic site: the idea is simply to simulate browser access (a minimal sketch follows).
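
    A minimal sketch of that idea (assuming chromedriver is already set up; see 0x01 below): Selenium drives a real browser, so the page's JavaScript actually runs, and page_source returns the rendered DOM rather than the bare initial HTML.

    from selenium import webdriver
    from time import sleep

    browser = webdriver.Chrome()           # launch a real Chrome instance
    browser.get('https://h.bilibili.com')  # the page's JS runs as in normal browsing
    sleep(2)                               # give asynchronously loaded content time to arrive
    rendered = browser.page_source         # the DOM after all scripts have run
    browser.quit()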

    0x01 Environment

    1. Python 3, nothing more to say.
    2. Fixes for selenium installation errors:

    Install:
    pip install selenium
    
    Error 1:
    During handling of the above exception, another exception occurred:
    Fixed by installing chromedriver (the driver's major version should match your installed Chrome).
    
    Error 2:
    FileNotFoundError: [WinError 2] The system cannot find the file specified.
    
    from selenium import webdriver
    #driver = webdriver.Chrome("C:Development&GwjEnvironmentchromedriver.exe")  #错误的路径 使用正斜杠
    driver = webdriver.Chrome("C:/Development/&GwjEnvironment/chromedriver.exe") #正确的路径 使用反斜杠
    driver.get("http://www.baidu.com")
    
    Fix:
    Open C:\Development\Python36\Lib\subprocess.py, find every occurrence of shell=False, and change it to shell=True.
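
    A less invasive alternative (my own sketch, not from the original fix): instead of patching the standard library, put the chromedriver directory on PATH for the current process, or pass the path explicitly as in the snippet above.

    import os
    from selenium import webdriver

    # Sketch: prepend the directory containing chromedriver.exe to PATH.
    # The directory below is the example path used elsewhere in this post.
    os.environ["PATH"] = "C:/Development/&GwjEnvironment" + os.pathsep + os.environ["PATH"]
    driver = webdriver.Chrome()  # found via PATH, no explicit path needed
    driver.get("http://www.baidu.com")
    driver.quit()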

    3. You also need the debug driver matching your browser.
    For my Chrome that is chromedriver_win32.
    4. As for the required libraries, look at the imports in the code below (they can be installed with pip, as shown next).
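
    For reference, the third-party libraries the code imports (selenium, bs4, requests) can be installed in one go:

    pip install selenium beautifulsoup4 requests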

    0x02 Grab the Pics and Run

    OK, fine, this really isn't a technical post.

    Code 2 (via the JSON API):

    # Scrapes Bilibili's hot cosplay photography section
    import os
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import ssl
    from time import sleep
    import requests
    import random
    import re
    import json
    
    UserAgent_List = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    headers = {'User-Agent': random.choice(UserAgent_List),
               'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
               'Accept-Encoding': 'gzip',
               }
    base_url = 'https://h.bilibili.com/eden/picture_area#/cos/hot'
    pic_save_path = "output/"
    
    # Open a browser and simulate the requests
    def browser_get(pageNum):
        browser = webdriver.Chrome()
        browser.get(base_url)
        h = int(int(pageNum)/20)  # the list seems to lazy-load ~20 entries per scroll, hence the division
        for i in range(h):
            browser.execute_script("window.scrollBy(0,3000)")  # scroll down to trigger lazy loading
            sleep(2)  # give the next batch time to load
        html_text = browser.page_source
        soup = BeautifulSoup(html_text,'html.parser')
        urls = soup.find('div',{'class':'area-wrapper'}).findAll('h3',{'class':'article-title'})
        # print(len(urls))
        count = 1
        for url in urls:
            test = 'https://api.vc.bilibili.com/link_draw/v1/doc/detail?doc_id='+re.sub(r'\D',"",url.a['href'])  # strip non-digits to get the doc id (first taste of regex)
            browser.get(test)
            js = browser.page_source  # how do I get the JSON directly??? (see the requests sketch after this listing)
            # print(js)
            sleep(2)  # wait for the asynchronous load
            get_meizi_url(js)
            count += 1
            if count>int(pageNum):break
        browser.quit()
    
    # Grab the young-lady pics from a single post
    def get_meizi_url(js):
        # print(js)
        soup = BeautifulSoup(js, 'html.parser')
        text = soup.find('pre').string  # Chrome wraps the raw JSON response in a <pre> tag
        hhh = json.loads(text)
        title = "".join(hhh['data']['item']['title'])  # same result as the original character-by-character loop
        if not os.path.exists(title):
            os.makedirs(title)
        else:
            return  # directory already exists: this post was downloaded before, skip it
        print(title)
        count = 1
        for i in hhh['data']['item']['pictures']:
            print(i['img_src'])
            qaq = re.search(r'(jpg)|(webp)|(png)|(jpeg)',i['img_src'])  # pick the real file extension out of the URL
            filename = '%s/%s/%s.%s'%(os.path.abspath('.'),title,count,qaq.group())
            with open(filename,'wb+')as qwq:
                qwq.write(requests.get(i['img_src'],headers=headers).content)  # download and save the image
            count += 1
        return 
    
    if __name__ == '__main__':
        ssl._create_default_https_context = ssl._create_unverified_context  # work around HTTPS certificate verification
        pageNum = input(u'How many posts of young-lady pics would you like: ')
        #if not os.path.exists(pic_save_path):
        #    os.makedirs(pic_save_path)
        browser_get(pageNum)
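
    On the question in browser_get about getting the JSON directly: a sketch of one answer, under the assumption that the detail endpoint also serves plain JSON to non-browser clients, is to skip Chrome entirely and call the API with requests (passing in the headers dict defined above):

    # Sketch (assumption: the API accepts plain HTTP clients).
    # Fetch the detail endpoint with requests and parse the JSON directly,
    # instead of rendering it in Chrome and digging the text out of a <pre> tag.
    import requests

    def fetch_detail(doc_id, headers):
        url = 'https://api.vc.bilibili.com/link_draw/v1/doc/detail?doc_id=' + doc_id
        resp = requests.get(url, headers=headers)
        resp.raise_for_status()
        return resp.json()  # already-parsed JSON: no BeautifulSoup, no sleep()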

    Code 1 (plain HTML):

    # Scrapes Bilibili's hot cosplay photography section
    # BUG around line 72: Bilibili serves images in varying formats, so the hardcoded .jpg extension needs a regex match (as in Code 2); left as a TODO, see the sketch after this listing.
    import os
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import ssl
    from time import sleep
    import requests
    import random
    
    UserAgent_List = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    headers = {'User-Agent': random.choice(UserAgent_List),
               'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
               'Accept-Encoding': 'gzip',
               }
    base_url = 'https://h.bilibili.com/eden/picture_area#/cos/hot'
    pic_save_path = "output/"
    
    # Open a browser and simulate the requests
    def browser_get(pageNum):
        browser = webdriver.Chrome()
        browser.get(base_url)
        h = int(int(pageNum)/20)  # the list seems to lazy-load ~20 entries per scroll, hence the division
        for i in range(h):
            browser.execute_script("window.scrollBy(0,3000)")  # scroll down to trigger lazy loading
            sleep(2)  # give the next batch time to load
        html_text = browser.page_source
        soup = BeautifulSoup(html_text,'html.parser')
        urls = soup.find('div',{'class':'area-wrapper'}).findAll('h3',{'class':'article-title'})
        # print(len(urls))
        count = 1
        for url in urls:
            browser.get('https://'+url.a['href'])
            sleep(2)  # wait for the asynchronous load
            html = browser.page_source
            get_meizi_url(html)
            count += 1
            if count>int(pageNum):break
        browser.quit()
    
    # Grab the young-lady pics from a single post
    def get_meizi_url(html):
        # print(html)
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.find('h1',attrs={'class':'article-title dp-i-block v-middle'}).string
        if not os.path.exists(title):
            os.makedirs(title)
        print(title)
        href = soup.find('div',attrs={'class':'images'}).findAll('img')
        count = 1
        for a in href:
            print(a['src'])
            filename = '%s/%s/%s.jpg'%(os.path.abspath('.'),title,count)  # BUG: extension hardcoded to .jpg (see the note at the top and the fix sketched below)
            with open(filename,'wb+')as qwq:
                qwq.write(requests.get(a['src'],headers=headers).content)
            count += 1
    
    if __name__ == '__main__':
        ssl._create_default_https_context = ssl._create_unverified_context  # work around HTTPS certificate verification
        pageNum = input(u'How many posts of young-lady pics would you like: ')
        #if not os.path.exists(pic_save_path):
        #    os.makedirs(pic_save_path)
        browser_get(pageNum)
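
    And for the BUG flagged at the top of Code 1, a sketch of the fix, reusing the extension regex from Code 2 (the jpg fallback is my own assumption):

    import re

    def guess_extension(img_src):
        # Pull the real file extension out of the image URL instead of hardcoding .jpg.
        match = re.search(r'(jpg)|(webp)|(png)|(jpeg)', img_src)
        return match.group() if match else 'jpg'  # fall back to jpg if nothing matches

    # In get_meizi_url, the filename line then becomes:
    # filename = '%s/%s/%s.%s' % (os.path.abspath('.'), title, count, guess_extension(a['src']))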

    ok

    Anyway, I grabbed the pics and got away clean. If the code breaks, contact the blogger; any experts willing to help me work out the JS are welcome.

  • Original post: https://www.cnblogs.com/gwj1314/p/9444888.html