  • Web scraping with selenium

    Overview

    The selenium module works by writing code that simulates human actions in a browser, triggering the corresponding browser events and then reading the resulting page. Compared with the requests module, selenium makes scraping dynamically loaded data much more convenient.

    Install selenium: pip install selenium -i https://pypi.douban.com/simple
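    A quick way to confirm the install worked (a minimal check, not from the original post) is to import the package and print its version:

    import selenium
    print(selenium.__version__)   # should print the installed selenium version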

    Usage:

    1: Use the built-in webdriver class to instantiate a browser object, diver (when instantiating the browser object, pass in the path of the browser driver). For example, to instantiate a Chrome browser object: diver = webdriver.Chrome(r'./chromedriver.exe')

    2: Use the methods of the instantiated object to simulate a human operating the browser, as in the sketch below
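    A minimal sketch of these two steps, assuming chromedriver.exe sits in the current directory and the Selenium 3.x API used throughout this post:

    from selenium import webdriver

    # step 1: instantiate a Chrome browser object with the path to the driver
    diver = webdriver.Chrome(r'./chromedriver.exe')
    # step 2: drive the browser through the object's methods
    diver.get("https://news.163.com")
    print(diver.title)
    diver.quit()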

    Commonly used built-in methods (a combined sketch follows this list):

    Open a page: diver.get("url to visit")

    Find an element: diver.find_element_by_id("id value") # finds by element id; id can be swapped for class, tag name, etc., much like finding elements in js; returns an element object obj

      Methods on obj:  obj.click() (click the element)

              obj.send_keys("text") (type text into the element)

    Get the page source: diver.page_source

    Close the browser: diver.close() / diver.quit()

    Execute js code: diver.execute_script("js code") # e.g. passing window.scrollTo(0,document.body.scrollHeight) as the js code makes the browser scroll down the page

    Take a screenshot: diver.save_screenshot("path and filename to save the image")

    Go forward: diver.forward()

    Go back: diver.back()

    Switch into an iframe: diver.switch_to.frame("the iframe tag")

    Press and hold the mouse button: first instantiate an action-chain object action = ActionChains(diver) # ActionChains is imported from selenium.webdriver

              then click and hold without releasing: action.click_and_hold(element) # presses the element and keeps the button held down

              move the element: action.move_by_offset(x,y) # drags the held element by an (x, y) offset; to drop it onto another element, use action.drag_and_drop(source_element, target_element)

              run the queued actions: action.perform()

    Get cookies: diver.get_cookies()
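    A combined sketch of these calls, assuming the Selenium 3.x find_element_by_* API, a local chromedriver.exe, and the news.163.com page used later in this post; the element ids in the commented-out lines are made-up placeholders for illustration only:

    from selenium import webdriver
    from selenium.webdriver import ActionChains

    diver = webdriver.Chrome(r'./chromedriver.exe')
    diver.get("https://news.163.com")                                       # open a page
    diver.execute_script('window.scrollTo(0,document.body.scrollHeight)')   # scroll to the bottom
    diver.save_screenshot("./page.png")                                     # screenshot of the current view
    html = diver.page_source                                                # full page source
    print(diver.get_cookies())                                              # cookies as a list of dicts

    # element interaction on placeholder ids (uncomment and adapt to a real page):
    # box = diver.find_element_by_id("search_input")
    # box.send_keys("selenium")                          # type into the element
    # diver.find_element_by_id("search_btn").click()     # click the element

    # drag with an action chain, e.g. a slider; "slider" is a placeholder id
    # slider = diver.find_element_by_id("slider")
    # action = ActionChains(diver)
    # action.click_and_hold(slider)                      # press and hold the element
    # action.move_by_offset(100, 0)                      # drag 100px to the right
    # action.release()
    # action.perform()                                   # run the queued actions

    diver.quit()                                         # close the browser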

    Using a headless browser

    By adding startup options you can make the selenium-controlled browser run in the background, with no window shown

    # imports needed for this snippet
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    # create an options object that tells chrome to start in headless (no-UI) mode
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')

    # instantiate a Chrome browser object, passing the headless options via chrome_options
    diver = webdriver.Chrome(r'./chromedriver.exe',chrome_options=chrome_options)

    Evading detection

    # adding this option lowers the risk of the site's server detecting the browser as an automated program

    # imports needed for this snippet
    from selenium import webdriver
    from selenium.webdriver import ChromeOptions

    # instantiate an options object and add the switch that hides the automation flag
    options = ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-automation'])

    bro = webdriver.Chrome(executable_path='./chromedriver.exe',options=options)
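    The two snippets above can also be combined on a single options object, which avoids passing chrome_options= and options= together (in older Selenium releases one of them silently overrides the other). A sketch assuming the same local chromedriver:

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    # one options object carrying both the headless flags and the detection-evasion switch
    opts = Options()
    opts.add_argument('--headless')
    opts.add_argument('--disable-gpu')
    opts.add_experimental_option('excludeSwitches', ['enable-automation'])

    diver = webdriver.Chrome(executable_path='./chromedriver.exe', options=opts)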

    Examples

    # Use selenium to crawl the news in the four sections ["国内", "国际", "军事", "航空"] of NetEase news

    # Use selenium to crawl the news in the four sections ["国内", "国际", "军事", "航空"] of NetEase news
    import random
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from lxml import etree
    from multiprocessing.dummy import Pool

    # run chrome headless (no browser window is opened)
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    # lower the chance of being detected as an automated script
    # (both settings live on one Options object here: passing chrome_options= and
    #  options= together lets one silently override the other in older Selenium releases)
    options.add_experimental_option('excludeSwitches', ['enable-automation'])

    # instantiate a Chrome browser object
    diver = webdriver.Chrome('chromedriver.exe', options=options)
    # open the target url
    diver.get("https://news.163.com")
    # get the page source
    response_text = diver.page_source

    # parse the source with lxml to pull out the four section urls
    tree = etree.HTML(response_text)
    guonei_url = tree.xpath('//li[@class="menu_guonei"]/a/@href')[0]
    guoji_url = tree.xpath('//li[@class="menu_guoji"]/a/@href')[0]
    war_url = tree.xpath('//li[@class="menu_war"]/a/@href')[0]
    hangkong_url = tree.xpath('//li[@class="menu_hangkong"]/a/@href')[0]
    diver.close()


    def get_new(url):
        '''Scroll to the bottom of a section page like a human user and return the full page source'''
        new_diver = webdriver.Chrome('chromedriver.exe', options=options)
        new_diver.get(url)
        js = 'window.scrollTo(0,document.body.scrollHeight)'
        check_bottom = new_diver.find_element_by_class_name("load_more_tip")
        # keep scrolling (and clicking "load more" when it shows up) until the
        # "no more content" tip becomes visible
        while check_bottom.get_attribute('style') == 'display: none;':
            new_diver.execute_script(js)
            obj = new_diver.find_element_by_class_name("post_addmore")
            if obj.get_attribute('style') == 'visibility: visible;':
                obj.click()
        new_diver.execute_script(js)
        # get the fully loaded page source
        response_text = new_diver.page_source
    #     filename = str(random.randint(1000,9999)) + ".html"
    #     with open(filename,"w",encoding="utf-8") as f:
    #         f.write(response_text)
        new_diver.close()
        return response_text

    def mark_url(html_text):
        '''Extract the detail-page urls of the individual news items'''
        mark_tree = etree.HTML(html_text)
        title_url_list = mark_tree.xpath('//div[@class="ndi_main"]/div/div/div/h3/a/@href')
        return title_url_list

    def get_new_detail(title_url_list):
        '''Crawl each news title and body and save them to a local file'''
        filename = str(random.randint(1000,9999)) + ".txt"
        with open(filename,"w",encoding="utf-8") as f:
            for title_url in title_url_list:
                detail_diver = webdriver.Chrome('chromedriver.exe', options=options)
                detail_diver.get(title_url)
                response_text = detail_diver.page_source
                detail_tree = etree.HTML(response_text)
                title = detail_tree.xpath('//div[@id="epContentLeft"]/h1/text()')[0]
                text = detail_tree.xpath('//div[@id="endText"]/p/text()')
                text = ''.join(text)
                f.write(title)
                f.write(text)
                detail_diver.close()

    # the four section urls to crawl
    url_list = [guonei_url,guoji_url,war_url,hangkong_url]
    # instantiate a thread pool
    pool = Pool(4)
    # use the thread pool to fetch the fully loaded page source of each section
    data_list = pool.map(get_new,url_list)
    # parse the detail-page url of every news item
    title_url_list = pool.map(mark_url,data_list)
    # crawl the news details
    pool.map(get_new_detail,title_url_list)

    # Use a thread pool to crawl videos from 梨视频 (pearvideo.com)

    # Use a thread pool to crawl 梨视频 videos (10 videos)
    import requests,re,random
    from lxml import etree
    from multiprocessing.dummy import Pool

    # use a Session object so connections are reused (instead of shadowing the requests module)
    session = requests.Session()
    url = 'https://www.pearvideo.com/category_4'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
        "connection":"close"
    }

    # fetch the category page and collect the detail-page urls of the videos
    response_text = session.get(url=url,headers=headers).text

    tree = etree.HTML(response_text)
    video_url_list = tree.xpath('//ul[@id="listvideoListUl"]/li/div/a/@href | //ul[@id="categoryList"]/li/div/a/@href')

    for i in range(len(video_url_list)):
        video_url_list[i] = 'https://www.pearvideo.com/' + video_url_list[i]

    '''
    The real mp4 address is embedded in the detail page's js, e.g.:
    srcUrl="https://video.pearvideo.com/mp4/third/20191023/cont-1615387-11549790-203859-hd.mp4",vdoUrl=srcUrl,
    so it is extracted with a regular expression below.
    '''
    def get_data_url(url):
        '''extract the real mp4 url from a video detail page'''
        response_text = session.get(url=url,headers=headers).text
        data_url = re.findall('srcUrl="(.*?)",vdoUrl=srcUrl,',response_text)[0]
        return data_url

    def get_data(data_url):
        '''download the video and save it under a random filename'''
        data = session.get(url=data_url,headers=headers).content
        filename = str(random.randint(1000,9999)) + ".mp4"
        with open(filename,"wb") as f:
            f.write(data)

    # thread pool: 5 workers fetch the mp4 urls, then download the videos
    pool = Pool(5)
    data_url_list = pool.map(get_data_url,video_url_list)
    pool.map(get_data,data_url_list)
  • Original post: https://www.cnblogs.com/mark--ping/p/11773420.html