  • Scraping Website Data with Selenium


    Launching the Chrome browser

    The script below drives Chrome through the ten index pages of https://dynamic2.scrape.center, collects each movie's detail-page URL, parses the name, categories, cover, score, and synopsis from every detail page, and saves each record as a JSON file under results/.

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    import logging
    from urllib.parse import urljoin
    
    
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
    
    INDEX_URL = 'https://dynamic2.scrape.center/page/{page}'
    TIME_OUT = 20
    TOTAL_PAGE = 10
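    # A visible Chrome window is used by default; swap in the commented
    # lines below to run headless instead.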
    browser = webdriver.Chrome()
    # options = webdriver.ChromeOptions()
    # options.add_argument('--headless')
    # browser = webdriver.Chrome(options=options)
    wait = WebDriverWait(browser, TIME_OUT)
    
    
    def scrape_page(url, condition, locator):
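        """Load url and wait until condition(locator) holds; log on timeout."""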
        logging.info('scraping %s', url)
    
        try:
            browser.get(url)
            wait.until(condition(locator))
        except TimeoutException:
            logging.error('error occurred while scraping %s', url, exc_info=True)
    
    
    def scrape_index(page):
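        """Open index page number `page` and wait for the movie items to render."""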
        url = INDEX_URL.format(page=page)
        scrape_page(url, condition=EC.visibility_of_all_elements_located, locator=(By.CSS_SELECTOR, '#index .item'))
    
    
    def parse_index():
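        """Yield the absolute detail-page URL of every movie on the current index page."""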
        # the find_element(s)_by_* helpers were removed in Selenium 4; use find_elements with By
        elements = browser.find_elements(By.CSS_SELECTOR, '#index .item .name')
        for element in elements:
            href = element.get_attribute('href')
            yield urljoin(INDEX_URL, href)
    
    
    def scrape_detail(url):
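        """Open a detail page and wait for its title (h2) to become visible."""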
        scrape_page(url, condition=EC.visibility_of_element_located, locator=(By.TAG_NAME, 'h2'))
    
    
    def parse_detail():
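        """Extract name, categories, cover, score, and synopsis from the loaded detail page."""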
        url = browser.current_url
        name = browser.find_element(By.TAG_NAME, 'h2').text
        categories = [element.text for element in browser.find_elements(By.CSS_SELECTOR, '.categories button span')]
        cover = browser.find_element(By.CSS_SELECTOR, '.cover').get_attribute('src')
        score = browser.find_element(By.CLASS_NAME, 'score').text
        drama = browser.find_element(By.CSS_SELECTOR, '.drama p').text
        return {
            'url': url,
            'name': name,
            'categories': categories,
            'cover': cover,
            'score': score,
            'drama': drama
        }
    
    
    from os import makedirs
    import json
    RESULTS_DIR = 'results'
    # exist_ok avoids the need for a separate exists() check
    makedirs(RESULTS_DIR, exist_ok=True)
    
    
    def save_data(data):
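        """Write one movie record to results/<name>.json."""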
        name = data.get('name')
        data_path = f'{RESULTS_DIR}/{name}.json'
        # use a context manager so the file is closed even if dump raises
        with open(data_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
    
    
    def main():
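        """Crawl every index page, visit each detail page, and persist the parsed data."""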
        try:
            for page in range(1, TOTAL_PAGE + 1):
                scrape_index(page)
                detail_urls = parse_index()
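                # materialize the generator here: its elements go stale once
                # the browser navigates away from the index page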
                for detail_url in list(detail_urls):
                    logging.info('get detail url %s', detail_url)
                    scrape_detail(detail_url)
                    detail_data = parse_detail()
    
                    save_data(detail_data)
    
                    logging.info('detail data %s', detail_data)
        finally:
            # quit() shuts down the ChromeDriver process, not just the window
            browser.quit()
    
    
    if __name__ == '__main__':
        main()

    From the LaGou Education course 《52讲轻松搞定爬虫》 ("52 Lessons to Easily Master Web Scraping").

    For an alternative implementation based on pyppeteer, see: https://www.cnblogs.com/zhzhang/p/15152807.html
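    The linked post shows the full pyppeteer version; as a rough idea of its shape, here is a minimal sketch of fetching one index page with pyppeteer. The URL and selectors are carried over from the script above, and fetch_index is an illustrative name, not the linked post's exact code.

    import asyncio
    from pyppeteer import launch


    async def fetch_index():
        # launch the headless Chromium that ships with pyppeteer
        browser = await launch(headless=True)
        page = await browser.newPage()
        await page.goto('https://dynamic2.scrape.center/page/1')
        # same readiness condition as the Selenium version: wait for the items
        await page.waitForSelector('#index .item')
        # collect the absolute detail-page URLs inside the page context
        hrefs = await page.evaluate(
            "() => [...document.querySelectorAll('#index .item .name')].map(a => a.href)")
        await browser.close()
        return hrefs


    print(asyncio.get_event_loop().run_until_complete(fetch_index()))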

    Thanks.

  • Original post: https://www.cnblogs.com/zhzhang/p/15110608.html