  • Scraping Website Data with Selenium


    Driving the Chrome browser

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    import logging
    from urllib.parse import urljoin
    
    
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
    
    # index page URL template, explicit-wait timeout (seconds) and number of index pages to crawl
    INDEX_URL = 'https://dynamic2.scrape.center/page/{page}'
    TIME_OUT = 20
    TOTAL_PAGE = 10
    browser = webdriver.Chrome()
    # to run without a visible browser window, build the driver from headless options instead:
    # options = webdriver.ChromeOptions()
    # options.add_argument('--headless')
    # browser = webdriver.Chrome(options=options)
    wait = WebDriverWait(browser, TIME_OUT)
    
    
    def scrape_page(url, condition, locator):
        # load url and block until the given expected condition holds for locator;
        # timeouts are logged and swallowed so the crawl can continue with the next page
        logging.info('scraping %s', url)

        try:
            browser.get(url)
            wait.until(condition(locator))
        except TimeoutException:
            logging.error('error occurred while scraping %s', url, exc_info=True)
    
    
    def scrape_index(page):
        # load index page `page` and wait until all movie items are visible
        url = INDEX_URL.format(page=page)
        scrape_page(url, condition=EC.visibility_of_all_elements_located, locator=(By.CSS_SELECTOR, '#index .item'))
    
    
    def parse_index():
        # pull the detail-page links from the freshly loaded index page
        # (Selenium 4 locator style; find_elements_by_css_selector was removed in Selenium 4)
        elements = browser.find_elements(By.CSS_SELECTOR, '#index .item .name')
        for element in elements:
            href = element.get_attribute('href')
            yield urljoin(INDEX_URL, href)
    
    
    def scrape_detail(url):
        # load a detail page and wait until its title (the h2 node) is visible
        scrape_page(url, condition=EC.visibility_of_element_located, locator=(By.TAG_NAME, 'h2'))
    
    
    def parse_detail():
        # extract the fields of the currently loaded detail page
        # (Selenium 4 locators; the find_element_by_* helpers were removed in Selenium 4)
        url = browser.current_url
        name = browser.find_element(By.TAG_NAME, 'h2').text
        categories = [element.text for element in browser.find_elements(By.CSS_SELECTOR, '.categories button span')]
        cover = browser.find_element(By.CSS_SELECTOR, '.cover').get_attribute('src')
        score = browser.find_element(By.CLASS_NAME, 'score').text
        drama = browser.find_element(By.CSS_SELECTOR, '.drama p').text
        return {
            'url': url,
            'name': name,
            'categories': categories,
            'cover': cover,
            'score': score,
            'drama': drama
        }
    
    
    from os import makedirs
    from os.path import exists
    import json
    RESULTS_DIR = 'results'
    # create the output directory if it does not exist yet
    exists(RESULTS_DIR) or makedirs(RESULTS_DIR)
    
    
    def save_data(data):
        # write one JSON file per movie, named after its title
        name = data.get('name')
        data_path = f'{RESULTS_DIR}/{name}.json'
        with open(data_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
    
    
    def main():
        try:
            for page in range(1, TOTAL_PAGE + 1):
                scrape_index(page)
                detail_urls = parse_index()
                for detail_url in list(detail_urls):
                    logging.info('get detail url %s', detail_url)
                    scrape_detail(detail_url)
                    detail_data = parse_detail()
    
                    save_data(detail_data)
    
                    logging.info('detail data %s', detail_data)
        finally:
            # quit() also shuts down the chromedriver process (close() only closes the current window)
            browser.quit()
    
    
    if __name__ == '__main__':
        main()
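
    Each movie ends up as its own JSON file under results/. As a quick sanity check (this snippet is not part of the original course code), the saved records can be read back like this:

    import json
    from pathlib import Path

    # iterate over the per-movie JSON files written by save_data()
    for path in Path('results').glob('*.json'):
        with open(path, encoding='utf-8') as f:
            item = json.load(f)
        # each record carries the fields returned by parse_detail()
        print(item['name'], item['score'], item['categories'])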

    Adapted from the Lagou Education course《52讲轻松搞定爬虫》(52 Lessons to Easily Master Web Crawlers).

    An alternative implementation based on pyppeteer is described here: https://www.cnblogs.com/zhzhang/p/15152807.html
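
    Purely for orientation, and assuming pyppeteer is installed, a rough sketch of the same index-page step in pyppeteer (not the code from the linked post) might look like this:

    import asyncio
    from pyppeteer import launch

    async def scrape_index_page(page_number):
        # launch a headless Chromium, open the index page and wait for the items to render
        browser = await launch(headless=True)
        page = await browser.newPage()
        await page.goto(f'https://dynamic2.scrape.center/page/{page_number}')
        await page.waitForSelector('#index .item')
        # collect the detail-page links with in-page JavaScript
        hrefs = await page.evaluate(
            "() => [...document.querySelectorAll('#index .item .name')].map(a => a.href)")
        await browser.close()
        return hrefs

    print(asyncio.get_event_loop().run_until_complete(scrape_index_page(1)))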

    Thanks.

  • Original post: https://www.cnblogs.com/zhzhang/p/15110608.html