zoukankan      html  css  js  c++  java
  • 简单小练习_微博爬取

    # start_chrome -> input_date -> scroll_down-> find_cards_info -> save -> find_next (goto)
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    import time
    import csv
    import os
    
    # Before running, download ChromeDriver from https://sites.google.com/a/chromium.org/chromedriver/downloads (click "Latest Release: ChromeDriver x.xx" to reach the download page).
    
    def start_chrome():
        """Launch Chrome through the chromedriver binary in the current directory.

        Returns the created WebDriver instance after calling its
        ``start_client()`` method.
        """
        # On Windows the path must be written as './chromedriver.exe'.
        chrome = webdriver.Chrome(executable_path='./chromedriver')
        chrome.start_client()
        return chrome
    # weibo.com/sxbg? + start_t ...
    
    def q(st,et):
        """Build the search query string for a start/end time pair.

        ``st`` and ``et`` are interpolated verbatim into the ``start_time`` and
        ``end_time`` parameters; the remaining parameters are fixed.
        """
        query = f'?is_ori=1&key_word=&start_time={st}&end_time={et}&is_search=1&is_searchadv=1#_0'
        return query
    
    def scroll_down():
        """Send the END key to the page's <html> element 15 times.

        Uses the module-level ``driver``. Each key press is preceded by printing
        the zero-based step number and followed by a 0.6 second pause.
        """
        page_root = driver.find_element_by_tag_name('html')
        step = 0
        while step < 15:
            print(step)
            page_root.send_keys(Keys.END)
            time.sleep(0.6)
            step += 1
    
    def find_cards_info():
        """Collect content, time text and link from every feed card on the page.

        Uses the module-level ``driver`` to find all ``div.WB_feed_detail``
        elements and returns a list of ``[content, time, link]`` lists, one per
        card, in page order. The lookups use ``find_element_by_css_selector``,
        so a card lacking any of the three sub-elements propagates whatever
        that call raises.
        """
        # The selectors are the same for every card, so define them once.
        content_css = 'div.WB_text.W_f14'
        stamp_css = 'div.WB_from.S_txt2'
        link_css = 'div.WB_from.S_txt2 > a:nth-child(1)'

        rows = []
        for feed in driver.find_elements_by_css_selector('div.WB_feed_detail'):
            text = feed.find_element_by_css_selector(content_css).text
            posted = feed.find_element_by_css_selector(stamp_css).text
            href = feed.find_element_by_css_selector(link_css).get_attribute('href')
            rows.append([text, posted, href])
        return rows
    
    def find_next():
        """Return the href of the first ``a.page.next`` element, or None.

        Uses the module-level ``driver``. ``None`` is returned when no element
        matches the selector.
        """
        matches = driver.find_elements_by_css_selector('a.page.next')
        if not matches:
            return None
        return matches[0].get_attribute('href')
    
    def save(info_list,name):
        """Append rows to ``./<name>.csv``, creating the file if it is missing.

        Args:
            info_list: iterable of rows (each row an iterable of fields) passed
                to ``csv.writer.writerows``.
            name: file name without extension, e.g. ``2018-01-02~2018-03-05``.

        Prints ``Done`` after the rows are written.
        """
        full_path = './' + name + '.csv' # 2018-01-02~2018-03-05.csv
        # Mode 'a' creates the file when absent, so no existence check is needed.
        # newline='' is required by the csv module; without it Windows output
        # gets a blank line after every row. An explicit UTF-8 encoding keeps
        # non-ASCII text from failing under a non-UTF-8 locale default.
        with open(full_path, 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerows(info_list)
        print('Done')
    
    
    def run_crawler(base,duration):
        """Crawl one page, save its cards, then follow "next" links until none remain.

        Args:
            base: URL to open first. If it does not end with ``'feedtop'``, the
                query string built by ``q`` from ``duration`` is appended.
            duration: ``'<start>~<end>'`` string, e.g. ``2018-01-02~2018-03-05``;
                also used as the CSV file name passed to ``save``.

        Pages are processed in a loop rather than by recursion, so a long
        timeline cannot exhaust Python's recursion limit.
        """
        url = base
        while True:
            if url.endswith('feedtop'):
                # NOTE(review): presumably "next page" hrefs end with 'feedtop'
                # and already carry the search query -- confirm against the site.
                driver.get(url)
            else:
                st, et = duration.split('~')
                driver.get(url + q(st, et))
            time.sleep(5)  # wait for the page to load before scrolling
            scroll_down()
            time.sleep(5)  # wait for content loaded by the scrolling
            save(find_cards_info(), duration)
            url = find_next()
            if not url:
                break
    
    # Script entry: open the browser, wait for the operator, then crawl.
    base = 'https://weibo.com/erick'   # replace with the Weibo user you want to crawl
    # Module-level driver shared by scroll_down, find_cards_info, find_next and run_crawler.
    driver = start_chrome()
    # Blocks until Enter is pressed on stdin.
    # NOTE(review): presumably gives the user time to log in manually in the opened browser -- confirm.
    input()
    run_crawler(base, '2017-06-20~2018-02-03')   # replace with the time range you want to crawl
  • 相关阅读:
    服务器架构前面加了防火墙,Nginx如何获取客户端真实ip???
    Prometheus学习笔记(5)Grafana可视化展示
    Prometheus学习笔记(4)什么是pushgateway???
    Prometheus学习笔记(3)什么是node_exporter???
    Prometheus学习笔记(2)Prometheus部署
    Prometheus学习笔记(1)Prometheus架构简介
    Centos 7 中的ulimit -n 65535 对进程的文件句柄限制不生效??
    Jenkins实用发布与回滚PHP项目生产实践
    Ansible入门笔记(3)之Playbook
    Ansible入门笔记(2)之常用模块
  • 原文地址:https://www.cnblogs.com/Erick-L/p/9253418.html
Copyright © 2011-2022 走看看