zoukankan      html  css  js  c++  java
  • datawhale爬虫task04

    # 实战大项目:模拟登录丁香园,并抓取论坛页面所有的人员基本信息与回复帖子内容。
    #
    # 丁香园论坛:http://www.dxy.cn/bbs/thread/626626#626626 。
    # 丁香园用户名:xxxx
    # 密码:ABcd1234
    
    import time

    from lxml import etree
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    class DingxiangyuanLogin:
        """Log in to DXY (dxy.cn) with Selenium and scrape one forum thread.

        ``run()`` drives a Chrome browser through the account/password login
        form, opens the target thread and extracts, for every reply, the
        author name, the author rank/info block and the reply text.
        """

        # User-Agent string handed to Chrome on the command line.
        USER_AGENT = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        )
        LOGIN_URL = "https://auth.dxy.cn/accounts/login?"
        THREAD_URL = "http://www.dxy.cn/bbs/thread/626626#626626"
        # Fixed pause (seconds) used after each navigation/click so the page
        # has time to render before the next element lookup.
        PAGE_WAIT_SECONDS = 3

        def run(self, username='xxxx', password='ABcd1234'):
            """Log in, load the thread page and return the parsed replies.

            Args:
                username: DXY account name typed into the login form.
                password: Password typed into the login form.

            Returns:
                A list of dicts, one per reply, with the keys ``auth_name``,
                ``auth_rank`` and ``reply_content`` (all stripped strings).
            """
            options = webdriver.ChromeOptions()
            # The value is passed unquoted: quotes would be sent as part of the
            # User-Agent string itself.
            options.add_argument('user-agent=' + self.USER_AGENT)
            chrome_driver = webdriver.Chrome(options=options)
            try:
                self._login(chrome_driver, username, password)
                chrome_driver.get(self.THREAD_URL)
                return self._parse_replies(chrome_driver.page_source)
            finally:
                # Always shut the browser down, even when a lookup fails,
                # so no Chrome/chromedriver process is left behind.
                chrome_driver.quit()

        def _login(self, driver, username, password):
            """Fill in and submit the account/password form on the login page."""
            driver.get(self.LOGIN_URL)
            time.sleep(self.PAGE_WAIT_SECONDS)
            # The second link in the tab strip switches to the "computer login"
            # (account + password) form.
            tab_strip = driver.find_element(By.CLASS_NAME, 'login__tab_wp')
            tab_strip.find_elements(By.TAG_NAME, 'a')[1].click()
            time.sleep(self.PAGE_WAIT_SECONDS)
            driver.find_element(By.NAME, 'username').send_keys(username)
            driver.find_element(By.NAME, 'password').send_keys(password)
            driver.find_element(By.CLASS_NAME, 'form__button').click()
            # Give the login request time to finish before navigating away;
            # otherwise the thread may be fetched while still logged out.
            time.sleep(self.PAGE_WAIT_SECONDS)

        @staticmethod
        def _first_text(node, query):
            """Return the stripped text of the first XPath match, or None."""
            matches = node.xpath(query)
            if not matches:
                return None
            return matches[0].xpath('string(.)').strip()

        def _parse_replies(self, page_source):
            """Extract author name, rank and content for each reply in the HTML."""
            reply_list = []
            xpath_data = etree.HTML(page_source)
            if xpath_data is None:
                # Nothing parseable (e.g. empty page source).
                return reply_list
            # Every reply lives in a div whose id starts with "post_".
            replies = xpath_data.xpath('//div[starts-with(@id, "post_")]')
            print("replies: " + str(replies))
            for reply in replies:
                print('reply: ' + str(reply))
                auth_name = self._first_text(reply, './/div[@class="auth"]')
                auth_rank = self._first_text(reply, './/div[@class="info clearfix"]')
                reply_content = self._first_text(reply, './/td[@class="postbody"]')
                if auth_name is None or auth_rank is None or reply_content is None:
                    # Not a complete reply node (some "post_*" divs may lack
                    # these children); skip it instead of raising IndexError.
                    continue
                print("auth_rank: " + str(auth_rank))
                print('reply_content: ' + str(reply_content))
                reply_list.append({
                    'auth_name': auth_name,
                    'auth_rank': auth_rank,
                    'reply_content': reply_content,
                })
            return reply_list
    
    
    
    
    # Script entry: build the scraper and run the login-and-scrape flow once.
    scraper = DingxiangyuanLogin()
    scraper.run()
  • 相关阅读:
    QT *.pri 语法学习
    qt 的相对路径说法
    openwrt 时间更新
    openwrt network 初始化
    在线配置热加载配置 go-kratos.dev 监听key 通过atomic.Value支持自动热加载
    Monkey patching
    UDP flood UDP Port Denial-of-Service Attack
    一例 Go 编译器代码优化 bug 定位和修复解析
    t
    golang 网络编程之如何正确关闭tcp连接以及管理它的生命周期 socket
  • 原文地址:https://www.cnblogs.com/tommyngx/p/11343195.html
Copyright © 2011-2022 走看看