zoukankan      html  css  js  c++  java
  • Selenium3+python3自动化(四十三)--爬取我的博客园粉丝的名称,并写入.txt文件

     爬取目标

    1.爬取目标网站:我的博客:https://home.cnblogs.com/u/canglongdao/followers/

    2.爬取内容:爬取我的博客的所有粉丝的名称,并保存到txt文件

    3.由于博客园登录时需要人机验证,所以无法直接使用账号密码自动登录

    4.可以先使用selenium代码,在需要输入验证码处,停留几秒,手动验证

    5.获取登录成功后的cookies,并复制其中用于保持登录状态的cookie(字典格式)

    代码如下:

    # coding:utf-8
    from selenium import webdriver
    import time
    # Open the cnblogs sign-in page in a fresh Chrome session.
    driver = webdriver.Chrome()
    driver.get("https://account.cnblogs.com/signin")

    # Cookies before signing in, printed for comparison with the post-login set.
    nlc = driver.get_cookies()
    print(len(nlc), nlc)

    # find_element(by, value) works on both Selenium 3 and Selenium 4; the
    # find_element_by_* helpers used previously were removed in Selenium 4.3.
    driver.find_element("id", "mat-input-0").send_keys("xxx@qq.com")
    driver.find_element("id", "mat-input-1").send_keys("P@ssw0rd")
    driver.find_element("xpath", "//span[@class='mat-button-wrapper']").click()

    # Pause so the human verification step can be completed by hand.
    time.sleep(6)

    # Cookies after signing in; the login-related entries are copied from this
    # output for later reuse.
    lc = driver.get_cookies()
    print(len(lc), lc)
    

     运行结果:

    5 [{'domain': 'account.cnblogs.com', 'expiry': 1599210767, 'httpOnly': False, 'name': '4271c12252a544478175bac9772afc3d', 'path': '/', 'secure': False, 'value': '010720fb-f7e4-4f4b-b206-0e991ecf6f5b'}, {'domain': 'account.cnblogs.com', 'httpOnly': False, 'name': 'SERVERID', 'path': '/', 'secure': False, 'value': 'daace45bf36fef87f4742d8b633fdae3|1599208967|1599208966'}, {'domain': 'account.cnblogs.com', 'httpOnly': True, 'name': '.Cnblogs.Account.Session', 'path': '/', 'secure': False, 'value': 'CfDJ8K5MrGQfPjpFvRyctF%2BQEQcGP%2Bk0zUGAelDZwKkZE07wn7bgYbw56biK9%2FwoxKcs%2FmFFb%2B21xAjYxIXXQJeai7NvLoyDfgSr45CxhE9nwRKokI1nqtUdlD5wk2MHtHUO4kIFOTpe9gzKU%2F%2BDs%2B65eSMPAU62bfOS86QdUoNXH5qL'}, {'domain': 'account.cnblogs.com', 'httpOnly': False, 'name': 'XSRF-TOKEN', 'path': '/', 'secure': False, 'value': 'CfDJ8K5MrGQfPjpFvRyctF-QEQcyb192CHuZwpo_t9r1Ps07m_GVYNh15x2atqF3hGcynCnlxxqVFCWmUT5OqBV0zfYfYC3BjZ-7WUDux6AI1xLaMad3ETT6_MyakbxByaS76Nim_y5-i1_oX0aBl2U91xs'}, {'domain': 'account.cnblogs.com', 'httpOnly': True, 'name': '.Cnblogs.Account.Antiforgery', 'path': '/', 'secure': False, 'value': 'CfDJ8K5MrGQfPjpFvRyctF-QEQek9XiBoWIQkti8vvTbpqx-CFIWKb39vrCeVudMwHbcPXBWb8LBrlnlM0JzKwWlUlgaD5ioMqre_sd1nEFtrTGhAMmUsVWYxYta1gs4DkuYVinqEL6omAaSnZIJhoxLfp8'}]
    5 [{'domain': '.cnblogs.com', 'expiry': 1599295374, 'httpOnly': False, 'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.683535015.1599208975'}, {'domain': '.cnblogs.com', 'expiry': 1662280974, 'httpOnly': False, 'name': '_ga', 'path': '/', 'secure': False, 'value': 'GA1.2.1985506889.1599208975'}, {'domain': '.cnblogs.com', 'expiry': 1600504974, 'httpOnly': True, 'name': '.CNBlogsCookie', 'path': '/', 'secure': False, 'value': 'EB2AB3A42B8CEE723552C8644CAD13CEFA311FA3955FDB7A25A33EEB87199C843967F5791CB012543FD9AC374F535F23C228D4AC5E0373CAA6855768E5713BDF88D82BB97C38A668CDDEB72E0D5055467339189E'}, {'domain': '.cnblogs.com', 'expiry': 1599209034, 'httpOnly': False, 'name': '_gat', 'path': '/', 'secure': False, 'value': '1'}, {'domain': '.cnblogs.com', 'expiry': 1600504974, 'httpOnly': True, 'name': '.Cnblogs.AspNetCore.Cookies', 'path': '/', 'secure': False, 'value': 'CfDJ8K5MrGQfPjpFvRyctF-QEQcWJIxPwl8mHGJMd1DItl4C_m7X5ixG1-4yGpDWsiv3n9Iung2Yxk7eaHqXJ1rAGWYXeQF3OyXzSXfYHkPc7A7RPLekmvNk0dNucu8ssSF7ldaY1Nqsnx-q9O3U6JpZ_GCz8ed5jwuq1g8V_StxqpEq2ell4jFdrMmgA1GQudbiFYE1aPVcf1Rs5U7xUJ6UjMJijwG3_OAfQJ9DSibuDqYuhvaS0wwbR6OUfQIBI6NFDdwXz5GL0wJZ82wmPjkKKrrX3ADNm1jsdJxb9fceZC2CfDC2aqe-XotiNwzbsA2vhkDpB5m3JOLYA_P7mWfSexjGKs6ii9E2fNjgYgqZA8TG-1CqvApZjzkCgWklntSP71W5Xrc8zSNkRPiSuoMEKtzVecH65t9utYA2ZneK-mVParwkydH3_hcx1l03CYj6p7HP33S5MsWtvDagWN3waRPfRtdUx2KTDUTKl0Rpt-Gb1cL8RWSctfQxrg8gGKmWYwGqoPhLcDmtPc7D1C6EmZaxp61YODRup2mIzFdRdCvoU8F3Ll9Tsgb8ja7gHra03g'}]
    

     添加登录的cookie,并获取粉丝名称

    # coding:utf-8
    from selenium import webdriver
    import time
    driver=webdriver.Chrome()
    
    #登录成功后,从获取到的cookie中手动复制以下两项,分别赋值给c1、c2
    c1={'domain': '.cnblogs.com', 'expiry': 1600504974, 'httpOnly': True, 'name': '.CNBlogsCookie', 'path': '/', 'secure': False, 'value': 'EB2AB3A42B8CEE723552C8644CAD13CEFA311FA3955FDB7A25A33EEB87199C843967F5791CB012543FD9AC374F535F23C228D4AC5E0373CAA6855768E5713BDF88D82BB97C38A668CDDEB72E0D5055467339189E'}
    c2={'domain': '.cnblogs.com', 'expiry': 1600504974, 'httpOnly': True, 'name': '.Cnblogs.AspNetCore.Cookies', 'path': '/', 'secure': False, 'value': 'CfDJ8K5MrGQfPjpFvRyctF-QEQcWJIxPwl8mHGJMd1DItl4C_m7X5ixG1-4yGpDWsiv3n9Iung2Yxk7eaHqXJ1rAGWYXeQF3OyXzSXfYHkPc7A7RPLekmvNk0dNucu8ssSF7ldaY1Nqsnx-q9O3U6JpZ_GCz8ed5jwuq1g8V_StxqpEq2ell4jFdrMmgA1GQudbiFYE1aPVcf1Rs5U7xUJ6UjMJijwG3_OAfQJ9DSibuDqYuhvaS0wwbR6OUfQIBI6NFDdwXz5GL0wJZ82wmPjkKKrrX3ADNm1jsdJxb9fceZC2CfDC2aqe-XotiNwzbsA2vhkDpB5m3JOLYA_P7mWfSexjGKs6ii9E2fNjgYgqZA8TG-1CqvApZjzkCgWklntSP71W5Xrc8zSNkRPiSuoMEKtzVecH65t9utYA2ZneK-mVParwkydH3_hcx1l03CYj6p7HP33S5MsWtvDagWN3waRPfRtdUx2KTDUTKl0Rpt-Gb1cL8RWSctfQxrg8gGKmWYwGqoPhLcDmtPc7D1C6EmZaxp61YODRup2mIzFdRdCvoU8F3Ll9Tsgb8ja7gHra03g'}
    # Load a cnblogs page first, then attach the two saved login cookies
    # (c1 and c2 are scoped to the '.cnblogs.com' domain).
    driver.get("https://account.cnblogs.com/signin")
    driver.add_cookie(c1)
    driver.add_cookie(c2)
    time.sleep(3)

    # With the login cookies in place, open the followers page.
    driver.get("https://home.cnblogs.com/u/canglongdao/followers/")

    # find_elements(by, value) works on both Selenium 3 and Selenium 4; the
    # find_elements_by_* helpers used previously were removed in Selenium 4.3.
    links = driver.find_elements("xpath", "//div[@class='avatar_list']/ul/li/a")

    # Collect the title attribute of every matched anchor. The recorded run
    # shows every second title is empty; those entries are kept on purpose
    # because the later write step selects result[::2].
    result = []
    for link in links:
        name = link.get_attribute("title")
        print(name)
        result.append(name)
    print(result)
    

     运行结果:

    ['偏爱也例外', '', '岑欢', '', 'NiuBiBoy!', '', '知识在于点滴的积累', '', '浅唱蛰伏', '', 'linofficer', '', '龙骑士大哥', '', '给明天的自己', '', '小熊软糖', '']
    

     将粉丝名称写入.txt文档

    # rs=['偏爱也例外', '', '岑欢', '', 'NiuBiBoy!', '', '知识在于点滴的积累', '', '浅唱蛰伏', '', 'linofficer', '', '龙骑士大哥', '', '给明天的自己', '', '小熊软糖', '']
    # print(rs[::2])
    # Append mode: a.txt is created if it does not exist, otherwise extended.
    # The file is opened once (not once per name) and explicitly as UTF-8 so
    # the Chinese names do not depend on the platform's default encoding.
    with open("a.txt", "a", encoding="utf-8") as f:
        # Every second entry of result is a follower name; the entries in
        # between are empty titles and are skipped by the [::2] slice.
        for i in result[::2]:
            # One name per line. The newline must be the "\n" escape; a string
            # literal split across two source lines is a syntax error.
            f.write(i + "\n")
    

     

     

    参考代码:

    # coding:utf-8
    from selenium import webdriver
    import time
    driver=webdriver.Chrome()
    # driver.get("https://account.cnblogs.com/signin")
    # nlc=driver.get_cookies()
    # print(len(nlc),nlc)
    # driver.find_element_by_id("mat-input-0").send_keys("xxx@qq.com")
    # driver.find_element_by_id("mat-input-1").send_keys("P@ssw0rd")
    # driver.find_element_by_xpath("//span[@class='mat-button-wrapper']").click()
    # time.sleep(6)
    # lc=driver.get_cookies()
    # print(len(lc),lc)
    # 5 [{'domain': 'account.cnblogs.com', 'expiry': 1599210767, 'httpOnly': False, 'name': '4271c12252a544478175bac9772afc3d', 'path': '/', 'secure': False, 'value': '010720fb-f7e4-4f4b-b206-0e991ecf6f5b'}, {'domain': 'account.cnblogs.com', 'httpOnly': False, 'name': 'SERVERID', 'path': '/', 'secure': False, 'value': 'daace45bf36fef87f4742d8b633fdae3|1599208967|1599208966'}, {'domain': 'account.cnblogs.com', 'httpOnly': True, 'name': '.Cnblogs.Account.Session', 'path': '/', 'secure': False, 'value': 'CfDJ8K5MrGQfPjpFvRyctF%2BQEQcGP%2Bk0zUGAelDZwKkZE07wn7bgYbw56biK9%2FwoxKcs%2FmFFb%2B21xAjYxIXXQJeai7NvLoyDfgSr45CxhE9nwRKokI1nqtUdlD5wk2MHtHUO4kIFOTpe9gzKU%2F%2BDs%2B65eSMPAU62bfOS86QdUoNXH5qL'}, {'domain': 'account.cnblogs.com', 'httpOnly': False, 'name': 'XSRF-TOKEN', 'path': '/', 'secure': False, 'value': 'CfDJ8K5MrGQfPjpFvRyctF-QEQcyb192CHuZwpo_t9r1Ps07m_GVYNh15x2atqF3hGcynCnlxxqVFCWmUT5OqBV0zfYfYC3BjZ-7WUDux6AI1xLaMad3ETT6_MyakbxByaS76Nim_y5-i1_oX0aBl2U91xs'}, {'domain': 'account.cnblogs.com', 'httpOnly': True, 'name': '.Cnblogs.Account.Antiforgery', 'path': '/', 'secure': False, 'value': 'CfDJ8K5MrGQfPjpFvRyctF-QEQek9XiBoWIQkti8vvTbpqx-CFIWKb39vrCeVudMwHbcPXBWb8LBrlnlM0JzKwWlUlgaD5ioMqre_sd1nEFtrTGhAMmUsVWYxYta1gs4DkuYVinqEL6omAaSnZIJhoxLfp8'}]
    # 5 [{'domain': '.cnblogs.com', 'expiry': 1599295374, 'httpOnly': False, 'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.683535015.1599208975'}, {'domain': '.cnblogs.com', 'expiry': 1662280974, 'httpOnly': False, 'name': '_ga', 'path': '/', 'secure': False, 'value': 'GA1.2.1985506889.1599208975'}, {'domain': '.cnblogs.com', 'expiry': 1600504974, 'httpOnly': True, 'name': '.CNBlogsCookie', 'path': '/', 'secure': False, 'value': 'EB2AB3A42B8CEE723552C8644CAD13CEFA311FA3955FDB7A25A33EEB87199C843967F5791CB012543FD9AC374F535F23C228D4AC5E0373CAA6855768E5713BDF88D82BB97C38A668CDDEB72E0D5055467339189E'}, {'domain': '.cnblogs.com', 'expiry': 1599209034, 'httpOnly': False, 'name': '_gat', 'path': '/', 'secure': False, 'value': '1'}, {'domain': '.cnblogs.com', 'expiry': 1600504974, 'httpOnly': True, 'name': '.Cnblogs.AspNetCore.Cookies', 'path': '/', 'secure': False, 'value': 'CfDJ8K5MrGQfPjpFvRyctF-QEQcWJIxPwl8mHGJMd1DItl4C_m7X5ixG1-4yGpDWsiv3n9Iung2Yxk7eaHqXJ1rAGWYXeQF3OyXzSXfYHkPc7A7RPLekmvNk0dNucu8ssSF7ldaY1Nqsnx-q9O3U6JpZ_GCz8ed5jwuq1g8V_StxqpEq2ell4jFdrMmgA1GQudbiFYE1aPVcf1Rs5U7xUJ6UjMJijwG3_OAfQJ9DSibuDqYuhvaS0wwbR6OUfQIBI6NFDdwXz5GL0wJZ82wmPjkKKrrX3ADNm1jsdJxb9fceZC2CfDC2aqe-XotiNwzbsA2vhkDpB5m3JOLYA_P7mWfSexjGKs6ii9E2fNjgYgqZA8TG-1CqvApZjzkCgWklntSP71W5Xrc8zSNkRPiSuoMEKtzVecH65t9utYA2ZneK-mVParwkydH3_hcx1l03CYj6p7HP33S5MsWtvDagWN3waRPfRtdUx2KTDUTKl0Rpt-Gb1cL8RWSctfQxrg8gGKmWYwGqoPhLcDmtPc7D1C6EmZaxp61YODRup2mIzFdRdCvoU8F3Ll9Tsgb8ja7gHra03g'}]
    # #
    #登录成功后,从获取到的cookie中手动复制以下两项,分别赋值给c1、c2
    c1={'domain': '.cnblogs.com', 'expiry': 1600504974, 'httpOnly': True, 'name': '.CNBlogsCookie', 'path': '/', 'secure': False, 'value': 'EB2AB3A42B8CEE723552C8644CAD13CEFA311FA3955FDB7A25A33EEB87199C843967F5791CB012543FD9AC374F535F23C228D4AC5E0373CAA6855768E5713BDF88D82BB97C38A668CDDEB72E0D5055467339189E'}
    c2={'domain': '.cnblogs.com', 'expiry': 1600504974, 'httpOnly': True, 'name': '.Cnblogs.AspNetCore.Cookies', 'path': '/', 'secure': False, 'value': 'CfDJ8K5MrGQfPjpFvRyctF-QEQcWJIxPwl8mHGJMd1DItl4C_m7X5ixG1-4yGpDWsiv3n9Iung2Yxk7eaHqXJ1rAGWYXeQF3OyXzSXfYHkPc7A7RPLekmvNk0dNucu8ssSF7ldaY1Nqsnx-q9O3U6JpZ_GCz8ed5jwuq1g8V_StxqpEq2ell4jFdrMmgA1GQudbiFYE1aPVcf1Rs5U7xUJ6UjMJijwG3_OAfQJ9DSibuDqYuhvaS0wwbR6OUfQIBI6NFDdwXz5GL0wJZ82wmPjkKKrrX3ADNm1jsdJxb9fceZC2CfDC2aqe-XotiNwzbsA2vhkDpB5m3JOLYA_P7mWfSexjGKs6ii9E2fNjgYgqZA8TG-1CqvApZjzkCgWklntSP71W5Xrc8zSNkRPiSuoMEKtzVecH65t9utYA2ZneK-mVParwkydH3_hcx1l03CYj6p7HP33S5MsWtvDagWN3waRPfRtdUx2KTDUTKl0Rpt-Gb1cL8RWSctfQxrg8gGKmWYwGqoPhLcDmtPc7D1C6EmZaxp61YODRup2mIzFdRdCvoU8F3Ll9Tsgb8ja7gHra03g'}
    # Load a cnblogs page first, then attach the two saved login cookies
    # (c1 and c2 are scoped to the '.cnblogs.com' domain).
    driver.get("https://account.cnblogs.com/signin")
    driver.add_cookie(c1)
    driver.add_cookie(c2)
    time.sleep(3)

    # With the login cookies in place, open the followers page.
    driver.get("https://home.cnblogs.com/u/canglongdao/followers/")

    # find_elements(by, value) works on both Selenium 3 and Selenium 4; the
    # find_elements_by_* helpers used previously were removed in Selenium 4.3.
    links = driver.find_elements("xpath", "//div[@class='avatar_list']/ul/li/a")

    # Collect the title attribute of every matched anchor. The recorded run
    # shows every second title is empty; those entries are kept on purpose
    # because the later write step selects result[::2].
    result = []
    for link in links:
        name = link.get_attribute("title")
        print(name)
        result.append(name)
    print(result)
    
    # rs=['偏爱也例外', '', '岑欢', '', 'NiuBiBoy!', '', '知识在于点滴的积累', '', '浅唱蛰伏', '', 'linofficer', '', '龙骑士大哥', '', '给明天的自己', '', '小熊软糖', '']
    # print(rs[::2])
    # Append mode: a.txt is created if it does not exist, otherwise extended.
    # The file is opened once (not once per name) and explicitly as UTF-8 so
    # the Chinese names do not depend on the platform's default encoding.
    with open("a.txt", "a", encoding="utf-8") as f:
        # Every second entry of result is a follower name; the entries in
        # between are empty titles and are skipped by the [::2] slice.
        for i in result[::2]:
            # One name per line. The newline must be the "\n" escape; a string
            # literal split across two source lines is a syntax error.
            f.write(i + "\n")
    

     运行结果:

    偏爱也例外
    
    岑欢
    
    NiuBiBoy!
    
    知识在于点滴的积累
    
    浅唱蛰伏
    
    linofficer
    
    龙骑士大哥
    
    给明天的自己
    
    小熊软糖
    
    ['偏爱也例外', '', '岑欢', '', 'NiuBiBoy!', '', '知识在于点滴的积累', '', '浅唱蛰伏', '', 'linofficer', '', '龙骑士大哥', '', '给明天的自己', '', '小熊软糖', '']
    

      

    越努力,越幸运!!! good good study,day day up!!!
  • 相关阅读:
    将十进制的颜色制转换成ARGB
    HTTPS从认识到线上实战全记录
    如何从零开始对接第三方登录(Java版):QQ登录和微博登录
    JS弹出下载对话框以及实现常见文件类型的下载
    【干货】Chrome插件(扩展)开发全攻略
    Lucene5.5.4入门以及基于Lucene实现博客搜索功能
    ReactNative与NativeScript对比报告
    JavaScript常见原生DOM操作API总结
    JS获取剪贴板图片之后的格式选择与压缩问题
    详细记录一下网站备案经过,备案真的很简单
  • 原文地址:https://www.cnblogs.com/canglongdao/p/13614415.html
Copyright © 2011-2022 走看看