zoukankan      html  css  js  c++  java
  • python3爬虫例子01(获取个人博客园的粉丝)

    #!/usr/bin/env python
    # -*- coding:UTF-8 -*-

    import requests
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import re
    import time


    class GetFansName:
    #初始化各配置项数值
    def __init__(self, profiles, url, ses, sleepTime, fansNameFile):
    self.profiles = profiles
    self.url = url
    self.ses = ses
    self.sleepTime = sleepTime
    self.fansNameFile = fansNameFile

    def get_cookies(self):
    try:
    #加载配置文件
    profiles = webdriver.FirefoxProfile(self.profiles)

    driver = webdriver.Firefox(profiles)

    driver.get(self.url+"/followers")

    time.sleep(self.sleepTime)

    #获取COOKIES
    cookies = driver.get_cookies()

    # print(cookies)

    driver.quit()

    return cookies
    except Exception as msg:
    print("get_cookies error:%s"%str(msg))


    def add_cookies(self,cookies):
    try:
    c=requests.cookies.RequestsCookieJar()
    for i in cookies:
    c.set(i["name"],i["value"])

    #更新COOKIES
    self.ses.cookies.update(c)
    except Exception as msg:
    print("add_cookies error:%s"%str(msg))


    def get_fansNum(self):
    try:
    #发送访问粉丝的请求
    fansres = self.ses.get(self.url+"/relation/followers")

    fanssoup = BeautifulSoup(fansres.content,"html.parser")

    #获取粉丝数量
    tempfansnum = fanssoup.find_all(class_="current_nav")

    # print(tempfansnum[0].string)

    strfansnum = re.findall(u"我的粉丝((.+?))",tempfansnum[0].string)
    print(u"我的粉丝数量:%s"%str(strfansnum[0]))

    #粉丝分页数量
    fansnum = int(int(strfansnum[0])/45)+1

    print(u"总的分页:%s"%str(fansnum))

    return fansnum
    except Exception as msg:
    print("get_fansNum error:%s"%str(msg))
    return 1


    def get_fansName(self,fansnum):
    try:
    #判断有几页粉丝,然后分别去处理
    if fansnum <=1:
    url_page=self.url+"/relation/followers"
    else:
    url_page=self.url+"/relation/followers?page=%s"%str(fansnum)

    print("正在抓取页面:%s"%url_page)

    fansnameres=self.ses.get(url_page,verify=False)

    fansnamesoup=BeautifulSoup(fansnameres.content,"html.parser")

    fansnames=fansnamesoup.find_all(class_="avatar_name")

    #将粉丝名字写入文件
    for fansname in fansnames:
    name=fansname.string.replace(" "," ").strip(" ")

    with open(self.fansNameFile,'a',encoding="utf-8") as file:
    file.write(name+" ")
    except Exception as msg:
    print("get_fansName error:%s"%str(msg))


    if __name__ == '__main__':
        # Script entry point: copy the cookies of a Firefox profile into a
        # requests session, then walk every followers page and append the
        # follower names to a text file.

        # Path of the Firefox profile directory. The backslashes are required:
        # without them the string is not a valid Windows path. Adjust the path
        # to the profile of the machine the script runs on.
        profiles = r"C:\Users\Administrator\AppData\Roaming\Mozilla\Firefox\Profiles\wv0f79j4.default"

        # Home-page URL of the user whose followers are collected.
        url = "https://home.cnblogs.com/u/NiceTime"

        # File the follower names are appended to.
        fansNameFile = "fansNameFile.txt"

        # Seconds to wait after the browser has been opened.
        sleepTime = 5

        # HTTP session shared by all requests.
        ses = requests.session()
        try:
            fansName = GetFansName(profiles, url, ses, sleepTime, fansNameFile)

            cookies = fansName.get_cookies()
            fansName.add_cookies(cookies)

            fansNums = fansName.get_fansNum()

            # Listing pages are numbered from 1.
            for fansNum in range(1, fansNums + 1):
                fansName.get_fansName(fansNum)
        finally:
            # Release the pooled connections even when a step above failed.
            ses.close()



  • 相关阅读:
    Relativity 01: Physical Meaning of Geometrical Propositions
    Algo 2: Asymptotic Order of Growth
    CShop Project : BeanUtils工具的使用
    137 __getattribute__
    134 isinstance和issubclass
    135 反射(hasattr和getattr和setattr和delattr)
    133 面向对象进阶实战之选课系统
    132 面向对象进阶小结
    131 类和对象的绑定方法及非绑定方法
    130 类的property特性
  • 原文地址:https://www.cnblogs.com/NiceTime/p/10070139.html
Copyright © 2011-2022 走看看