  • Web crawler: scraping Zhihu user information

    Unlike the two crawlers written up earlier, this one is written from scratch, using the Python libraries requests, redis, and lxml.

    There are three files: config.ini holds the login credentials and cookies, zhihusp.py collects userids from follower lists, and get-info.py scrapes each user's profile details. The two scripts share state through redis (db 1 holds the discovered userids, db 2 marks the ones already fetched), so get-info.py can run in a second terminal while zhihusp.py is still crawling.

    All three files are listed below; the two Python files are commented in some detail, so they should be easy to follow.

    config.ini
    [info]
    phone_num = 15*********
    password = ************
    
    [cookies]
    q_c1 = 5fd5e96aa1cc40f587e2fcaa621030ee|1448986627000|1448986627000
    cap_id = Zjk3N2I3MjU1ZmIyNGJkNWJIDOxYmE3ZDEzN2QyOGE=|1449289675|612bbfbnjd2e3bca76d397a2c67c921fe7c852b
    _za = b7e8ab32-03b3-473b-87e6-68fe9f9e7933
    __utmt = 1
    __utma = 51854390.1168696635.1449128833.1449239113.1449289659.5
    __utmb = 51854390.6.10.1449289659
    __utmc = 51854390
    __utmz = 51854390.1449223233.4.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/excited-vczh/followers
    __utmv = 51854390.100-2|2=re=1^3=entry_date=20151202=1
    z_c0 = QUJDTXpzbTNGd2tYQUFBdffabXowaVZZdHBZbnJIS3FhYjZBQnRTWllWQlZ1T1kyc1dnPT0=|1449289708|7020f5e7c6c95b043e48c02afffb3a9c40035a77
    unlock_ticket = QUJDTXpzbTNGd2tYQUFBQVlRSlZUZlJ1WWxaUDlzRGpZTVocGdnUl8xZkVNbDNBPT0=|1554289708|d906b57006b0cd84c58c4f6d6e1eb16e17e64
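
    Both scripts load this file the same way: ConfigParser reads it, and the [cookies] section is flattened into a plain dict that requests can send along with every request. A minimal sketch of just that loading step:

      # -*- coding: utf-8 -*-
      import ConfigParser

      cf = ConfigParser.ConfigParser()
      cf.read('config.ini')

      # [info] feeds the form login
      phone_num = cf.get('info', 'phone_num')
      password = cf.get('info', 'password')

      # [cookies] becomes a dict usable as session.get(url, cookies=cookies)
      cookies = dict(cf.items('cookies'))
      print cookies.keys()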

    zhihusp.py — mainly used to collect userids from follower lists

      # -*- coding: utf-8 -*-
      '''
      Web crawler: scraping Zhihu user information
      '''
      import requests, json, re, redis, sqlite3
      import ConfigParser
      from lxml import etree
      import sys
      reload(sys)
      sys.setdefaultencoding("utf-8")

      class ZhihuSpider(object):
          """docstring for ZhihuSpider"""
          r = redis.Redis(host='127.0.0.1', port=6379, db=1)
          cf = ConfigParser.ConfigParser()
          cf.read('config.ini')
          cookies = cf.items('cookies')
          cookies = dict(cookies)
          session = requests.session()
          conn = sqlite3.connect('zhihuuser.db')
          conn.text_factory = str
          cur = conn.cursor()

          # Create the session; if the username/password login fails,
          # fall back to logging in with the cookies from config.ini.
          def create_session(self):
              from pprint import pprint
              pprint(self.cookies)
              phone_num = self.cf.get('info', 'phone_num')
              password = self.cf.get('info', 'password')
              login_data = {'phone_num': phone_num, 'password': password}
              header = {
                  'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                                'AppleWebKit/537.36 (KHTML, like Gecko) '
                                'Chrome/43.0.2357.124 Safari/537.36',
                  'Host': 'www.zhihu.com',
                  'Referer': 'http://www.zhihu.com/'
              }
              r = self.session.post('http://www.zhihu.com/login/phone_num',
                                    data=login_data, headers=header)
              if r.json()['r'] == 1:
                  print 'Login failed, reason is:',
                  for m in r.json()['data']:
                      print r.json()['data'][m]
                  print 'So we use cookies to log in...'
                  has_cookies = False
                  for key in self.cookies:
                      if key != '__name__' and self.cookies[key] != '':
                          has_cookies = True
                          break
                  if has_cookies is False:
                      raise ValueError('Please fill in the cookies section of config.ini.')
                  else:
                      # log in by sending the saved cookies instead
                      r = self.session.get('http://www.zhihu.com/login/phone_num',
                                           cookies=self.cookies)

              with open('login.html', 'w') as fp:
                  fp.write(r.content)

          # Request a user's followers/followees pages
          def follow(self, userid):
              print "NOW Follow:", userid
              self.r.set(userid, False)
              follower_url = "http://www.zhihu.com/people/" + userid + "/followers"
              follower, followee, user_urls = self.getinfo(userid)
              for u_url in user_urls:
                  # use a separate name so userid still refers to the user
                  # being followed when the followees URL is built below
                  uid = u_url.split('/')[-1]
                  if self.not_in(uid):
                      self.r.set(uid, True)
              if follower > 20:
                  self.doprofiles(follower, follower_url)

              # extract the userids on the first page of followees
              followee_url = "http://www.zhihu.com/people/" + userid + "/followees"
              response = self.session.get(followee_url, cookies=self.cookies).content
              page = etree.HTML(response)
              user_urls = page.xpath('//h2/a[@class="zg-link"]/@href')

              for u_url in user_urls:
                  uid = u_url.split('/')[-1]
                  if self.not_in(uid):
                      self.r.set(uid, True)
              if followee > 20:
                  self.doprofiles(followee, followee_url)

          # Dynamically fetch what is behind the "more" button
          def doprofiles(self, attention, url):
              thisheader = {
                  'Host': 'www.zhihu.com',
                  'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                                'AppleWebKit/537.36 (KHTML, like Gecko) '
                                'Chrome/43.0.2357.124 Safari/537.36',
                  'Accept': '*/*',
                  'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                  'Accept-Encoding': 'gzip, deflate',
                  'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                  'X-Requested-With': 'XMLHttpRequest',
                  'Pragma': 'no-cache',
                  'Cache-Control': 'no-cache',
                  'Referer': url,
                  'Content-Length': '171',
                  'Cookie': 'fill in your own'
              }
              hash_id = 'fill in your own'
              xsrf = 'fill in your own'
              # work out the page count, then pull the follower info
              # hidden behind "more" page by page
              pages = attention / 20 + 1
              # if pages > 600:
              #     pages = 600
              for x in xrange(1, pages):
                  offset = x * 20
                  params = json.dumps({"offset": offset,
                                       "order_by": "created",
                                       "hash_id": hash_id})
                  payload = {"method": "next", "params": params, "_xsrf": xsrf}
                  content = self.session.post(
                      "http://www.zhihu.com/node/ProfileFollowersListV2",
                      headers=thisheader, data=payload).content
                  load = json.loads(content)
                  lists = load['msg']
                  for item in lists:
                      try:
                          userpeople = re.search(r'people/[\w\d-]+', item)
                          if userpeople is not None:
                              people = userpeople.group()
                              userid = people.split('/')[-1]
                              print "Got userid:", userid
                              if self.not_in(userid):
                                  self.r.set(userid, True)
                      except AttributeError:
                          print "ERROR"
              self.gofollow()

          # keep following
          def gofollow(self):
              for key in self.r.keys():
                  if self.r.get(key) == 'True':
                      self.follow(key)

          # check whether the userid is already in redis
          def not_in(self, userid):
              if self.r.exists(userid):
                  return False
              else:
                  return True

          def getinfo(self, userid):
              follower_url = "http://www.zhihu.com/people/" + userid + "/followers"
              response = self.session.get(follower_url, cookies=self.cookies).content
              page = etree.HTML(response)
              user_urls = page.xpath('//h2/a[@class="zg-link"]/@href')
              # pull the followee/follower counts from the profile sidebar
              followee = int(page.xpath('//div[@class="zm-profile-side-following '
                                        'zg-clear"]/a[1]/strong/text()')[0])
              follower = int(page.xpath('//div[@class="zm-profile-side-following '
                                        'zg-clear"]/a[2]/strong/text()')[0])
              return follower, followee, user_urls

      if __name__ == '__main__':
          zhihu = ZhihuSpider()
          # create the table
          zhihu.cur.execute('''create table if not exists userstb
              (userid text primary key,
              username text, gender text, followee integer,
              follower integer, location text,
              business text, employment text,
              position text, education text, college text,
              question_num integer, answer_num text)''')
          zhihu.conn.commit()
          zhihu.create_session()

          # a few big-name Zhihu users as seeds
          first_users = ['excited-vczh', 'warfalcon', 'gejinyuban']
          for user in first_users:
              if zhihu.r.exists(user):
                  continue
              else:
                  zhihu.follow(user)
          # pick up userids in redis that have not been followed yet
          for key in zhihu.r.keys():
              if zhihu.r.exists(key):
                  if zhihu.r.get(key) == 'True':
                      zhihu.follow(key)
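
    The redis bookkeeping in this script is a simple userid -> flag scheme: a key is set to True when a userid is first discovered, and flipped to False once follow() has walked that user's pages, so the keys double as both the work queue and the visited set. A minimal sketch of that cycle (same db 1 as above; the userid is just an example):

      import redis

      r = redis.Redis(host='127.0.0.1', port=6379, db=1)

      r.set('excited-vczh', True)     # discovered, not crawled yet
      for key in r.keys():
          if r.get(key) == 'True':    # redis hands the flag back as a string
              r.set(key, False)       # mark as visited, then crawl
              print 'next to crawl:', key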

    get-info.py — visits each userid's profile page and extracts the details

      # -*- coding: utf-8 -*-
      '''
      Web crawler: scraping Zhihu user information
      '''
      import requests, json, re, redis, sqlite3
      import ConfigParser
      from lxml import etree
      from time import ctime
      import sys
      reload(sys)
      sys.setdefaultencoding("utf-8")

      class GetInfo(object):
          """docstring for GetInfo"""

          r1 = redis.Redis(host='127.0.0.1', port=6379, db=1)  # userids found by zhihusp.py
          r2 = redis.Redis(host='127.0.0.1', port=6379, db=2)  # userids already fetched here
          cf = ConfigParser.ConfigParser()
          cf.read('config.ini')
          cookies = cf.items('cookies')
          cookies = dict(cookies)
          session = requests.session()
          conn = sqlite3.connect('zhihuuser.db')
          cur = conn.cursor()
          itemlist = []
          useridlist = []
          flag = 0

          # request the user's profile page, extract the info and store it
          def getinfo(self, userid):
              url = "http://www.zhihu.com/people/" + userid
              print "GET:%s---%s" % (userid, ctime())

              # exception handling -- necessary!
              try:
                  response = self.session.get(url, cookies=self.cookies).content
                  page = etree.HTML(response)
                  username = page.xpath('//div[@class="title-section ellipsis"]'
                                        '/span[@class="name"]/text()')[0]
                  location = page.xpath('//div[@data-name="location"]/span'
                                        '/span[@class="location item"]/@title')
                  business = page.xpath('//div[@data-name="location"]/span'
                                        '/span[@class="business item"]/@title')
                  gendertit = page.xpath('//div[@data-name="location"]/span'
                                         '/span[@class="item gender"]/i/@class')
                  # gender can't be read out directly, so go at it sideways:
                  # the icon's css class tells female from male
                  if len(gendertit) == 0:
                      gender = 'notsure'
                  elif re.search(r'female', gendertit[0]):
                      gender = u'female'
                  else:
                      gender = u'male'
                  employment = page.xpath('//div[@data-name="employment"]'
                                          '/span/span[@class="employment item"]/@title')
                  position = page.xpath('//div[@data-name="employment"]'
                                        '/span/span[@class="position item"]/@title')
                  education = page.xpath('//div[@data-name="education"]'
                                         '/span/span[@class="education item"]/@title')
                  college = page.xpath('//div[@data-name="education"]'
                                       '/span/span[@class="education-extra item"]/@title')
                  followee = int(page.xpath('//div[@class="zm-profile-side-following '
                                            'zg-clear"]/a[1]/strong/text()')[0])
                  follower = int(page.xpath('//div[@class="zm-profile-side-following '
                                            'zg-clear"]/a[2]/strong/text()')[0])
                  question_num = int(page.xpath('//div[@class="profile-navbar clearfix"]'
                                                '/a[2]/span/text()')[0])
                  answer_num = int(page.xpath('//div[@class="profile-navbar clearfix"]'
                                              '/a[3]/span/text()')[0])

                  # some fields may have been left blank, so check for empties
                  location = location[0] if location else None
                  business = business[0] if business else None
                  employment = employment[0] if employment else None
                  position = position[0] if position else None
                  education = education[0] if education else None
                  college = college[0] if college else None

                  # insert into the database and commit
                  item = (userid, username, gender, followee, follower,
                          location, business, employment, position, education,
                          college, question_num, answer_num)
                  print userid, username
                  has_in = self.cur.execute("insert into userstb "
                                            "values(?,?,?,?,?,?,?,?,?,?,?,?,?)", item)
                  self.conn.commit()
                  if has_in:
                      print u"stored OK"
                      self.r2.set(userid, True)
                  else:
                      print u"store failed"
              except requests.exceptions.RequestException:
                  print u'connection error'
                  self.main()   # go back to the main loop and retry
              except Exception:
                  self.r2.set(userid, True)   # skip userids whose page can't be parsed
                  self.main()

          # main loop: work through the userids that have not been fetched yet
          def main(self):
              while True:
                  for key in self.r1.keys():
                      if self.r2.exists(key):
                          continue
                      else:
                          self.getinfo(key)

      if __name__ == '__main__':
          begin = GetInfo()
          begin.main()
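
    Once get-info.py has run for a while, the results can be spot-checked straight from sqlite; a short sketch against the userstb schema created by zhihusp.py:

      import sqlite3

      conn = sqlite3.connect('zhihuuser.db')
      cur = conn.cursor()
      cur.execute('select count(*) from userstb')
      print 'users stored:', cur.fetchone()[0]
      rows = cur.execute('select userid, username, gender, follower '
                         'from userstb limit 5')
      for row in rows:
          print row
      conn.close()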

    GG
