# Python — Jianshu (简书) user crawler, breadth-first traversal
# -*- coding: utf-8 -*-
"""
Jianshu (简书) user crawler — breadth-first traversal of the "following" graph.

Starting from a seed user, fetches each user's paginated "following" list,
writes one CSV row per discovered user, and enqueues every followed user
for crawling.

__title__ = ''
__author__ = 'wlc'
__mtime__ = '2017/10/15'
"""
import re
import time
import math
import csv
import requests
from bs4 import BeautifulSoup
from collections import deque
import sys
# NOTE: the original raised the recursion limit to 10000 to survive deep
# recursion in getUserList; the crawl is now an iterative BFS, so no
# sys.setrecursionlimit hack is needed.

# CSV file that accumulates one row per scraped user (append mode).
path = 'dataCollection/userInfo.csv'
csvFile = open(path, 'a+', newline='', encoding='utf-8')
writer = csv.writer(csvFile)
writer.writerow(('id', 'name', 'following', 'follower', 'article', 'word', 'like'))

# Global set of (userId, followingCount) pairs already crawled.
idContainer = set()
# FIFO frontier of (userId, followingCount) pairs still to visit (BFS queue).
linkDeque = deque()


class jianshu(object):
    def __init__(self):
        # URL template for one page of the users that {userId} follows.
        self.url = 'http://www.jianshu.com/users/{userId}/following?page={page}'
        # Extracts (user id, display name) from a follow-list entry.
        self.idPattern = re.compile(r'<a class="name" href="/u/(.*?)">(.*?)</a>')
        # Extracts following / follower / article counts.
        # BUGFIX: the original patterns used '(d+)', which matches the literal
        # letter 'd' rather than digits, so these regexes never matched.
        self.metalPattern = re.compile(r'<span>关注 (\d+)</span><span>粉丝 (\d+)</span><span>文章 (\d+)</span>')
        # Extracts total words written and total likes received.
        self.meta = re.compile(r'写了 (\d+) 字,获得了 (\d+) 个喜欢')
        # Browser-like User-Agent so the site serves the normal HTML page.
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}

    def createRequest(self, userId, page):
        """Fetch one page of userId's following list and return the raw HTML."""
        url = self.url.format(userId=userId, page=page)
        requ = requests.get(url, headers=self.header).text
        return requ

    def pageResponse(self, requ):
        """Parse the HTML and return the per-user <li> fragments as strings."""
        bsOBJ = BeautifulSoup(requ, 'lxml')
        userContainer = bsOBJ.find_all('ul', {'class': 'user-list'})[0]
        # Drop bare-whitespace text nodes between the list items.
        userContent = [str(user) for user in userContainer.contents if user != ' ']
        return userContent

    def parserUserInfo(self, user):
        """Extract one user's stats from an HTML fragment, append a CSV row,
        and return (id, name, following, follower, article, word, like)."""
        # 'userId' instead of the original 'id', which shadowed the builtin.
        userId, name = re.findall(self.idPattern, user)[0]
        followingNum, followerNum, articleNum = re.findall(self.metalPattern, user)[0]
        try:
            wordNum, likeNum = re.findall(self.meta, user)[0]
        except IndexError:
            # Users with no articles lack the words/likes summary line.
            wordNum, likeNum = 0, 0
        content = (userId, name, followingNum, followerNum,
                   articleNum, wordNum, likeNum)
        writer.writerow(content)
        return content

    def getUserList(self, userId, following):
        """Breadth-first crawl starting at userId, who follows `following` users.

        BUGFIX: the original implementation recursed on itself while iterating
        `linkDeque`, mutating the deque during iteration and depending on a
        raised recursion limit. This version drains the deque iteratively
        (popleft = true FIFO/BFS), with `idContainer` as the visited set.
        """
        linkDeque.append((userId, following))
        while linkDeque:
            uid, fnum = linkDeque.popleft()
            if (uid, fnum) in idContainer:
                continue
            idContainer.add((uid, fnum))
            # The following list shows 10 users per page.
            pageCount = math.ceil(int(fnum) / 10)
            for pg in range(1, pageCount + 1):
                requ = self.createRequest(uid, pg)
                userList = self.pageResponse(requ)
                for user in userList:
                    content = self.parserUserInfo(user)
                    # Enqueue each followed user: (their id, their following count).
                    linkDeque.append((content[0], content[2]))
                time.sleep(1)  # be polite: at most one request per second


if __name__ == '__main__':
    # Seed the crawl. (The original rebound the class name `jianshu` to the
    # call's None result; the rebinding served no purpose and is dropped.)
    jianshu().getUserList('1562c7f16a04', 45)