# Python — Jianshu (简书) user crawler, breadth-first traversal
# -*- coding: utf-8 -*-
"""
Jianshu (简书) user crawler — breadth-first traversal of the "following" graph.

Starting from a seed user, fetches each user's paginated "following" list,
writes one CSV row per discovered user, and enqueues every followed user
for crawling.

__title__ = ''
__author__ = 'wlc'
__mtime__ = '2017/10/15'
"""
import re
import time
import math
import csv
import requests
from bs4 import BeautifulSoup
from collections import deque
import sys
# NOTE: the original raised the recursion limit to 10000 to survive deep
# recursion in getUserList; the crawl is now an iterative BFS, so no
# sys.setrecursionlimit hack is needed.

# CSV file that accumulates one row per scraped user (append mode).
path = 'dataCollection/userInfo.csv'
csvFile = open(path, 'a+', newline='', encoding='utf-8')
writer = csv.writer(csvFile)
writer.writerow(('id', 'name', 'following', 'follower', 'article', 'word', 'like'))

# Global set of (userId, followingCount) pairs already crawled.
idContainer = set()
# FIFO frontier of (userId, followingCount) pairs still to visit (BFS queue).
linkDeque = deque()


class jianshu(object):
    def __init__(self):
        # URL template for one page of the users that {userId} follows.
        self.url = 'http://www.jianshu.com/users/{userId}/following?page={page}'
        # Extracts (user id, display name) from a follow-list entry.
        self.idPattern = re.compile(r'<a class="name" href="/u/(.*?)">(.*?)</a>')
        # Extracts following / follower / article counts.
        # BUGFIX: the original patterns used '(d+)', which matches the literal
        # letter 'd' rather than digits, so these regexes never matched.
        self.metalPattern = re.compile(r'<span>关注 (\d+)</span><span>粉丝 (\d+)</span><span>文章 (\d+)</span>')
        # Extracts total words written and total likes received.
        self.meta = re.compile(r'写了 (\d+) 字,获得了 (\d+) 个喜欢')
        # Browser-like User-Agent so the site serves the normal HTML page.
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}

    def createRequest(self, userId, page):
        """Fetch one page of userId's following list and return the raw HTML."""
        url = self.url.format(userId=userId, page=page)
        requ = requests.get(url, headers=self.header).text
        return requ

    def pageResponse(self, requ):
        """Parse the HTML and return the per-user <li> fragments as strings."""
        bsOBJ = BeautifulSoup(requ, 'lxml')
        userContainer = bsOBJ.find_all('ul', {'class': 'user-list'})[0]
        # Drop bare-whitespace text nodes between the list items.
        userContent = [str(user) for user in userContainer.contents if user != ' ']
        return userContent

    def parserUserInfo(self, user):
        """Extract one user's stats from an HTML fragment, append a CSV row,
        and return (id, name, following, follower, article, word, like)."""
        # 'userId' instead of the original 'id', which shadowed the builtin.
        userId, name = re.findall(self.idPattern, user)[0]
        followingNum, followerNum, articleNum = re.findall(self.metalPattern, user)[0]
        try:
            wordNum, likeNum = re.findall(self.meta, user)[0]
        except IndexError:
            # Users with no articles lack the words/likes summary line.
            wordNum, likeNum = 0, 0
        content = (userId, name, followingNum, followerNum,
                   articleNum, wordNum, likeNum)
        writer.writerow(content)
        return content

    def getUserList(self, userId, following):
        """Breadth-first crawl starting at userId, who follows `following` users.

        BUGFIX: the original implementation recursed on itself while iterating
        `linkDeque`, mutating the deque during iteration and depending on a
        raised recursion limit. This version drains the deque iteratively
        (popleft = true FIFO/BFS), with `idContainer` as the visited set.
        """
        linkDeque.append((userId, following))
        while linkDeque:
            uid, fnum = linkDeque.popleft()
            if (uid, fnum) in idContainer:
                continue
            idContainer.add((uid, fnum))
            # The following list shows 10 users per page.
            pageCount = math.ceil(int(fnum) / 10)
            for pg in range(1, pageCount + 1):
                requ = self.createRequest(uid, pg)
                userList = self.pageResponse(requ)
                for user in userList:
                    content = self.parserUserInfo(user)
                    # Enqueue each followed user: (their id, their following count).
                    linkDeque.append((content[0], content[2]))
                time.sleep(1)  # be polite: at most one request per second


if __name__ == '__main__':
    # Seed the crawl. (The original rebound the class name `jianshu` to the
    # call's None result; the rebinding served no purpose and is dropped.)
    jianshu().getUserList('1562c7f16a04', 45)